In [None]:
import numpy as np
import pandas as pd
from pprint import pprint
data = pd.read_csv('titanic.csv')

In [None]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"
    
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Adult/Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult


In [None]:
data.groupby(['Sex'])['Survived'].sum()

Sex
female    233
male      109
Name: Survived, dtype: int64

In [None]:
trainingData=data[["Pclass","Adult/Child","Sex","Survived"]]
trainingData.head()

Unnamed: 0,Pclass,Adult/Child,Sex,Survived
0,3,Adult,male,0
1,1,Adult,female,1
2,3,Adult,female,1
3,1,Adult,female,1
4,3,Adult,male,0


In [None]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData=trainingData[["Pclass","Adult/Child","Sex","Survived"]].apply(catToNum)
trainingData[["Pclass","Adult/Child","Sex","Survived"]]=catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Pclass,Adult/Child,Sex,Survived
0,2,0,1,0
1,0,0,0,1
2,2,0,0,1
3,0,0,0,1
4,2,0,1,0


In [None]:
len(trainingData)

891

In [None]:
trainingData = trainingData.dropna()
len(trainingData)

891

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)

In [None]:
train.head()

Unnamed: 0,Pclass,Adult/Child,Sex,Survived
57,2,0,1,0
623,2,0,1,0
373,0,0,1,0
526,1,0,0,1
584,2,1,1,0


In [None]:
test.head()

Unnamed: 0,Pclass,Adult/Child,Sex,Survived
409,2,1,0,0
494,2,0,1,0
664,2,0,1,1
210,2,0,1,0
502,2,1,0,0


In [None]:
len(train)

712

In [None]:
len(test)

179

In [None]:
def entropy(col):
    
    elements,freq = np.unique(col,return_counts = True)
    entropy = np.sum([(-freq[i]/np.sum(freq))*np.log2(freq[i]/np.sum(freq)) for i in range(len(elements))])
    return entropy

In [None]:
def InfoGain(data,split_attribute_name,target_name="Survived"):
    """
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"
    """    
    #Calculate the entropy of the total dataset
    parent_entropy = entropy(data[target_name])
    
    ##Calculate the entropy of the dataset
    
    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    
    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    #Calculate the information gain
    Information_Gain = parent_entropy - Weighted_Entropy
    return Information_Gain

In [None]:
def ID3(data,originaldata,features,target_attribute_name="Survived",parent_node_class = None):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    elif len(data)==0:
        ret = np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]
        return ret
    
    elif len(features) ==0:
        return parent_node_class  
    
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
    tree = {best_feature:{}}
    features = [i for i in features if i != best_feature]
    
    for value in np.unique(data[best_feature]):
        value = value
        sub_data = data.where(data[best_feature] == value).dropna()
        subtree = ID3(sub_data,data,features,target_attribute_name,parent_node_class)
        tree[best_feature][value] = subtree
        
    return(tree)

In [None]:
tree = ID3(train,train,train.columns[:-1])
pprint(tree)

{'Sex': {0: {'Pclass': {0.0: {'Adult/Child': {0.0: 1.0, 1.0: 1.0}},
                        1.0: {'Adult/Child': {0.0: 1.0, 1.0: 1.0}},
                        2.0: {'Adult/Child': {0.0: 1.0, 1.0: 1.0}}}},
         1: {'Pclass': {0.0: {'Adult/Child': {0.0: 0.0, 1.0: 0.0}},
                        1.0: {'Adult/Child': {0.0: 0.0, 1.0: 0.0}},
                        2.0: {'Adult/Child': {0.0: 0.0, 1.0: 0.0}}}}}}
