In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [84]:
# Load the passenger table from titanic.csv (path is relative to the working
# directory) and preview the first rows.
data = pd.read_csv("titanic.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
# Remove the columns that are not part of the feature list used further down.
drop_cols = ["Embarked","Cabin","Name","Ticket","PassengerId"]
data = data.drop(columns=drop_cols)

In [86]:
# Inspect the remaining columns: dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [87]:
# Impute missing ages with the mean age. The fill is restricted to the "Age"
# column: a bare scalar passed to fillna would be written into the missing
# cells of every column, not only Age.
data = data.fillna({"Age": data["Age"].mean()})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [88]:
from sklearn.preprocessing import LabelEncoder
# Replace the string-valued "Sex" column with integer codes so the tree code
# can compare it numerically (in the recorded output below, male is 1 and
# female is 0).
le = LabelEncoder()
data["Sex"] = le.fit_transform(data["Sex"])
# Confirm the dtype change and preview the encoded rows.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [89]:
# Names of the feature columns and of the label column.
input_cols = ["Pclass","Sex","Age","SibSp","Parch","Fare"]
output_cols = ["Survived"]

# Both selections use a list of names, so X and Y are DataFrames.
X, Y = data[input_cols], data[output_cols]
print(X.shape,Y.shape)

(891, 6) (891, 1)


In [90]:
def entropy(cols):
    """Return the base-2 Shannon entropy of the values in ``cols``.

    cols: array-like exposing ``.shape`` (NumPy array or pandas Series).
    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency. An empty input yields 0.0.
    """
    _, counts = np.unique(cols, return_counts=True)
    total = float(cols.shape[0])

    # Relative frequency of each distinct value.
    probabilities = (count / total for count in counts)

    # Start from 0.0 so the result is a float even when there are no values.
    return sum((-1.0 * p * np.log2(p) for p in probabilities), 0.0)

In [91]:
def divideCols(data,fkey,fval):
    """Split ``data`` into two frames around a threshold on one column.

    data: DataFrame to split.
    fkey: name of the column to compare.
    fval: threshold value.

    Returns ``(left, right)``: ``right`` holds the rows where
    ``data[fkey] > fval`` and ``left`` holds all remaining rows (including
    rows whose value is NaN, since NaN compares as not-greater). Both results
    keep the columns, dtypes, row order and index labels of ``data``.
    """
    # One vectorised comparison replaces the row-by-row DataFrame.append loop
    # (append no longer exists in pandas 2.x) and works with any index, not
    # only 0..n-1.
    goes_right = data[fkey] > fval

    left = data[~goes_right]
    right = data[goes_right]

    return left,right

In [92]:
def information_gain(data,fkey,fval):
    """Information gain of splitting ``data`` on ``fkey`` at threshold ``fval``.

    The gain is the entropy of ``data.Survived`` minus the size-weighted
    entropies of ``Survived`` in the two parts returned by ``divideCols``.
    Returns -1000 when the split leaves one part empty.
    """
    left, right = divideCols(data, fkey, fval)

    n_rows = float(data.shape[0])
    weight_left = float(left.shape[0]) / n_rows
    weight_right = float(right.shape[0]) / n_rows

    # A split that sends every row to the same side is reported with a
    # sentinel value instead of a gain.
    if weight_left == 0 or weight_right == 0:
        return -1000

    parent_entropy = entropy(data.Survived)
    children_entropy = weight_left * entropy(left.Survived) + weight_right * entropy(right.Survived)

    return parent_entropy - children_entropy

In [93]:
class DecisionTree:
    """Binary decision tree for the ``Survived`` label.

    Every node splits on one feature at that feature's mean over the rows
    reaching the node; the feature chosen is the one with the highest
    information gain. Rows whose value is above the threshold go to the right
    child, all others to the left child. Predictions are the strings
    'Survived' or 'Dead'.
    """

    def __init__(self,max_depth=5,depth=0):
        """Create an untrained node.

        max_depth: depth at which a node stops splitting and stays a leaf.
        depth: depth of this node (0 for the root).
        """
        self.left = None      # child for rows with value <= fval (None on a leaf)
        self.right = None     # child for rows with value > fval (None on a leaf)
        self.max_depth = max_depth
        self.depth = depth
        self.target = None    # majority label of the rows seen by this node
        self.fkey = None      # name of the feature this node splits on
        self.fval = None      # split threshold (mean of fkey at this node)

    def _majority_label(self,data):
        """Return 'Survived' if the mean of ``data.Survived`` is >= 0.5, else 'Dead'."""
        return 'Survived' if data.Survived.mean()>=0.5 else 'Dead'

    def train(self,data):
        """Fit this node, and recursively its children, on ``data``.

        data: DataFrame with the six feature columns listed below and a
        ``Survived`` column.
        """
        features = ["Pclass","Sex","Age","SibSp","Parch","Fare"]

        # Score each feature by the gain of splitting at its mean value.
        info_gain = [information_gain(data,name,data[name].mean()) for name in features]

        self.fkey = features[np.argmax(info_gain)]
        self.fval = data[self.fkey].mean()
        print("Making Tree Features is",self.fkey)

        # Every node, leaf or internal, stores the majority label of its rows.
        self.target = self._majority_label(data)

        # Depth limit reached: stay a leaf without computing the split.
        if self.depth>=self.max_depth:
            return

        data_left,data_right = divideCols(data,self.fkey,self.fval)

        # The chosen split separates nothing: stay a leaf.
        if data_left.shape[0]==0 or data_right.shape[0]==0:
            return

        # Children are trained on frames re-indexed from 0.
        self.left = DecisionTree(max_depth=self.max_depth,depth=self.depth+1)
        self.left.train(data_left.reset_index(drop=True))

        self.right = DecisionTree(max_depth=self.max_depth,depth=self.depth+1)
        self.right.train(data_right.reset_index(drop=True))

    def predict(self,x_test):
        """Return 'Survived' or 'Dead' for a single row.

        x_test: mapping-like row (e.g. a pandas Series) indexable by feature
        name. The row is routed down the tree; when the chosen side has no
        child, this node's own label is returned.
        """
        branch = self.right if x_test[self.fkey]>self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(x_test)




In [94]:
# Use the first 70% of rows (in file order) for training and the rest for
# testing. The test frame is re-indexed from 0.
split = int(0.7*len(data))
train_data = data.iloc[:split]
test_data = data.iloc[split:].reset_index(drop=True)

print(train_data.shape,test_data.shape)

(623, 7) (268, 7)


In [95]:
# Fit a tree with the default settings (max_depth=5) on the training rows.
d = DecisionTree()
d.train(train_data)

Making Tree Features is Sex
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Parch
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Parch
Making Tre

In [96]:
# Classify each test row in order; the rows are addressed by their 0-based
# index labels.
pred = [d.predict(test_data.loc[ix]) for ix in range(test_data.shape[0])]

In [97]:
# Show the string labels predicted for the test rows.
print(pred)

['Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Survived', 'Dead', 'Survived', 'Survived', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Survived', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Survived', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Survived', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Survived', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survived', 'Survived', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'Dead', 'Survived', 'Dead', 'Survived', 'Survived', 'Dead', 'Dead', 'Survived', 'Dead', 'Dead', 'De

In [98]:
# NOTE(review): despite its name, Y_train holds the Survived labels of
# test_data (the held-out rows), not those of the training set.
Y_train = np.array(test_data.Survived)
print(Y_train.shape)

(268,)


In [99]:
# Map the string labels to integers: 'Survived' -> 1, anything else -> 0.
# Entries that are already 1 stay 1, so re-running this cell does not reset
# every prediction to 0.
pred = [1 if (label == 'Survived' or label == 1) else 0 for label in pred]

print(pred)
        

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [100]:

# Accuracy: share of test rows whose prediction equals the recorded label.
acc = (test_data.Survived == pred).sum()/len(test_data)
print(acc)

0.8171641791044776
