In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('./Titanic.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
cols_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

In [5]:
data = data.drop(cols_drop, axis = 1)

In [6]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
lab_enc = LabelEncoder()
data['Sex'] = lab_enc.fit_transform(data['Sex'])
data['Embarked'] = lab_enc.fit_transform(data['Embarked'])
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [8]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [9]:
data = data.fillna(data['Age'].mean())

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [11]:
input_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
output_col = ['Survived']

X_data = data[input_cols]
Y_data = data['Survived']
X_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [12]:
print(X_data.shape, Y_data.shape)

(891, 7) (891,)


In [13]:
class DecisionTree:
    def __init__(self,max_depth = 10,min_split = 2):
        self.max_depth = max_depth
        self.min_split = min_split
        
    def fit(self,X,y):
        self.labels = np.unique(y)
        self.root = self.constructNode(X,y,0)
        # return self.root

    def traverse_predict(self,t,node):
        if node['isLeaf'] == True:
            return node['Predictions']
        else:
            if t[node['column']] < node['threshold']:
                return self.traverse_predict(t,node['children']['Left'])
            else:
                return self.traverse_predict(t,node['children']['Right'])

    def predict(self,t):
        predictions = []
        for i in range(len(t)):
            pred = self.traverse_predict(t.iloc[i],self.root)
            predictions.append(self.labels[np.argmax(pred)])
            
        return np.array(predictions)
    
    def constructNode(self,X,y,cur_depth):
        if len(X) == 0 or cur_depth > self.max_depth:
            return None

        if len(X) >= self.min_split:
            parentEntropy = self.entropy(y)
            weightedChildrenEntropy = np.zeros((X.shape[1],))
    
            for feat in range(X.shape[1]):
                th = X.iloc[:,feat].mean()
                weightedChildrenEntropy[feat] += (X.iloc[:,feat] < th).mean()*self.entropy(y[X.iloc[:,feat] < th])
                weightedChildrenEntropy[feat] += (X.iloc[:,feat] >= th).mean()*self.entropy(y[X.iloc[:,feat] >= th])
    
            feat = np.argmax(parentEntropy - weightedChildrenEntropy)
            th = X.iloc[:,feat].mean()
            node = {'column':X.columns[feat], 
                    'threshold':th,
                    'children':{},
                    'isLeaf':False
                   }
                    
            LX_split = X[X.iloc[:,feat] < th]
            Ly_split = y[X.iloc[:,feat] < th]
            Lchild_node = self.constructNode(LX_split,Ly_split,cur_depth+1)
            
            RX_split = X[X.iloc[:,feat] >= th]
            Ry_split = y[X.iloc[:,feat] >= th]
            Rchild_node = self.constructNode(RX_split,Ry_split,cur_depth+1)

            if Lchild_node == None or Rchild_node == None:
                node['isLeaf'] = True
            else:
                node['children']['Left'] = Lchild_node
                node['children']['Right'] = Rchild_node

        else:
            node = {'column':None, 
                    'children':{},
                    'isLeaf':True
                   }

        if node['isLeaf']:
            node['Predictions'] = self.predictions(y)

        return node

    def predictions(self,y):
        prob = np.zeros(self.labels.shape,dtype=np.float32)
        for i,cls in enumerate(self.labels):
            prob[i] = (y==cls).mean()

        return prob
            

    def entropy(self,y):
        classes,counts = np.unique(y,return_counts=True)
        prob = counts/len(y)
    
        return -1*np.sum(prob*np.log2(prob))

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [15]:
x_train,x_test,y_train,y_test = train_test_split(X_data,Y_data,train_size = 0.8)

In [16]:
dt = DecisionTree(max_depth=7)
# dt = DecisionTreeClassifier(max_depth=7)
# rf = RandomForestClassifier(1000,max_depth=7)
dt.fit(x_train,y_train)
dt.root

{'column': 'Sex',
 'threshold': np.float64(0.6474719101123596),
 'children': {'Left': {'column': 'Pclass',
   'threshold': np.float64(2.135458167330677),
   'children': {'Left': {'column': 'SibSp',
     'threshold': np.float64(0.5539568345323741),
     'children': {'Left': {'column': 'Embarked',
       'threshold': np.float64(1.452054794520548),
       'children': {'Left': {'column': 'Age',
         'threshold': np.float64(34.34995989304812),
         'children': {'Left': {'column': 'Pclass',
           'threshold': np.float64(1.3333333333333333),
           'children': {'Left': {'column': 'Pclass',
             'threshold': np.float64(1.0),
             'children': {},
             'isLeaf': True,
             'Predictions': array([0., 1.], dtype=float32)},
            'Right': {'column': 'Pclass',
             'threshold': np.float64(2.0),
             'children': {},
             'isLeaf': True,
             'Predictions': array([0., 1.], dtype=float32)}},
           'isLeaf': False

In [83]:
accuracy_score(y_train,rf.predict(x_train))

0.8960674157303371

In [84]:
accuracy_score(y_test,rf.predict(x_test))

0.8435754189944135