## Implementing a Decision Tree Classifier from Scratch
###code written by Uday kiran Bakka

In [307]:
#importing required modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report

In [317]:
# Load the Titanic training set (features and the Survived label) from CSV.
data = pd.read_csv("titanic_x_y_train.csv", sep=",")
data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [318]:
## Data cleaning -- skip ahead if you are already familiar with this step.
# Remove the free-text columns that are not used as features.
data.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)
def change(st):
    """Encode a ``Sex`` value as an integer.

    Returns 0 when *st* is exactly the string "female" and 1 for every other
    value (including "male", differently-cased strings and missing values).
    """
    return 0 if st == "female" else 1
# Encode Sex as 0 (female) / 1 (anything else) and drop the unused port column.
data["Sex"] = data["Sex"].apply(change)
data.drop("Embarked", axis=1, inplace=True)

# Replace missing values with the column mean.  The result is assigned back to
# the frame instead of calling ``data.<col>.fillna(..., inplace=True)``: that
# form is a chained in-place operation on a temporary Series, which pandas
# warns about and which no longer updates ``data`` under Copy-on-Write.
for column in ["Pclass", "Age", "SibSp", "Parch", "Fare", "Survived"]:
    data[column] = data[column].fillna(data[column].mean())

# Separate the target, then keep only the categorical feature columns
# (Age and Fare are continuous and are not used by the tree below).
y_train = data["Survived"]
data.drop(["Survived", "Age", "Fare"], axis=1, inplace=True)
x_train = data
data

Unnamed: 0,Pclass,Sex,SibSp,Parch
0,2,0,1,0
1,3,1,0,0
2,2,1,0,0
3,3,0,0,4
4,3,1,0,0
...,...,...,...,...
663,2,0,0,0
664,3,1,0,0
665,3,1,0,0
666,3,0,0,0


In [319]:
# Show the first ten feature rows followed by the first ten labels.
print(x_train.head(10))
print(y_train.head(10))

   Pclass  Sex  SibSp  Parch
0       2    0      1      0
1       3    1      0      0
2       2    1      0      0
3       3    0      0      4
4       3    1      0      0
5       3    1      0      0
6       1    0      1      0
7       3    1      0      0
8       2    1      0      0
9       3    1      0      0
0    1
1    0
2    0
3    0
4    0
5    0
6    1
7    0
8    1
9    0
Name: Survived, dtype: int64


In [321]:
# Tree class: a single node that stores the structure of the decision tree
class Tree:
    """One node of the decision tree.

    A split node has ``split_feature``/``split_class`` set and owns two
    subtrees; a leaf node has only ``answer`` set.
    """

    def __init__(self):
        # Column index tested at this node; stays None on a leaf.
        self.split_feature = None
        # Value the tested column is compared against with ``==``.
        self.split_class = None
        # Subtree followed when the row's value equals ``split_class``.
        self.true_tree = None
        # Subtree followed for every other value.
        self.false_tree = None
        # Predicted label; only meaningful on a leaf.
        self.answer = None


# Decision Tree implementation
class DecisonTree:
    """Decision tree classifier built from ``feature == value`` splits.

    The tree is grown greedily: at every node each (feature, value) pair is
    scored by Gini gain and the best one becomes the split.  Growth stops when
    a node is pure, holds a single row, or no split has a positive gain; in the
    last case the leaf predicts the most frequent label (any number of
    classes is supported).

    Parameters
    ----------
    verbose : bool, default True
        When true, every split chosen while fitting is printed.
    """

    def __init__(self, verbose=True):
        # Root ``Tree`` node; None until ``fit`` has been called.
        self.tree = None
        self.verbose = verbose

    def find_counts(self, classes):
        """Return a dict mapping each distinct label in *classes* to its count."""
        count = {}
        for label in classes:
            count[label] = count.get(label, 0) + 1
        return count

    def find_gini(self, classes):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of *classes*.

        An empty collection yields 1; callers weight that value by a zero
        row count, so it never contributes to a gain.
        """
        counts = self.find_counts(classes)
        total = sum(counts.values())
        impurity = 1
        for count in counts.values():
            probability = count / float(total)
            impurity -= probability ** 2
        return impurity

    def find_gini_gain(self, gb, ga1, ga2, total, sub_total_1, sub_total_2):
        """Return the impurity reduction achieved by a two-way split.

        *gb* is the impurity before the split, *ga1*/*ga2* the impurities of
        the two parts, *sub_total_1*/*sub_total_2* their row counts and
        *total* the row count before splitting.
        """
        return (gb) - (sub_total_1 / total) * ga1 - (sub_total_2 / total) * ga2

    def find_best_class(self, x, y, feature):
        """Find the value of column *feature* whose equality split gains most.

        Returns a tuple ``(feature, best_value, best_gain)``.  When several
        values tie, the smallest one (first in ``np.unique`` order) wins.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        column = x[:, feature]
        # The parent impurity does not depend on the candidate value.
        gini_before = self.find_gini(y)
        best_gini_gain = None
        best_split = None
        for cls in np.unique(column):
            matches = column == cls
            yes_labels = y[matches]
            no_labels = y[~matches]
            gini_gain = self.find_gini_gain(
                gini_before,
                self.find_gini(yes_labels),
                self.find_gini(no_labels),
                len(y),
                len(yes_labels),
                len(no_labels),
            )
            if best_gini_gain is None or gini_gain > best_gini_gain:
                best_gini_gain = gini_gain
                best_split = cls
        return feature, best_split, best_gini_gain

    def _majority_class(self, y):
        """Return the most frequent label in *y*; ties go to the smallest label.

        For 0/1 labels this matches the former ``sum(y)/len(y) > 0.5`` rule
        (a tie predicts 0) while also working for any other label set.
        """
        counts = self.find_counts(y)
        highest = max(counts.values())
        return min(label for label, count in counts.items() if count == highest)

    def fit(self, x, y):
        """Build the tree from a 2-D feature array *x* and a label array *y*.

        Raises ValueError when *x* is not two-dimensional or when *x* and *y*
        have different lengths.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (rows, features)")
        if len(x) != len(y):
            raise ValueError("x and y must contain the same number of rows")
        self.tree = self.BuildTree(x, y)

    def BuildTree(self, x, y):
        """Recursively build and return the subtree for the rows *x*, *y*.

        Raises ValueError when called with no rows.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if len(x) == 0:
            raise ValueError("cannot build a tree from an empty dataset")

        Node = Tree()

        # A single row or a single label value: the data is pure.
        if len(x) == 1 or len(np.unique(y)) == 1:
            Node.answer = y[0]
            return Node

        split_gini_class = None
        split_gini_feature = None
        split_gini_value = None

        # Score every feature; the strict comparison keeps the first feature
        # on ties.
        for feature in range(x.shape[1]):
            gini_feature, gini_class, gini_value = self.find_best_class(x, y, feature)
            if gini_value is None:
                continue
            if split_gini_value is None or split_gini_value < gini_value:
                split_gini_class = gini_class
                split_gini_feature = gini_feature
                split_gini_value = gini_value

        # No split reduces impurity (or there is nothing to split on):
        # predict the most frequent label.
        if split_gini_value is None or split_gini_value <= 0.0:
            Node.answer = self._majority_class(y)
            return Node

        # getattr keeps instances created without __init__ working.
        if getattr(self, "verbose", True):
            print("Split on feature-----------------------", split_gini_feature)
            print("Split on class------------------------", split_gini_class)
            print("Gini value-------------------------", split_gini_value)
            print()
            print("--------------------------------------------------------")
            print()

        # Rows satisfying the split condition go to true_tree, the rest to
        # false_tree.  A positive gain guarantees both parts are non-empty.
        matches = x[:, split_gini_feature] == split_gini_class
        Node.split_feature = split_gini_feature
        Node.split_class = split_gini_class
        Node.true_tree = self.BuildTree(x[matches], y[matches])
        Node.false_tree = self.BuildTree(x[~matches], y[~matches])
        return Node

    def predict(self, x):
        """Return an array with the predicted label for every row of *x*."""
        x = np.array(x)
        return np.array([self.predict_one(row) for row in x])

    def predict_one(self, x):
        """Return the predicted label for the single feature row *x*.

        Returns None when the tree has not been fitted or when the walk
        reaches a split node without the required child.
        """
        x = np.array(x)
        node = self.tree
        while node is not None:
            if node.split_feature is None:
                return node.answer
            if x[node.split_feature] == node.split_class:
                node = node.true_tree
            else:
                node = node.false_tree
        return None
# Train the hand-written tree on the features and labels as plain arrays.
dt = DecisonTree()
dt.fit(x_train.to_numpy(), y_train.to_numpy())


Split on feature----------------------- 1
Split on class------------------------ 0
Gini value------------------------- 0.13372854794476438

--------------------------------------------------------

Split on feature----------------------- 0
Split on class------------------------ 3
Gini value------------------------- 0.10444138444657812

--------------------------------------------------------

Split on feature----------------------- 2
Split on class------------------------ 0
Gini value------------------------- 0.03084848044956845

--------------------------------------------------------

Split on feature----------------------- 3
Split on class------------------------ 5
Gini value------------------------- 0.027421024650941483

--------------------------------------------------------

Split on feature----------------------- 3
Split on class------------------------ 4
Gini value------------------------- 0.014998469543924031

--------------------------------------------------------

Split on

In [322]:
# Score the hand-written tree on the same data it was trained on.
y_pred = dt.predict(x_train)
for report in (confusion_matrix, classification_report):
    print(report(y_train, y_pred))

[[375  24]
 [ 97 172]]
              precision    recall  f1-score   support

           0       0.79      0.94      0.86       399
           1       0.88      0.64      0.74       269

    accuracy                           0.82       668
   macro avg       0.84      0.79      0.80       668
weighted avg       0.83      0.82      0.81       668



In [323]:
# Fit scikit-learn's tree on the same training data for comparison.
from sklearn.tree import DecisionTreeClassifier
df = DecisionTreeClassifier()
df.fit(x_train, y_train)
y_pred = df.predict(x_train)
for report in (confusion_matrix, classification_report):
    print(report(y_train, y_pred))

[[375  24]
 [ 97 172]]
              precision    recall  f1-score   support

           0       0.79      0.94      0.86       399
           1       0.88      0.64      0.74       269

    accuracy                           0.82       668
   macro avg       0.84      0.79      0.80       668
weighted avg       0.83      0.82      0.81       668



In [324]:
##we can see that our classifier produces the same confusion matrix and scores as the built-in scikit-learn classifier on the training data
##enjoy and keep coding :)
#code written by Uday kiran Bakka