In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("titanic.csv")
print(data.head(n=10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [3]:
print(data.info()) #Age, Cabin, and Embarked are not there for every passenger so we might need to select some values for the missing spaces
#But age is important column, while name doesn't matter
#So we will fill the values for ages, and just drop name

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [4]:
DropColumns = ['PassengerId','Cabin','Embarked','Cabin','Name'] #We don't need these columns
clean = data.drop(DropColumns,axis=1) #Removes those columns
clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare
0,0,3,male,22.0,1,0,A/5 21171,7.25
1,1,1,female,38.0,1,0,PC 17599,71.2833
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925
3,1,1,female,35.0,1,0,113803,53.1
4,0,3,male,35.0,0,0,373450,8.05


In [5]:
from sklearn.preprocessing import LabelEncoder
#We can see that Sex is given in text while other thing is given in numbers
#Hence we use Label Encoder to convert text into number
le = LabelEncoder()
clean['Sex'] = le.fit_transform(clean["Sex"]) 
clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare
0,0,3,1,22.0,1,0,A/5 21171,7.25
1,1,1,0,38.0,1,0,PC 17599,71.2833
2,1,3,0,26.0,0,0,STON/O2. 3101282,7.925
3,1,1,0,35.0,1,0,113803,53.1
4,0,3,1,35.0,0,0,373450,8.05


In [6]:
clean = clean.drop('Ticket',axis=1)
clean.head()
#Realized that ticket number is not important to dropped it too

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [7]:
#Let's check info about data again
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


In [8]:
#All the important columns are full
#Need to fill missing values of age
clean = clean.fillna(clean["Age"].mean()) #Fill the missing rows with mean of columns
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


In [9]:
clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [10]:
inpu = ['Pclass',"Sex","Age","SibSp","Parch","Fare"] #Taking all the inputs
output = ["Survived"] #Label

X = clean[inpu] #X contains all the variables except the answer
Y = clean[output] #Stores the answer

print(X.shape,Y.shape)

(891, 6) (891, 1)


In [11]:
#Entropy
#We will try to compute the entropy of every column in the dataset
def entropy(col):
    
    counts = np.unique(col,return_counts=True) #It will return the values present and their counts in that column
    n = float(col.shape[0]) #This will give the value of number of rows in that column
    
    ent = 0.0
    
    for ix in counts[1]: #Will check the count of each value in column
        p  = ix/n #Compute it's probability
        ent += (-1.0*p*np.log2(p)) #Simply put it in formula
    
    return ent

In [13]:
print(entropy(clean['Sex'])) #High entropy

0.9362046432498521


In [14]:
def divide_data(data,fkey,fval): #Data will be whole dataset, fkey will be the column, while fval is the threshold value 
    #Work with Pandas Data Frames
    right = pd.DataFrame([],columns=data.columns) #Creating for values greater than threshold
    left = pd.DataFrame([],columns=data.columns) #Creating for values lesser than threshold
    
    for ix in range(data.shape[0]):
        val = data[fkey].loc[ix] #checking a particular row in that column
        
        if val > fval:
            right = right.append(data.loc[ix]) #values greater than threshold go to right
        else:
            left = left.append(data.loc[ix]) #values lesser than threshold go to left
            
    return left,right

In [15]:
left, right = divide_data(clean[:10],"Sex",0.5) #Just testing
print(left) #Just testing
print(right)

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
1       1.0     1.0  0.0  38.0    1.0    0.0  71.2833
2       1.0     3.0  0.0  26.0    0.0    0.0   7.9250
3       1.0     1.0  0.0  35.0    1.0    0.0  53.1000
8       1.0     3.0  0.0  27.0    0.0    2.0  11.1333
9       1.0     2.0  0.0  14.0    1.0    0.0  30.0708
   Survived  Pclass  Sex        Age  SibSp  Parch     Fare
0       0.0     3.0  1.0  22.000000    1.0    0.0   7.2500
4       0.0     3.0  1.0  35.000000    0.0    0.0   8.0500
5       0.0     3.0  1.0  29.699118    0.0    0.0   8.4583
6       0.0     1.0  1.0  54.000000    0.0    0.0  51.8625
7       0.0     3.0  1.0   2.000000    3.0    1.0  21.0750


In [16]:
def information_gain(data,fkey,fval):
    
    left,right = divide_data(data,fkey,fval) #It will divide and give data
    
    #% of total samples are on left and right
    l = float(left.shape[0])/data.shape[0] #Will calculate probability of left side
    r = float(right.shape[0])/data.shape[0] #Will calculate probability of right side
    
    #All examples come to one side!
    if left.shape[0] == 0 or right.shape[0] ==0: #No point doing it if all is going to one side
        return -1000000 #Min Information Gain
    
    gain = entropy(data.Survived) - (l*entropy(left.Survived)+r*entropy(right.Survived)) #Simply putting in formula
    return gain

In [20]:
for col in X.columns:
    print(col)
    print(information_gain(clean,col,clean[col].mean()))
    #This will give the information gain for all the columns and we can use it for decision tree

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.001158644038169343
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


In [21]:
class DecisionTree:
    
    #Constructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None #We will need left
        self.right = None #We will need right
        self.fkey = None #The column to be considered
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth 
        self.target = None #The answer
        
    def train(self,X_train):
        
        features = ['Pclass','Sex','Age','SibSp', 'Parch', 'Fare'] #All the features
        info_gains = [] #Will keep storing the info gains of all columns here
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean()) #Will calculate all the features of information gains
            info_gains.append(i_gain) #Store all the gains here
            
        self.fkey = features[np.argmax(info_gains)] #The column with maximum info gain is made the key
        self.fval = X_train[self.fkey].mean() #Dividing the data
        print("Making Tree Features is",self.fkey) #Storing the information about which node is being made
        
        #Split Data
        left,right = divide_data(X_train,self.fkey,self.fval) #Dividing into left and right
        left = left.reset_index(drop=True) #Then resetting the row values
        right = right.reset_index(drop=True) #Then resetting the row values
         
        #Truly a left node
        if left.shape[0]  == 0 or right.shape[0] ==0: #If no more information gain is possible
            if X_train.Survived.mean() >= 0.5: #Deciding the value according to the mean on that node
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth): #Don't want to overfit
            if X_train.Survived.mean() >= 0.5: #Deciding the value according to the mean on that node
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        
        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth) #The class needs to be called again, with increase in current depth
        self.left.train(left) #Training the left
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth) #Same as left
        self.right.train(right)
        
        #You can set the target at every node
        if X_train.Survived.mean() >= 0.5:
            self.target = "Survive"
        else:
            self.target = "Dead"
        return
    def predict(self,test):
        if test[self.fkey]>self.fval: #If it is greater than mean it should go to right node
            #go to right
            if self.right is None: #Now if right node is not there, answer needs to be returned
                return self.target
            return self.right.predict(test) #Otherwise go to right and repeat whole process
        else:
            if self.left is None: #Now if left node is not there, answer needs to be returned
                return self.target
            return self.left.predict(test) #Otherwise go to left and repeat whole process

In [23]:
split = int(0.8*clean.shape[0]) #Taking split as 80% of total number of rows
train_data = clean[:split] #Taking first 80% of rows in train set
test_data = clean[split:] #Taking other rows in test set
test_data = test_data.reset_index(drop=True)

In [24]:
print(train_data.shape,test_data.shape)

(712, 7) (179, 7)


In [25]:
dt = DecisionTree() #Creating the instance of class
dt.train(train_data) #Training the model

Making Tree Features is Sex
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Parch
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree 

In [26]:
pred = []
for ix in range(test_data.shape[0]): #All the predictions made for the test set
    pred.append(dt.predict(test_data.loc[ix])) #All the predictions stored

In [27]:
actual = test_data[output]

In [28]:
#The answer is in the form of dead and survived so need to convert to 0 and 1 using label encoder
le = LabelEncoder()
pred = le.fit_transform(pred)

In [29]:
pred = np.array(pred).reshape((-1,1))
print(pred.shape)
#Converting pred into right shape and converting it to np array

(179, 1)


In [30]:
accuracy = np.sum(np.array(pred)==np.array(actual))/pred.shape[0]
#Calculating the accuracy
#It is done by taking sum of number of same in pred and actual dividing by total number of prediction
print(accuracy)

0.8715083798882681
