In [1]:
import pandas as pd
import numpy as np

In [26]:
train_data = pd.read_csv('Titanic.csv')
test_data = pd.read_csv('Titanic test.csv')

In [27]:
train_data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [28]:
columns_to_drop=["name","ticket","cabin","embarked","boat","body","home.dest"]
traindata_clean = train_data.drop(columns_to_drop,axis =1)
testdata_clean = test_data.drop(columns_to_drop,axis = 1)

In [29]:

traindata_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [30]:
traindata_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null object
age         812 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1008 non-null float64
dtypes: float64(6), object(1)
memory usage: 55.3+ KB


In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
le = LabelEncoder()

In [35]:
traindata_clean['sex'] = le.fit_transform(traindata_clean['sex'])
testdata_clean['sex'] = le.fit_transform(testdata_clean['sex'])

In [37]:
traindata_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [38]:
traindata_clean=traindata_clean.fillna(traindata_clean["age"].mean())

In [40]:
testdata_clean=testdata_clean.fillna(testdata_clean["age"].mean())

In [42]:
input_cols = ['pclass','sex','age','sibsp','parch','fare']
output_cols = ['survived']


In [44]:
x_train=traindata_clean[input_cols]
y_train=traindata_clean[output_cols]

x_test=testdata_clean[input_cols]

In [47]:
print(x_train.shape,y_train.shape)
print(x_test.shape)

(1009, 6) (1009, 1)
(300, 6)


In [48]:
def entropy(col):
    counts = np.unique(col,return_counts=True)
    N = float(col.shape[0])
    entropy = 0.0
    for ix in counts[1]:
        p = ix/N
        entropy += (-1*p*np.log2(p))
    return entropy
    

In [49]:
def divide_data(x_data,fkey,fval):
    # works with pandas dataframe 
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        if val > fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
    return x_left,x_right

In [50]:
def information_gain(x_data,fkey,fval): # here fkey is features name like sex , income , fare
                                        # here fval is  threshold value of that feature name that we defined
        left,right = divide_data(x_data,fkey,fval)
        
        # compute % of total sample on left and right
        l = float(left.shape[0])/x_data.shape[0]
        r = float(right.shape[0])/x_data.shape[0]
        
        if left.shape[0]==0 or right.shape[0]==0:
            return -1000000 # min information gain
        
        i_gain = entropy(x_data.survived) - (l*entropy(left.survived) + r*entropy(right.survived))
        return i_gain

In [51]:
class DecisionTree:
    
    #Constructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self,X_train):
        
        features = ['pclass','sex','age','sibsp','parch','fare']

        info_gains = []
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree Features is",self.fkey)
        
        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
         
        #Truly a left node
        if data_left.shape[0]  == 0 or data_right.shape[0] ==0:
            if X_train.survived.mean() >= 0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth):
            if X_train.survived.mean() >= 0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        
        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        #You can set the target at every node
        if X_train.survived.mean() >= 0.5:
            self.target = "Survive"
        else:
            self.target = "Dead"
        return
    def predict(self,test):
        if test[self.fkey]>self.fval:
            # go to right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [52]:
dt = DecisionTree()

In [54]:
dt.train(traindata_clean)

Making Tree Features is sex
Making Tree Features is pclass
Making Tree Features is pclass
Making Tree Features is fare
Making Tree Features is sibsp
Making Tree Features is age
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is parch
Making Tree Features is fare
Making Tree Features is sibsp
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is parch
Making Tree Features is sibsp
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is sibsp
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is parch
Making T

In [86]:
y_pred = []
for ix in range(train_data.shape[0]):
    y_pred.append(dt.predict(traindata_clean.loc[ix]))

In [87]:
le =LabelEncoder()

In [88]:
y_pred = le.fit_transform(y_pred)

In [89]:
print(y_pred.shape)

(1009,)


In [90]:
y_actual = traindata_clean[output_cols]

In [66]:
y_actual

In [91]:
y_actual.shape

(1009, 1)

In [92]:
y_pred = np.array(y_pred).reshape((-1,1))
print(y_pred.shape)

(1009, 1)


In [94]:
acc  =np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

In [96]:
print(acc)

0.8057482656095144


In [93]:
y_pred=pd.DataFrame(y_pred,columns=['survived'])

In [74]:
y_pred.to_csv('titanic_predictions.csv')

### Decision Tree using Sklearn

In [97]:
from sklearn.tree import DecisionTreeClassifier

In [107]:
sk_tree  = DecisionTreeClassifier(criterion='entropy',max_depth=5)

In [104]:
sk_tree.fit(traindata_clean[input_cols],traindata_clean[output_cols])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [105]:
sk_tree.predict(testdata_clean[input_cols])

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 1., 1., 0.

In [106]:
sk_tree.score(traindata_clean[input_cols],traindata_clean[output_cols])

0.8275520317145689

In [109]:
!pip install pydotplus


Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
Building wheels for collected packages: pydotplus
  Building wheel for pydotplus (setup.py): started
  Building wheel for pydotplus (setup.py): finished with status 'done'
  Created wheel for pydotplus: filename=pydotplus-2.0.2-py3-none-any.whl size=24571 sha256=2c2f6fd8bcaff65e86ccf20240a80e37fd532e9fa5ae52affb7c14fbb5bafa46
  Stored in directory: c:\users\ritik\appdata\local\pip\cache\wheels\1e\7b\04\7387cf6cc9e48b4a96e361b0be812f0708b394b821bf8c9c50
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2
