Predicting Survival in the Titanic Data Set
We will be using a decision tree to make predictions about the Titanic data
set from Kaggle. This data set provides information on the Titanic
passengers and can be used to predict whether a passenger survived or
not.
You use only Pclass, Sex, Age, SibSp (Siblings aboard), Parch
(Parents/children aboard), and Fare to predict whether a passenger
survived.
NOTE:

In [177]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

Url='https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(Url)
titanic.columns =['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
import warnings

In [178]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [179]:
df=titanic[['Pclass','Sex','Age','SibSp','Parch','Fare','Survived']]

In [180]:
df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,714.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104,0.0
50%,3.0,28.0,0.0,0.0,14.4542,0.0
75%,3.0,38.0,1.0,0.0,31.0,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [181]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.25,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.925,1
3,1,female,35.0,1,0,53.1,1
4,3,male,35.0,0,0,8.05,0


In [182]:
df['Age']=df['Age'].fillna(df['Age'].median())

In [185]:
df.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Survived    0
dtype: int64

In [186]:
warnings.filterwarnings('ignore')

label=LabelEncoder()
df['Sex']=label.fit_transform(df['Sex'])

In [188]:
X=df.drop('Survived',axis=1)
y=df['Survived']

In [191]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state= 355)

In [192]:
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)

DecisionTreeClassifier()

In [193]:
clf.score(x_train,y_train)

0.9850299401197605

In [194]:
py_pred = clf.predict(x_test)

In [195]:
clf.score(x_test,y_test)

0.7309417040358744

In [196]:
scalar = StandardScaler()

x_transform = scalar.fit_transform(X)

In [197]:
x_train,x_test,y_train,y_test = train_test_split(x_transform,y,test_size = 0.25, random_state= 355)

In [198]:
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.7309417040358744

In [199]:
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}

In [200]:
grid_search = GridSearchCV(estimator=clf,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [201]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']})

In [202]:
best_parameters = grid_search.best_params_
print(best_parameters)

{'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 3, 'min_samples_split': 3, 'splitter': 'random'}


In [203]:
grid_search.best_score_

0.8293120861856134

In [205]:
clf = DecisionTreeClassifier(criterion = 'entropy', max_depth =30, min_samples_leaf= 3, min_samples_split= 3, splitter ='random')
clf.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=30, min_samples_leaf=3,
                       min_samples_split=3, splitter='random')

In [206]:
clf.score(x_test,y_test)

0.8071748878923767

In [207]:
predictions = clf.predict(x_test)

In [208]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85       145
           1       0.73      0.72      0.72        78

    accuracy                           0.81       223
   macro avg       0.79      0.79      0.79       223
weighted avg       0.81      0.81      0.81       223



In [211]:
clf.predict(scalar.transform([['1','0','38.0','1','0','71.25']]))


array([1], dtype=int64)