## Problem Statement: Predict passenger survival in the Titanic shipwreck using a Decision Tree Classifier

In [4]:
#Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
# Read the Titanic passenger table from disk and preview its first rows
data = pd.read_csv("Titanic.csv")
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [6]:
#Explore Dataset

In [7]:
# Per-column overview of the loaded frame: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Survived     418 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [8]:
# (rows, columns) of the loaded frame.
data.shape

(418, 12)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,418.0,418.0,332.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188,0.363636
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576,0.481622
min,892.0,1.0,0.17,0.0,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958,0.0
50%,1100.5,3.0,27.0,0.0,0.0,14.4542,0.0
75%,1204.75,3.0,39.0,1.0,0.0,31.5,1.0
max,1309.0,3.0,76.0,8.0,9.0,512.3292,1.0


In [15]:
#Data Cleaning / Find missing data

In [16]:
# Number of missing values in each column
data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Survived         0
dtype: int64

In [17]:
# The Age column has missing data:
# fill the gaps with the mean of the observed ages
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [18]:
# Re-count the missing values per column after the imputation
data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Survived         0
dtype: int64

In [19]:
# Fill missing fares with the mean of the observed fares
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [20]:
# Remove the features this analysis does not use, then preview the result
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Cabin']
data.drop(columns=unused_columns, inplace=True)
data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,34.5,7.8292,0
1,3,female,47.0,7.0,1
2,2,male,62.0,9.6875,0
3,3,male,27.0,8.6625,0
4,3,female,22.0,12.2875,1


In [21]:
# Encoding categorical data: set up the label encoder used for the Sex column
from sklearn.preprocessing import LabelEncoder

le_Sex = LabelEncoder()

In [22]:
# Map the Sex strings to integer codes and store them in a new 'gender' column
sex_codes = le_Sex.fit_transform(data['Sex'])
data['gender'] = sex_codes
data

Unnamed: 0,Pclass,Sex,Age,Fare,Survived,gender
0,3,male,34.50000,7.8292,0,1
1,3,female,47.00000,7.0000,1,0
2,2,male,62.00000,9.6875,0,1
3,3,male,27.00000,8.6625,0,1
4,3,female,22.00000,12.2875,1,0
...,...,...,...,...,...,...
413,3,male,30.27259,8.0500,0,1
414,1,female,39.00000,108.9000,1,0
415,3,male,38.50000,7.2500,0,1
416,3,male,30.27259,8.0500,0,1


In [23]:
# The text column is no longer needed once 'gender' holds its integer codes
data.drop(columns='Sex', inplace=True)
data

Unnamed: 0,Pclass,Age,Fare,Survived,gender
0,3,34.50000,7.8292,0,1
1,3,47.00000,7.0000,1,0
2,2,62.00000,9.6875,0,1
3,3,27.00000,8.6625,0,1
4,3,22.00000,12.2875,1,0
...,...,...,...,...,...
413,3,30.27259,8.0500,0,1
414,1,39.00000,108.9000,1,0
415,3,38.50000,7.2500,0,1
416,3,30.27259,8.0500,0,1


In [24]:
# Feature matrix: every column except the target
x = data.drop(columns=['Survived'])
x

Unnamed: 0,Pclass,Age,Fare,gender
0,3,34.50000,7.8292,1
1,3,47.00000,7.0000,0
2,2,62.00000,9.6875,1
3,3,27.00000,8.6625,1
4,3,22.00000,12.2875,0
...,...,...,...,...
413,3,30.27259,8.0500,1
414,1,39.00000,108.9000,0
415,3,38.50000,7.2500,1
416,3,30.27259,8.0500,1


In [25]:
# Target labels as an (n_samples, 1) column array
y = data['Survived'].to_numpy().reshape(-1, 1)
y.shape

(418, 1)

In [29]:
from sklearn import tree

In [30]:
# Decision tree using the Gini criterion, with a fixed random_state
model = tree.DecisionTreeClassifier(
    criterion='gini',
    random_state=0,
)

In [31]:
# Train the classifier on the complete feature matrix x and label array y.
# NOTE(review): this runs before the train/test split made in a later cell, so
# every row that ends up in x_test has already been seen by this fit.
model.fit(x,y)

DecisionTreeClassifier(random_state=0)

In [32]:
#Split data into training set and test set

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed at 0
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)


In [34]:
# Feature scaling: fit the scaler on the training split only,
# then apply that same transform to both splits
from sklearn.preprocessing import StandardScaler

st = StandardScaler().fit(x_train)
x_train = st.transform(x_train)
x_test = st.transform(x_test)

In [36]:
# Shape of the training labels.
y_train.shape


(334, 1)

In [37]:
# Shape of the training labels.
# NOTE(review): the preceding cell evaluates the same expression — possibly
# x_train.shape was intended here; confirm.
y_train.shape


(334, 1)

In [38]:
# Shape of the test labels.
y_test.shape

(84, 1)

# model.fit(x_train,y_train)

In [39]:
# Fit the tree on the training split only, so that x_test / y_test stay unseen
# and the scores computed from them measure generalisation instead of recall of
# rows the model was already trained on.
# y_train is an (n, 1) column; np.ravel hands the estimator a flat label vector.
# x_train was standardised with `st`, so after this fit `model` expects
# standardised inputs (pass new raw rows through st.transform first).
model.fit(x_train, np.ravel(y_train))

# Predicted class for every row of the (standardised) test split.
y_pred = model.predict(x_test)
y_pred




array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)

In [40]:
# Flatten the (n, 1) test labels into a 1-D vector
y_test = y_test.reshape(-1)
y_test

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)

In [41]:
# Mean accuracy of model's predictions for x_test measured against y_test.
model.score(x_test,y_test)



1.0

In [42]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the test labels
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        39

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [43]:
# Predict the class for one hand-written passenger, given in the feature order
# of x: [Pclass, Age, Fare, gender] (gender 1 is the code assigned to 'male').
# NOTE(review): the row is in raw units, while x_train / x_test were
# standardised with `st` — confirm which representation `model` was fitted on
# and apply st.transform to this row if it was the standardised one.
model.predict([[1,55,59.4,1]])



array([0], dtype=int64)