In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import re
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

In [24]:
# reading the file

df1 = pd.read_csv('train_titanic.csv')


In [25]:
# to check if any data is missing

df1.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [26]:
# dropping this column as it does not affect whether the passenger survived or not

df1.drop('PassengerId', axis=1, inplace = True)


In [27]:
y = df1['Survived']

In [28]:
x = df1[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Name', 'Cabin']]


In [29]:
x['Sex'].replace(['male', 'female'],
                        [0, 1], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Sex'].replace(['male', 'female'],


In [30]:
df1.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [31]:
# finding the mean of age of the passengers

mean = x['Age'].mean()

In [32]:
# instead of removing the rows where age value is missing, replacing it by mean of age of the passengers

x['Age'].fillna(mean,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'].fillna(mean,inplace = True)


In [33]:
# replacing missing values by 0

x['Cabin'].fillna(0,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Cabin'].fillna(0,inplace = True)


In [34]:
# replacing missing values by 0 and other values by 1
for i in range(0,891):
    if x['Cabin'][i] != 0:
        x['Cabin'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Cabin'][i] = 1


In [35]:
x['Name-len'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Name-len'] = 0


In [36]:
# removing name and storing the name length, as name length can be important because of some royalty maybe

for i in range(0,891):
    x['Name'][i] = re.sub(r'[^\w\s]', '', x['Name'][i])
    new_lis = x['Name'][i].split(' ') 
    for j in range(0,len(new_lis)):
        x['Name-len'][i] += len(new_lis[j])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Name'][i] = re.sub(r'[^\w\s]', '', x['Name'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Name-len'][i] += len(new_lis[j])


In [37]:
# dropping the name column

x.drop('Name', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop('Name', axis = 1, inplace = True)


In [38]:
y.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [39]:
scaler = StandardScaler()

In [40]:
x = scaler.fit_transform(x)

In [41]:
# training the data

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 42)

In [43]:
dtree = DecisionTreeClassifier(min_samples_leaf=2, max_depth = 2, max_features = 2)

In [44]:
dtree.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=2, max_features=2, min_samples_leaf=2)

In [45]:
predictions = dtree.predict(x_test)

In [46]:
print(confusion_matrix(y_test,predictions))

[[138  19]
 [ 66  45]]


In [57]:
dtree.score(x_train,y_train)

0.6966292134831461

In [56]:
dtree.score(x_test,y_test)

0.6828358208955224

In [47]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.68      0.88      0.76       157
           1       0.70      0.41      0.51       111

    accuracy                           0.68       268
   macro avg       0.69      0.64      0.64       268
weighted avg       0.69      0.68      0.66       268



In [48]:
rfc = RandomForestClassifier(n_estimators = 200, max_depth=3, max_features=7, min_samples_leaf= 5, min_samples_split=4, max_leaf_nodes = 5) 

In [49]:
rfc.fit(x_train,y_train)

RandomForestClassifier(max_depth=3, max_features=7, max_leaf_nodes=5,
                       min_samples_leaf=5, min_samples_split=4,
                       n_estimators=200)

In [50]:
rfc_pred = rfc.predict(x_test)

In [51]:
print(confusion_matrix(y_test,rfc_pred))

[[143  14]
 [ 37  74]]


In [52]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.79      0.91      0.85       157
           1       0.84      0.67      0.74       111

    accuracy                           0.81       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268



In [53]:
rfc.score(x_test,y_test)

0.8097014925373134

In [54]:
rfc.score(x_train,y_train)

0.8394863563402889