The Titanic project - Model building

Import the data

In [1]:
#importing the libraries and the Titanic dataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#df initially ingests the Titanic data
df = pd.read_csv('titanic_dataset.csv') #read the Titanic data

Data pre-processing

In [2]:
# Adding a column Family_Size
df['Family_Size'] = 0
df['Family_Size'] = df['Parch']+df['SibSp']
 
# Adding a column Alone
df['Alone'] = 0
df.loc[df.Family_Size == 0, 'Alone'] = 1

In [3]:
avg_age_M = df[df.Sex =='male'].Age.mean()
avg_age_F = df[df.Sex =='female'].Age.mean()
#print(avg_age_M)
#print(avg_age_F)
no_age_list = df.Age.isna() #build list of passengers that need their age fixed

In [4]:
#Filling in the null values in Embarked and Age
df.Embarked = df.Embarked.fillna((df.Embarked.mode()[0])) #filling in Embarked field with the mode (S for the training set)
df.loc[(no_age_list & (df.Sex=='male')), 'Age'] = avg_age_M #all men w/o age get the average age for men
df.loc[(no_age_list & (df.Sex=='female')), 'Age'] = avg_age_F #all women w/o age get the average age for women
df_target = df.Survived
df_data = df.drop(['PassengerId', 'Survived', 'Name','SibSp','Parch','Ticket','Cabin'], axis=1)

In [5]:
#Scaling the numbers
from sklearn.preprocessing import StandardScaler
train_numerical_features = list(df_data.select_dtypes(include=['int64', 'float64', 'int32']).columns)
print(train_numerical_features)
ss_scaler = StandardScaler()
train_df_ss = pd.DataFrame(data = df_data)
train_df_ss[train_numerical_features] = ss_scaler.fit_transform(train_df_ss[train_numerical_features])
train_df_ss.head()

['Pclass', 'Age', 'Fare', 'Family_Size', 'Alone']


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_Size,Alone
0,0.827377,male,-0.594732,-0.502445,S,0.05916,-1.231645
1,-1.566107,female,0.635319,0.786845,C,0.05916,-1.231645
2,0.827377,female,-0.28722,-0.488854,S,-0.560975,0.811922
3,-1.566107,female,0.404684,0.42073,S,0.05916,-1.231645
4,0.827377,male,0.404684,-0.486337,S,-0.560975,0.811922


In [6]:
#One hot encoding of categorical data (Sex and Embarked)
one_hot_Sex = pd.get_dummies(train_df_ss['Sex'])
train_df_ss = train_df_ss.join(one_hot_Sex)
one_hot_embarked = pd.get_dummies(train_df_ss['Embarked'])
train_df_ss = train_df_ss.join(one_hot_embarked)
train_df_ss.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_Size,Alone,female,male,C,Q,S
0,0.827377,male,-0.594732,-0.502445,S,0.05916,-1.231645,0,1,0,0,1
1,-1.566107,female,0.635319,0.786845,C,0.05916,-1.231645,1,0,1,0,0
2,0.827377,female,-0.28722,-0.488854,S,-0.560975,0.811922,1,0,0,0,1
3,-1.566107,female,0.404684,0.42073,S,0.05916,-1.231645,1,0,0,0,1
4,0.827377,male,0.404684,-0.486337,S,-0.560975,0.811922,0,1,0,0,1


In [7]:
#Dropping columns rendered irrelevant from the training set DataFrame
train_df_ss = train_df_ss.drop(['Sex','Embarked','male'], axis=1)
train_df_ss.head()

Unnamed: 0,Pclass,Age,Fare,Family_Size,Alone,female,C,Q,S
0,0.827377,-0.594732,-0.502445,0.05916,-1.231645,0,0,0,1
1,-1.566107,0.635319,0.786845,0.05916,-1.231645,1,1,0,0
2,0.827377,-0.28722,-0.488854,-0.560975,0.811922,1,0,0,1
3,-1.566107,0.404684,0.42073,0.05916,-1.231645,1,0,0,1
4,0.827377,0.404684,-0.486337,-0.560975,0.811922,0,0,0,1


Model training

In [8]:
from sklearn.model_selection import cross_val_score 

In [9]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
#split prepared Titanic data between the test set (after the best model is chosen) 
#and the train set (which will be cross-validated) 
X_train, X_test, y_train, y_test = train_test_split(train_df_ss, df_target, test_size=0.2, random_state=83)

In [10]:
#cross-validate Logistic Regression model 
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)
print('Logistic Regression cross validation of training data: ' + str(cross_val_score(logreg, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('Logistic Regression validation on test data: ' + str(metrics.accuracy_score(logreg.predict(X_test), y_test)) + '\n')

#cross-validate Nearest Neighbors model 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
print('KNN cross validation of training data: ' + str(cross_val_score(logreg, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('KNN validation on test data: ' + str(metrics.accuracy_score(logreg.predict(X_test), y_test)) + '\n')

#cross-validate Gaussian Process model 
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X_train, y_train)
print('Gaussian Process Classifier cross validation of training data: ' + str(cross_val_score(gpc_rbf_isotropic, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('Gaussian Process Classifier validation on test data: ' + str(metrics.accuracy_score(gpc_rbf_isotropic.predict(X_test), y_test)) + '\n')

#cross-validate naive bayes model 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
print('Naive Bayes cross validation of training data: ' + str(cross_val_score(gnb, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('Naive Bayes validation on test data: ' + str(metrics.accuracy_score(gnb.predict(X_test), y_test)) + '\n')

#cross-validate Decision Tree model 
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
print('Decision Tree cross validation of training data: ' + str(cross_val_score(dtc, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('Decision Tree validation on test data: ' + str(metrics.accuracy_score(dtc.predict(X_test), y_test)) + '\n')

#cross-validate Random Forest model 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
print('Random Forest cross validation of training data: ' + str(cross_val_score(rfc, X_train, y_train, cv=10, scoring='accuracy').mean()))
print('Random Forest validation on test data: ' + str(metrics.accuracy_score(rfc.predict(X_test), y_test)) + '\n')


Logistic Regression cross validation of training data: 0.8047926447574335
Logistic Regression validation on test data: 0.8100558659217877

KNN cross validation of training data: 0.8047926447574335
KNN validation on test data: 0.8100558659217877

Gaussian Process Classifier cross validation of training data: 0.814651799687011
Gaussian Process Classifier validation on test data: 0.8324022346368715

Naive Bayes cross validation of training data: 0.7767410015649452
Naive Bayes validation on test data: 0.8044692737430168

Decision Tree cross validation of training data: 0.779557902973396
Decision Tree validation on test data: 0.770949720670391

Random Forest cross validation of training data: 0.8076486697965573
Random Forest validation on test data: 0.8212290502793296



Parameter Tuning

In [11]:
from sklearn.model_selection import GridSearchCV

Exporting the model

In [12]:
import joblib
joblib.dump(gpc_rbf_isotropic, 'Titanic_model.pkl')

['Titanic_model.pkl']