## To model survival of the Titanic disaster 5 of the more common place Machine Learning Algorithms were used including Logistic Regression, Perceptron, Linear Support Vector Machine, Random Forest and a Decision Tree classifier. 

In [293]:
# libraries used for data manipulation
import pandas as pd
import numpy as np
# libraries used for predictive modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# libraries used for data visualizations
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
# ignore warnings from plotting
import warnings
warnings.filterwarnings("ignore")

In [264]:
# Read in initial training and testing datasets locally
titanictrain = pd.read_csv('C:\\Users\\timho\\OneDrive\\Desktop\\ML\\Titanictrain.csv')
titanictest = pd.read_csv('C:\\Users\\timho\\OneDrive\\Desktop\\ML\\Titanictest.csv')

In [265]:
frames = [titanictest, titanictrain]

In [266]:
fulldf = pd.concat(frames)
fulldf.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,


In [267]:
# Replace missing values in training set
#fulldf['Cabin'].fillna(fulldf['Cabin'].mode()[0], inplace=True)
fulldf['Age'] = fulldf['Age'].fillna(34.5)

In [268]:
fulldf.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Survived        418
dtype: int64

In [269]:
# Convert string variables to factors
stringvars = fulldf[['Fare', 'Sex', 'Embarked']]

In [270]:
dummyvars = pd.get_dummies(fulldf[['Fare','Embarked', 'Sex']])
dummyvars

Unnamed: 0,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,7.8292,0,1,0,0,1
1,7.0000,0,0,1,1,0
2,9.6875,0,1,0,0,1
3,8.6625,0,0,1,0,1
4,12.2875,0,0,1,1,0
...,...,...,...,...,...,...
886,13.0000,0,0,1,0,1
887,30.0000,0,0,1,1,0
888,23.4500,0,0,1,1,0
889,30.0000,1,0,0,0,1


#### Combine dummy variables with titanic dataset and subset final rows into the training and testing datasets.

In [271]:
finaltrain = pd.concat([fulldf,dummyvars], axis = 1)

In [272]:
finaltrain.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Survived        418
Fare              1
Embarked_C        0
Embarked_Q        0
Embarked_S        0
Sex_female        0
Sex_male          0
dtype: int64

In [273]:
finaltrain = finaltrain.iloc[418:1309]
finaltest = finaltrain.iloc[0:417]

In [274]:
log_reg = LogisticRegression()
y = finaltrain['Survived']

In [275]:
x = finaltrain.drop(['Survived'], axis=1)
x = x.drop(['Embarked'], axis=1)
x = x.drop(['Name'], axis=1)
x = x.drop(['Sex'], axis=1)
x = x.drop(['Ticket'], axis=1)
x = x.drop(['Cabin'], axis=1)
x.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Sex_female     0
Sex_male       0
dtype: int64

In [276]:

finaltest = finaltest.drop(['Embarked'], axis=1)
finaltest = finaltest.drop(['Name'], axis=1)
finaltest = finaltest.drop(['Sex'], axis=1)
finaltest = finaltest.drop(['Ticket'], axis=1)
finaltest = finaltest.drop(['Cabin'], axis=1)
finaltest = finaltest.drop(['Survived'], axis=1)
finaltest.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Sex_female     0
Sex_male       0
dtype: int64

In [277]:
log_reg.fit(x,y)

LogisticRegression()

In [301]:
log_regacc = round(log_reg.score(x,y) * 100, 2)

In [279]:
ypred = log_regression.predict(finaltest)

In [281]:
coeff_df = pd.DataFrame(x.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(log_reg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
10,Sex_male,1.98169
7,Embarked_Q,0.66429
9,Sex_female,0.358118
8,Embarked_S,0.116821
5,Fare,0.00223
6,Embarked_C,0.00223
0,Pclass,0.000393
2,SibSp,-0.024057
3,Parch,-0.121622
4,Fare,-0.222555


In [287]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x, y)

Y_prediction = random_forest.predict(finaltest)

random_forest.score(x, y)
acc_random_forest = round(random_forest.score(x, y) * 100, 2)

In [288]:
acc_random_forest

100.0

In [289]:
perceptron = Perceptron(max_iter=3)
perceptron.fit(x, y)

Y_pred = perceptron.predict(finaltest)

acc_perceptron = round(perceptron.score(x, y) * 100, 2)

In [290]:
acc_perceptron

52.75

In [294]:
linear_svc = LinearSVC()
linear_svc.fit(x, y)

Y_pred = linear_svc.predict(finaltest)

acc_linear_svc = round(linear_svc.score(x, y) * 100, 2)

In [295]:
acc_linear_svc

76.43

In [296]:
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(x, y)  

Y_pred = decision_tree.predict(finaltest)  

acc_decision_tree = round(decision_tree.score(x, y) * 100, 2)

In [297]:
acc_decision_tree

100.0

#### Grading the five models based on accuracy it appears that the Random Forest or Decision Tree models would classify the survivors of the Titanic disaster best. Next steps involve looking into these models further to test overfitting as well as further extract feature importance to make models more explainable.

In [302]:
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression', 
              'Random Forest', 'Perceptron', 'Decision Tree'],
    'Score': [acc_linear_svc, log_regacc, 
              acc_random_forest, acc_perceptron, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head()

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
100.0,Random Forest
100.0,Decision Tree
80.13,Logistic Regression
76.43,Support Vector Machines
52.75,Perceptron
