# Titanic - Machine Learning from Disaster

In [1]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


### Data loading

In [2]:
train = pd.read_csv("../datasets/train.csv")
print("Number of train samples:", len(train))
print("Dataset columns:",list(train.columns))
train_x = train

# Separates labels from data
train_y = train["Survived"]

print("Train X:")
train_x.head()

Number of train samples: 891
Dataset columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
Train X:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print("Train Y:")
train_y.head()

Train Y:


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [4]:
test_x = pd.read_csv("../datasets/test.csv")
print("Number of test samples:", len(test_x))
test_ids = test_x["PassengerId"]

print("Test X:")
test_x.head()

Number of test samples: 418
Test X:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Dataset statistics
For each value of each *basic* categorical feature we compute the percentage of survived.

In [5]:
display(train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())
display(train[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean())
display(train[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean())
display(train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


Unnamed: 0,SibSp,Survived
0,0,0.345395
1,1,0.535885
2,2,0.464286
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


Unnamed: 0,Parch,Survived
0,0,0.343658
1,1,0.550847
2,2,0.5
3,3,0.6
4,4,0.0
5,5,0.2
6,6,0.0


Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


### Removing identifiers from x

In [6]:
columns_to_drop = ["PassengerId", "Ticket"]
train_x = train_x.drop(columns=columns_to_drop)
test_x = test_x.drop(columns=columns_to_drop)

In [7]:
train_x.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S
5,0,3,"Moran, Mr. James",male,,0,0,8.4583,,Q
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,,S
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333,,S
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,,C


### Baseline

Sex:

    -male   -> 0 (not survived) 
    -female -> 1 (survived)

In [8]:
sex = train["Sex"]
prediction = [0 if s == "male" else 1 for s in sex]

count = 0
for p,l in zip (prediction, list(train_y)):
    if p == l:
        count += 1
accuracy = count / len(train) * 100
print("Baseline accuracy: " + str(accuracy) + "%.")

Baseline accuracy: 78.67564534231201%.


### Feature Preprocessing

    - Discretize "Sex"
    - Fill missing values of "Age"
    - Discritize "Cabin" considering only the first letter + manage missing values
    - Discretize "Embarked" + manage missing values
    - Retrieve the title from "Name"

In [9]:
# Feature "Sex" discretization
converter = {"Sex":     {"male": 0, "female": 1}}
train_x = train_x.replace(converter)
test_x = test_x.replace(converter)

In [10]:
train_x.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,,S
5,0,3,"Moran, Mr. James",0,,0,0,8.4583,,Q
6,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,51.8625,E46,S
7,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,21.075,,S
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,11.1333,,S
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,30.0708,,C


In [11]:
# Discretize "Embarked" + manage missing values

#train_x["Embarked"] = train_x["Embarked"].fillna("M")

train_x = pd.get_dummies(train_x, columns=["Embarked"])
test_x = pd.get_dummies(test_x, columns=["Embarked"])

display(train_x[['Embarked_C', 'Survived']].groupby(['Embarked_C'], as_index=False).mean())
display(train_x[['Embarked_Q', 'Survived']].groupby(['Embarked_Q'], as_index=False).mean())
display(train_x[['Embarked_S', 'Survived']].groupby(['Embarked_S'], as_index=False).mean())

Unnamed: 0,Embarked_C,Survived
0,0,0.344398
1,1,0.553571


Unnamed: 0,Embarked_Q,Survived
0,0,0.383292
1,1,0.38961


Unnamed: 0,Embarked_S,Survived
0,0,0.506073
1,1,0.336957


In [12]:
train_x.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,1,0,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,C123,0,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,,0,0,1


In [13]:
# Discritize "Cabin" considering only the first letter + manage missing values
# training set

def discretize_cabin(df):
    cabins = df["Cabin"]
    new_cabins = []
    for c in cabins:
        if type(c).__name__ == "str":
            new_cabins.append(c[0])
        else:
            new_cabins.append(c)
    df["Cabin"] = new_cabins
    df = pd.get_dummies(df, columns=["Cabin"])
    return df

train_x = discretize_cabin(train_x)
test_x = discretize_cabin(test_x)

display(train_x[['Cabin_A', 'Survived']].groupby(['Cabin_A'], as_index=False).mean())
display(train_x[['Cabin_B', 'Survived']].groupby(['Cabin_B'], as_index=False).mean())
display(train_x[['Cabin_C', 'Survived']].groupby(['Cabin_C'], as_index=False).mean())
display(train_x[['Cabin_D', 'Survived']].groupby(['Cabin_D'], as_index=False).mean())
display(train_x[['Cabin_E', 'Survived']].groupby(['Cabin_E'], as_index=False).mean())
display(train_x[['Cabin_F', 'Survived']].groupby(['Cabin_F'], as_index=False).mean())
display(train_x[['Cabin_G', 'Survived']].groupby(['Cabin_G'], as_index=False).mean())
display(train_x[['Cabin_T', 'Survived']].groupby(['Cabin_T'], as_index=False).mean())

Unnamed: 0,Cabin_A,Survived
0,0,0.38242
1,1,0.466667


Unnamed: 0,Cabin_B,Survived
0,0,0.363744
1,1,0.744681


Unnamed: 0,Cabin_C,Survived
0,0,0.36899
1,1,0.59322


Unnamed: 0,Cabin_D,Survived
0,0,0.369464
1,1,0.757576


Unnamed: 0,Cabin_E,Survived
0,0,0.370198
1,1,0.75


Unnamed: 0,Cabin_F,Survived
0,0,0.38041
1,1,0.615385


Unnamed: 0,Cabin_G,Survived
0,0,0.383315
1,1,0.5


Unnamed: 0,Cabin_T,Survived
0,0,0.38427
1,1,0.0


In [14]:
print(train_x.shape)
train_x.head(10)

(891, 19)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,0,1,0,0,0,0,0,0,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,0,0,0,0,1,0,0,0,0,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0,1,0,0,0,0,0,0,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,0,1,0,0,1,0,0,0,0,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,0,1,0,0,0,0,0,0,0,0
5,0,3,"Moran, Mr. James",0,,0,0,8.4583,0,1,0,0,0,0,0,0,0,0,0
6,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,51.8625,0,0,1,0,0,0,0,1,0,0,0
7,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,21.075,0,0,1,0,0,0,0,0,0,0,0
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,11.1333,0,0,1,0,0,0,0,0,0,0,0
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,30.0708,1,0,0,0,0,0,0,0,0,0,0


In [15]:
def add_titles(df):
    names = df["Name"]
    title = []

    for n in names:
        if "Mr." in n:
            title.append("MR")
        elif "Mrs." in n or "Mme" in n:
            title.append("MRS")
        elif "Miss." in n or "Mlle" in n or "Ms" in n:
            title.append("MISS")
        elif "Master." in n:
            title.append("MASTER")
        else:
            title.append(np.nan)

    df = df.drop(columns=["Name"])
    df["Title"] = title
    df = pd.get_dummies(df, columns=["Title"])
    return df

train_x = add_titles(train_x)
test_x = add_titles(test_x)

display(train_x[['Title_MASTER', 'Survived']].groupby(['Title_MASTER'], as_index=False).mean())
display(train_x[['Title_MISS', 'Survived']].groupby(['Title_MISS'], as_index=False).mean())
display(train_x[['Title_MR', 'Survived']].groupby(['Title_MR'], as_index=False).mean())
display(train_x[['Title_MRS', 'Survived']].groupby(['Title_MRS'], as_index=False).mean())

Unnamed: 0,Title_MASTER,Survived
0,0,0.374853
1,1,0.575


Unnamed: 0,Title_MISS,Survived
0,0,0.300283
1,1,0.702703


Unnamed: 0,Title_MR,Survived
0,0,0.697861
1,1,0.156673


Unnamed: 0,Title_MRS,Survived
0,0,0.31634
1,1,0.793651


In [16]:
display(train_x)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Title_MASTER,Title_MISS,Title_MR,Title_MRS
0,0,3,0,22.0,1,0,7.2500,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1,38.0,1,0,71.2833,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,3,1,26.0,0,0,7.9250,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1,35.0,1,0,53.1000,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,3,0,35.0,0,0,8.0500,0,0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0,0,1,...,0,0,0,0,0,0,0,0,0,0
887,1,1,1,19.0,0,0,30.0000,0,0,1,...,0,0,0,0,0,0,0,1,0,0
888,0,3,1,,1,2,23.4500,0,0,1,...,0,0,0,0,0,0,0,1,0,0
889,1,1,0,26.0,0,0,30.0000,1,0,0,...,1,0,0,0,0,0,0,0,1,0


In [17]:
# Fill missing values of "Age"
train_columns = list(train_x.columns)
test_columns = list(test_x.columns)

# train set
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(train_x)
train_x = imp.transform(train_x)
train_x = pd.DataFrame(data=train_x, columns=train_columns)

# test set
# before we add the missng columns (if necesssary)
missing_columns = list(set(train_columns) - set(test_columns))
for c in missing_columns:
    if c != "Survived":
        print("missing Column in test: ", c)
        test_columns.append(c)
        values = [0] * len(test_x)
        test_x[c] = values
    
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(test_x)
test_x = imp.transform(test_x)
test_x = pd.DataFrame(data=test_x, columns=test_columns)

missing Column in test:  Cabin_T


In [18]:
print(train_x.shape)
display(train_x)

(891, 22)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Title_MASTER,Title_MISS,Title_MR,Title_MRS
0,0.0,3.0,0.0,22.000000,1.0,0.0,7.2500,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.0,1.0,38.000000,1.0,0.0,71.2833,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,3.0,1.0,26.000000,0.0,0.0,7.9250,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,1.0,1.0,35.000000,1.0,0.0,53.1000,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,3.0,0.0,35.000000,0.0,0.0,8.0500,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,0.0,27.000000,0.0,0.0,13.0000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,1.0,1.0,1.0,19.000000,0.0,0.0,30.0000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
888,0.0,3.0,1.0,17.955321,1.0,2.0,23.4500,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
889,1.0,1.0,0.0,26.000000,0.0,0.0,30.0000,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Training

In [19]:
# remove labels from the training set
train_x = train_x.drop(columns=["Survived"])

# Data scaling 
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)

In [20]:
# Decision Tree
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf.fit(train_x, train_y)
scores = cross_val_score(clf, train_x, train_y, cv=10)
#fig = plt.figure(figsize=(25,20))
#_ = tree.plot_tree(clf, filled=True)
print("Decision Tree Accuracy: ", np.mean(scores))

Decision Tree Accuracy:  0.8115355805243445


In [21]:
# Support Vector Machine
Cs = [200, 150, 100]
gammas = [0.1, 0.05, 0.01, 0.005, 0.001]

best_accuracy = -1
best_c = 0
best_gamma = 0

for c in Cs:
    for gamma in gammas:
        clf = svm.SVC(C = c, gamma = gamma)
        #clf.fit(train_x, train_y)
        scores = cross_val_score(clf, train_x, train_y, cv=10)
        accuracy = np.mean(scores)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_c = c
            best_gamma = gamma
print("SVM Accuracy: ", best_accuracy, "(C=", best_c, ", gamma=", best_gamma, ")")

SVM Accuracy:  0.828302122347066 (C= 200 , gamma= 0.005 )


In [22]:
# Random Forest
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf.fit(train_x, train_y)
scores = cross_val_score(clf, train_x, train_y, cv=10)
print("Random Forest Accuracy: ", np.mean(scores))

Random Forest Accuracy:  0.8283021223470662


In [23]:
# KNN
weights = ["uniform", "distance"]
Ks = [3, 5, 7, 9]

best_accuracy = -1
best_weight = 0
best_k = 0

for w in weights:
    for k in Ks:
        neigh = KNeighborsClassifier(n_neighbors = k, weights = w)
        scores = cross_val_score(clf, train_x, train_y, cv=10)
        accuracy = np.mean(scores)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_weight = w
            best_k = k
print("KNN Accuracy: ", best_accuracy, "(K=", best_k, ", weight=", best_weight, ")")

KNN Accuracy:  0.8283021223470662 (K= 3 , weight= uniform )


### Test Evaluation

In [24]:
# test set scaling
test_x = scaler.transform(test_x)

# labl prediction on test set using Ranodom Forest
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)

output_df = pd.DataFrame()
output_df["PassengerId"] = test_ids
output_df["Survived"] = predictions

output_df.to_csv('../outputs/output_v1.csv', index=False)  