# Predict the Outcome of Titanic Survivors
### Author: Tiernan Lindauer

In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [27]:
def norm(data):
    return data / max(data)

In [28]:
titanic_data: DataFrame = pd.read_csv("data/train.csv")
titanic_data.drop(labels=["PassengerId", "Name", "Cabin", "Ticket"], axis=1, inplace=True)

More efficient way to encode `Sex` and `Embarked` columns

In [29]:
ordinal_encoder = OrdinalEncoder()
sex_cat = titanic_data[["Sex"]]
sex_cat_encoded = ordinal_encoder.fit_transform(sex_cat)
titanic_data[["Sex"]] = sex_cat_encoded

titanic_data = pd.get_dummies(titanic_data , columns=['Embarked'] , prefix='Embarked')
titanic_data["Combo"] = titanic_data["Sex"] + norm(titanic_data["Pclass"]) - 0.1*norm(titanic_data["Fare"])

In [30]:
titanic_data.dropna(inplace=True)
titanic_data.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Combo
0,0,3,1.0,22.0,1,0,7.25,0,0,1,1.998585
1,1,1,0.0,38.0,1,0,71.2833,1,0,0,0.31942
2,1,3,0.0,26.0,0,0,7.925,0,0,1,0.998453
3,1,1,0.0,35.0,1,0,53.1,0,0,1,0.322969
4,0,3,1.0,35.0,0,0,8.05,0,0,1,1.998429
6,0,1,1.0,54.0,0,0,51.8625,0,0,1,1.32321
7,0,3,1.0,2.0,3,1,21.075,0,0,1,1.995886
8,1,3,0.0,27.0,0,2,11.1333,0,0,1,0.997827


In [31]:
corr_matrix: DataFrame = titanic_data.corr()
print(corr_matrix["Survived"].sort_values(ascending=False))

Survived      1.000000
Fare          0.268189
Embarked_C    0.193607
Parch         0.093317
SibSp        -0.017358
Embarked_Q   -0.049549
Age          -0.077221
Embarked_S   -0.164235
Pclass       -0.359653
Sex          -0.538826
Combo        -0.607141
Name: Survived, dtype: float64


In [32]:
X = titanic_data.drop("Survived", axis=1, inplace=False)
y = titanic_data['Survived']

In [43]:
classifier = DecisionTreeClassifier(random_state=42, criterion = 'entropy' , max_depth = 8 , min_samples_split=12)
classifier.fit(X, y)

In [36]:
titanic_data_test = pd.read_csv("data/test.csv")
titanic_data_test.drop(labels=["PassengerId", "Name", "Cabin", "Ticket"], axis=1, inplace=True)

ordinal_encoder = OrdinalEncoder()
sex_cat = titanic_data_test[["Sex"]]
sex_cat_encoded = ordinal_encoder.fit_transform(sex_cat)
titanic_data_test[["Sex"]] = sex_cat_encoded

titanic_data_test["Combo"] = titanic_data_test["Sex"] + norm(titanic_data_test["Pclass"]) - 0.1*norm(titanic_data_test["Fare"])
titanic_data_test = pd.get_dummies(titanic_data_test , columns=['Embarked'] , prefix='Embarked')
titanic_data_test.dropna(inplace=True)
titanic_display_test = pd.read_csv("data/test.csv")

In [37]:
survival_prediction = classifier.predict(titanic_data_test)
survival_prediction_dataframe = pd.DataFrame(survival_prediction, columns=['Survived'])
survival_prediction_dataframe.to_csv("DecisionTreeClassifierOutput.csv")

Feature names must be in the same order as they were in fit.



Score the classifier on the training data: Is not a very accurate reflection due to over/underfitting

In [44]:
classifier.score(X, y)

0.8711484593837535