# Titanic Survival Prediction

## 1 &ensp; Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

import xgboost as xgb


## 2 &ensp; EDA

Read data

In [2]:
# Load the training data from disk.
df = pd.read_csv("data/train.csv")

# Print the dimensions and the column index, then preview the first rows.
print(df.shape, df.columns, sep="\n")
df.head()

(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Split data into features and target labels

In [3]:
# "Survived" is the target column; every other column stays in the feature frame.
y = df["Survived"]
X = df.drop(columns=["Survived"])

print(X.shape, y.shape, sep="\n")

(891, 11)
(891,)


## 3 &ensp; Preprocess

Break categorical features down to binary variables

In [4]:
def encode_categorical(X, features):
    """One-hot encode the given categorical columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : dict[str, list[str]]
        Maps each column to encode to the names given to the indicator
        columns that replace it. The names are assigned positionally to the
        encoder's output columns, so the list length must match the number
        of columns the encoder produces for that feature.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended and the original
        categorical columns removed. ``X`` itself is not modified.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')

    for feature, sub_columns in features.items():
        # The encoder is re-fit on each column of whatever frame is passed in.
        encoded = encoder.fit_transform(X[[feature]]).toarray()
        # Reuse X's index: DataFrame.join aligns on index labels, so a fresh
        # RangeIndex would yield NaNs (or mismatched rows) whenever X is not
        # indexed 0..n-1, e.g. after filtering or a train/test split.
        X = X.join(pd.DataFrame(data=encoded, columns=sub_columns, index=X.index))

    return X.drop(columns=list(features))

Scale continuous features

In [5]:
def scale_continuous(X, features):
    """Standardise the given continuous columns and return the result.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : list[str]
        Names of the columns to pass through ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which the listed columns are replaced by their
        scaled values; all other columns are unchanged.
    """
    # Work on a copy so the caller's frame is not silently overwritten.
    scaled = X.copy()

    # The scaler is fit on the frame passed in, so the statistics used are
    # those of this particular frame.
    scaler = StandardScaler()
    scaled[features] = scaler.fit_transform(scaled[features])

    return scaled

Drop irrelevant columns

In [7]:
def drop_irrelevant(X, features):
    """Return a new frame equal to ``X`` without the columns in ``features``.

    ``X`` is left untouched; a column name that is not present raises
    ``KeyError`` (the default behaviour of ``DataFrame.drop``).
    """
    trimmed = X.drop(labels=features, axis="columns")
    return trimmed

Preprocess data

In [8]:
# Columns removed from the feature frame at the end of preprocessing.
irrelevant_features = [
    "Age",
    "Name",
    "Cabin",
    "PassengerId",
    "Embarked",
    "Ticket",
]

# Numeric columns that are standardised.
continuous_features = ["SibSp", "Parch", "Fare"]

# Categorical column -> names of the indicator columns that replace it.
categorical_features = {
    "Pclass": ["c1", "c2", "c3"],
    "Sex": ["Female", "Male"],
}


def preprocess_data(X):
    """Run the full preprocessing pipeline on a raw feature frame.

    The steps run in a fixed order: one-hot encode the categorical columns,
    standardise the continuous columns, then drop the unused columns. The
    column lists come from the module-level configuration above.
    """
    return (
        X.pipe(encode_categorical, categorical_features)
         .pipe(scale_continuous, continuous_features)
         .pipe(drop_irrelevant, irrelevant_features)
    )


# Rebinds X to the processed frame. Re-running this cell on its own fails
# because the raw categorical columns are gone; re-run the split cell first.
X = preprocess_data(X)

View preprocessed data

In [9]:
# Preview the first ten rows of the processed feature frame.
X.head(n=10)

Unnamed: 0,SibSp,Parch,Fare,c1,c2,c3,Female,Male
0,0.432793,-0.473674,-0.502445,0.0,0.0,1.0,0.0,1.0
1,0.432793,-0.473674,0.786845,1.0,0.0,0.0,1.0,0.0
2,-0.474545,-0.473674,-0.488854,0.0,0.0,1.0,1.0,0.0
3,0.432793,-0.473674,0.42073,1.0,0.0,0.0,1.0,0.0
4,-0.474545,-0.473674,-0.486337,0.0,0.0,1.0,0.0,1.0
5,-0.474545,-0.473674,-0.478116,0.0,0.0,1.0,0.0,1.0
6,-0.474545,-0.473674,0.395814,1.0,0.0,0.0,0.0,1.0
7,2.24747,0.76763,-0.224083,0.0,0.0,1.0,0.0,1.0
8,-0.474545,2.008933,-0.424256,0.0,0.0,1.0,1.0,0.0
9,0.432793,-0.473674,-0.042956,0.0,1.0,0.0,1.0,0.0


Split data into train set and validation set

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 4 &ensp; Model

Decision tree

In [13]:
# Decision tree limited to depth 5.
# random_state is pinned (same seed as the train/validation split) because
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree, and therefore the score below, can differ between runs.
tree_model = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_model.fit(X_train, y_train)

# Accuracy on the held-out validation set.
tree_model.score(X_val, y_val)

0.8100558659217877

XGBoost

In [17]:
# Gradient-boosted classifier with the library's default hyper-parameters.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

# Accuracy on the held-out validation set.
xgb_model.score(X_val, y_val)

0.8212290502793296

## 5 &ensp; Evaluation

In [24]:
X_test = pd.read_csv('data/test.csv')

# NOTE(review): preprocess_data fits a fresh encoder and scaler on whatever
# frame it receives, so the test features are standardised with test-set
# statistics rather than those of the training data. Consider reusing the
# transformers fitted on the training set.
predictions = xgb_model.predict(preprocess_data(X_test))

# One row per passenger: the id from the test file and the predicted label.
submission = pd.DataFrame({
    "PassengerId": X_test["PassengerId"],
    "Survived": predictions,
})
submission.to_csv("data/submission.csv", index=False)