# Load data

pip install pandas  
pip install jupyterlab  
pip install plotly  
pip install seaborn  
pip install scikit-learn  

In [None]:
# Pandas >= 1.0 required
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
from matplotlib import cm

In [None]:
# Read the training CSV, loading the free-text / categorical columns with
# pandas' 'string' extension dtype, then show the resulting column dtypes.
train_text_dtypes = dict.fromkeys(
    ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], 'string')
train_data = pd.read_csv('../data/titanic/train.csv', dtype=train_text_dtypes)
train_data.dtypes

In [None]:
# Read the test CSV with the same 'string' dtype mapping used for the
# training file, then show the resulting column dtypes.
test_text_dtypes = dict.fromkeys(
    ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], 'string')
test_data = pd.read_csv('../data/titanic/test.csv', dtype=test_text_dtypes)
test_data.dtypes

Interesting fact: according to Wikipedia, the RMS Titanic also carried about 900 crew members.

In [None]:
# Fraction of training-set passengers who survived.
survived_mask = train_data['Survived'].eq(True)
perished_mask = train_data['Survived'].eq(False)
survived = train_data[survived_mask]
perished = train_data[perished_mask]
n_survived, n_perished = len(survived), len(perished)
n_survived / (n_survived + n_perished)

# Some stats

In [None]:
# Number of missing values per column in the training set
train_data.isna().sum(axis='index')

In [None]:
# Number of missing values per column in the test set
test_data.isna().sum(axis='index')

# Data manipulation

In [None]:
# Replace missing ages with the median Age of the respective data set.
# The medians are computed from the data rather than hard-coded (the previous
# constants were 28.0 for train and 27.0 for test), so the cell stays correct
# if the input files change. Series.median() skips missing values, and
# re-running the cell is a no-op because no NaN ages remain after the fill.
train_age_median = train_data['Age'].median()
test_age_median = test_data['Age'].median()
train_data = train_data.fillna(value={'Age': train_age_median})
test_data = test_data.fillna(value={'Age': test_age_median})

In [None]:
# Missing 'Embarked' values in the training set are replaced with 'S',
# the most frequent origin.
embarked_fill = {'Embarked': 'S'}
train_data = train_data.fillna(embarked_fill)

In [None]:
# Missing 'Fare' values in the test set are replaced with 14.45,
# the median fare of the test data.
fare_fill = {'Fare': 14.45}
test_data = test_data.fillna(fare_fill)

In [None]:
def _first_char_or_x(cabin):
    """Return the first character of a cabin value, or 'X' when it is missing."""
    if pd.isna(cabin):
        return 'X'
    return cabin[0]


# Reduce every Cabin value to its first character (presumably the deck
# letter - TODO confirm); missing cabins become the placeholder 'X'.
train_data['Cabin'] = train_data['Cabin'].apply(_first_char_or_x)
test_data['Cabin'] = test_data['Cabin'].apply(_first_char_or_x)

In [None]:
# Re-check the per-column missing-value counts in the training set after the fills
train_data.isna().sum(axis='index')

In [None]:
# Re-check the per-column missing-value counts in the test set after the fills
test_data.isna().sum(axis='index')

## Pclass

In [None]:
# Share of each Pclass value in the training set.
fig = px.pie(
    data_frame=train_data,
    names='Pclass',
    title='Training data Pclass',
)
fig.show()

In [None]:
# Share of each Pclass value in the test set.
fig = px.pie(
    data_frame=test_data,
    names='Pclass',
    title='Testing data Pclass',
)
fig.show()

The distribution of Pclass is similar in the training and test data, so it makes sense to use it as a feature.

## Sex

In [None]:
# Share of each Sex value in the training set.
fig = px.pie(
    data_frame=train_data,
    names='Sex',
    title='Training data Sex',
)
fig.show()

In [None]:
# Share of each Sex value in the test set.
fig = px.pie(
    data_frame=test_data,
    names='Sex',
    title='Testing data Sex',
)
fig.show()

The distribution of Sex is similar in the training and test data, so it makes sense to use it as a feature.

## Age

In [None]:
# Age distribution in the training set.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of figure.
fig, ax = plt.subplots()
sns.histplot(train_data['Age'], kde=True, stat='density', ax=ax)
ax.set_title('Training data Age')
plt.show()

In [None]:
# Age distribution in the test set.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of figure.
fig, ax = plt.subplots()
sns.histplot(test_data['Age'], kde=True, stat='density', ax=ax)
ax.set_title('Testing data Age')
plt.show()

## Number of siblings / spouses aboard the Titanic (SibSp)

In [None]:
# SibSp distribution in the training set.
# sns.distplot is deprecated; histplot replaces it. SibSp is a small integer
# count, so one bar per value (discrete=True) is used instead of a KDE curve,
# which would smooth between values that cannot occur.
fig, ax = plt.subplots()
sns.histplot(train_data['SibSp'], discrete=True, stat='density', ax=ax)
ax.set_title('Training data SibSp')
plt.show()

In [None]:
# SibSp distribution in the test set.
# sns.distplot is deprecated; histplot replaces it. SibSp is a small integer
# count, so one bar per value (discrete=True) is used instead of a KDE curve,
# which would smooth between values that cannot occur.
fig, ax = plt.subplots()
sns.histplot(test_data['SibSp'], discrete=True, stat='density', ax=ax)
ax.set_title('Testing data SibSp')
plt.show()

## Number of parents / children aboard the Titanic (Parch)

In [None]:
# Parch distribution in the training set.
# sns.distplot is deprecated; histplot replaces it. Parch is a small integer
# count, so one bar per value (discrete=True) is used instead of a KDE curve,
# which would smooth between values that cannot occur.
fig, ax = plt.subplots()
sns.histplot(train_data['Parch'], discrete=True, stat='density', ax=ax)
ax.set_title('Training data Parch')
plt.show()

In [None]:
# Parch distribution in the test set.
# sns.distplot is deprecated; histplot replaces it. Parch is a small integer
# count, so one bar per value (discrete=True) is used instead of a KDE curve,
# which would smooth between values that cannot occur.
fig, ax = plt.subplots()
sns.histplot(test_data['Parch'], discrete=True, stat='density', ax=ax)
ax.set_title('Testing data Parch')
plt.show()

## Cabin

In [None]:
# Share of each Cabin value in the training set.
fig = px.pie(
    data_frame=train_data,
    names='Cabin',
    title='Training data Cabin',
)
fig.show()

In [None]:
# Share of each Cabin value in the test set.
fig = px.pie(
    data_frame=test_data,
    names='Cabin',
    title='Test data Cabin',
)
fig.show()

## Split train_data into training and validation sets for learning

In [None]:
# Hold out 10% of train_data for validation: test_size=0.1 overrides
# train_test_split's default split. random_state=0 fixes the shuffle so the
# same rows are held out on every run.
split_features = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']]
split_target = train_data[['Survived']]
X_train, X_test, y_train, y_test = train_test_split(
    split_features,
    split_target,
    test_size=0.1,
    random_state=0,
)

In [None]:
# This plot is not particularly useful
#g = sns.pairplot(pd.concat([X_train,y_train],axis=1), hue='Survived', palette="husl")

In [None]:
# Pairplot for the survived
# This plot is not particularly useful
#c = pd.concat([X_train,y_train],axis=1)
#g = sns.pairplot(c[c.Survived==True])

In [None]:
# Pairplot for the perished
# This plot is not particularly useful
#c = pd.concat([X_train,y_train],axis=1)
#g = sns.pairplot(c[c.Survived==False])

In [None]:
from sklearn import preprocessing

# Ordinal-encode the two categorical features, learning the categories from
# the training split only. By default the encoder raises on a category it did
# not see during fit, which happens when a rare Cabin letter lands only in the
# held-out rows; unseen categories are therefore mapped to -1.
enc = preprocessing.OrdinalEncoder(
    handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(X_train[['Sex','Cabin']])

In [None]:
# Encode into a copy so the un-encoded X_train stays available.
X_train_enc = X_train.copy(deep=True)

In [None]:
# Overwrite the categorical columns of the copy with their ordinal codes.
train_cat_cols = ['Sex', 'Cabin']
X_train_enc[train_cat_cols] = enc.transform(X_train_enc[train_cat_cols])

In [None]:
# Encode into a copy so the un-encoded X_test stays available.
X_test_enc = X_test.copy(deep=True)

In [None]:
# Apply the encoder fitted on the training split to the held-out rows.
heldout_cat_cols = ['Sex', 'Cabin']
X_test_enc[heldout_cat_cols] = enc.transform(X_test_enc[heldout_cat_cols])

In [None]:
from sklearn import tree

# Baseline model: a single decision tree. random_state is fixed (matching the
# seed used for the train/validation split) so the fitted tree and its score
# are reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the classifier on the encoded training split.
clf.fit(X=X_train_enc, y=y_train)

In [None]:
# Score the fitted classifier on the held-out rows.
clf.score(X=X_test_enc, y=y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (4 trees). random_state is fixed (matching the seed used
# for the train/validation split) so the validation score is reproducible.
clf = RandomForestClassifier(n_estimators=4, random_state=0)

In [None]:
# Fit on the training split; the single-column target frame is flattened
# to a 1-D array first.
y_train_flat = y_train.to_numpy().ravel()
clf.fit(X_train_enc, y_train_flat)

In [None]:
# Score on the held-out rows, flattening the single-column target frame
# to a 1-D array first.
y_test_flat = y_test.to_numpy().ravel()
clf.score(X_test_enc, y_test_flat)

## Train the model on the whole training set

In [None]:
# Features and target taken from the complete training set (no hold-out).
final_feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']
final_target_cols = ['Survived']
X_train_final = train_data[final_feature_cols]
y_train_final = train_data[final_target_cols]

In [None]:
# Re-fit the ordinal encoder on the complete training set. By default the
# encoder raises on a category it did not see during fit; mapping unseen
# categories to -1 keeps the later transform of the test set from failing if
# it contains a Sex or Cabin value absent from the training data.
enc = preprocessing.OrdinalEncoder(
    handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(X_train_final[['Sex','Cabin']])

In [None]:
# Encode into a copy so the un-encoded X_train_final stays available.
X_train_final_enc = X_train_final.copy(deep=True)

In [None]:
# Overwrite the categorical columns of the copy with their ordinal codes.
final_cat_cols = ['Sex', 'Cabin']
X_train_final_enc[final_cat_cols] = enc.transform(X_train_final_enc[final_cat_cols])

In [None]:
# Final model: a 10-tree random forest. random_state is fixed so that the
# predictions written to the submission file are reproducible across runs.
clf = RandomForestClassifier(n_estimators=10, random_state=0)

In [None]:
# Fit on the complete training set; the single-column target frame is
# flattened to a 1-D array first.
y_train_final_flat = y_train_final.to_numpy().ravel()
clf.fit(X_train_final_enc, y_train_final_flat)

In [None]:
# Select from the test set the same feature columns the model was trained on.
test_feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']
X_test_final = test_data[test_feature_cols]

In [None]:
# Encode into a copy so the un-encoded X_test_final stays available.
X_test_final_enc = X_test_final.copy(deep=True)

In [None]:
# Apply the already-fitted encoder to the test set's categorical columns.
test_cat_cols = ['Sex', 'Cabin']
X_test_final_enc[test_cat_cols] = enc.transform(X_test_final_enc[test_cat_cols])

In [None]:
# Preview the predicted labels for the test set as a Series.
preview_predictions = clf.predict(X_test_final_enc)
pd.Series(preview_predictions)

In [None]:
# Build the submission table directly with its final column names.
# Constructing the frame from a dict attaches the prediction array to
# test_data's own index position by position, so the result does not depend
# on pd.concat aligning two separately indexed objects and never carries an
# integer column label.
res = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': clf.predict(X_test_final_enc),
})

In [None]:
# Label the two columns of the result table.
submission_columns = ['PassengerId', 'Survived']
res.columns = submission_columns

In [None]:
# Display the result table (columns PassengerId and Survived) as the cell output.
res

In [None]:
# Write the result table to CSV in the working directory, without the index column.
res.to_csv(path_or_buf='titanic_out.csv', index=False)