# Titanic

## Prepare the notebook

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K

sess = tf.InteractiveSession()
K.set_session(sess)

Using TensorFlow backend.


In [3]:
def save_result(filename, predictions):
    with open('output/titanic_{}.csv'.format(filename), 'w') as fd:
        print('PassengerId,Survived', file=fd)
        for passenger, survived in predictions:
            print("{},{}".format(passenger, int(survived)), file=fd)

## Load the data

The data should be downloaded from the following links and placed in the `./datasets` folder with the given name:

- [titanic_train.csv](https://www.kaggle.com/c/titanic/download/train.csv)
- [titanic_test.csv](https://www.kaggle.com/c/titanic/download/test.csv)

Read the data:

In [171]:
train_csv = pd.read_csv('./datasets/titanic_train.csv')
test_csv = pd.read_csv('./datasets/titanic_test.csv')
all_csv = pd.concat([train_csv, test_csv])
data = all_csv.copy()

Function to split train set into a train and validation set:

In [113]:
def split_train_validation_test(data, valid_ration=0.2):
    # Split train/test
    train = data.iloc[[not pd.isnull(v) for v in data['Survived']]].copy()
    test = data.iloc[[pd.isnull(v) for v in data['Survived']]].copy()
    # Split train/validation
    valid_ids = np.random.randint(0, len(train), int(len(train) * valid_ration))
    train = data.iloc[[i for i in range(len(train)) if i not in valid_ids]].copy()
    valid = data.iloc[valid_ids].copy()
    # Split data/labels
    train_data = train.ix[:, train.columns != 'Survived'].copy()
    train_labels = train[['Survived']].as_matrix().ravel()
    valid_data = valid.ix[:, train.columns != 'Survived'].copy()
    valid_labels = valid[['Survived']].as_matrix().ravel()
    # Remove label from test data
    test_data = test.drop('Survived', 1)
    return train_data, train_labels, valid_data, valid_labels, test_data

In [114]:
data[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
5,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0.0,330877
6,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0.0,17463
7,2.0,,S,21.075,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0.0,349909
8,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1.0,347742
9,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1.0,237736


## Preprocessing

Extract manually features:

In [172]:
import re

data['LastName'] = [v.split(',')[0] for v in data['Name']]
data['CabinLetter'] = [re.sub(r'\d', '', v) if not pd.isnull(v) else np.nan for v in data['Cabin']]
data['CabinNumber'] = [int('0'+re.sub(r'\D', '', v)) if not pd.isnull(v) else np.nan for v in data['Cabin']]

Remove specific columns:

In [173]:
columns_to_remove = ['PassengerId', 'Name']
for column in columns_to_remove:
    if column in data:
        data.drop(column, 1, inplace=True)

In [174]:
from sklearn.preprocessing import LabelEncoder

columns_to_encode = ['Sex', 'Embarked', 'Ticket', 'Cabin', 'CabinLetter', 'LastName']

for column in columns_to_encode:
    # Prepare a label encoder
    le = LabelEncoder()
    # Replace NaN by string equivalent
    data[column].fillna('NAN', inplace=True)
    # Fit the label encoder
    le.fit([v for v in data[column] if v])
    # Replace the column
    data[column] = le.transform(data[column])

In [175]:
columns_to_onehot = ['Sex', 'Ticket', 'LastName', 'CabinLetter', 'CabinNumber', 'Embarked']

for column in columns_to_onehot:
    onehot = pd.get_dummies(data[column], column)
    data = pd.concat([data, onehot], axis=1, join_axes=[data.index])
    data.drop(column, axis=1)

In [176]:
data[:10]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Columns: 1955 entries, Age to Embarked_3
dtypes: float64(4), int64(9), uint8(1942)
memory usage: 2.6 MB


In [177]:
train_data, train_labels, valid_data, valid_labels, test_data = split_train_validation_test(data)

In [178]:
from sklearn.preprocessing import Imputer

imputer = Imputer()
imputer.fit(train_data)
imputer.fit(valid_data)
X = imputer.transform(train_data)
V = imputer.transform(valid_data)
T = imputer.transform(test_data)

## Machine learning models

### Random Forest

Build and train the model:

In [179]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Evaluate the model:

In [180]:
from sklearn.metrics import accuracy_score

pred = rf.predict(V)
accuracy_score(valid_labels, pred)

0.8202247191011236

Train with validation, predict test set and save results:

In [181]:
rf.fit(V, valid_labels)
pred = rf.predict(T)
save_result('random_forest', zip(test_csv['PassengerId'], pred))

### AdaBoost

In [182]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X, train_labels)
pred = ada.predict(V)
accuracy_score(valid_labels, pred)

0.8370786516853933

In [183]:
ada.fit(V, valid_labels)
pred = ada.predict(T)
save_result('adaboost', zip(test_csv['PassengerId'], pred))

### Neural Network

In [184]:
y_ = pd.get_dummies(train_labels).as_matrix()
yv_ = pd.get_dummies(valid_labels).as_matrix()

In [185]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.normalization import BatchNormalization

model = Sequential()
model.add(BatchNormalization(input_shape=(1953,)))
model.add(Dense(1024))
model.add(BatchNormalization())
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Dense(2))
model.add(Activation('softmax'))

In [186]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
batchnormalization_13 (BatchNorm (None, 1953)          7812        batchnormalization_input_5[0][0] 
____________________________________________________________________________________________________
dense_13 (Dense)                 (None, 1024)          2000896     batchnormalization_13[0][0]      
____________________________________________________________________________________________________
batchnormalization_14 (BatchNorm (None, 1024)          4096        dense_13[0][0]                   
____________________________________________________________________________________________________
dense_14 (Dense)                 (None, 256)           262400      batchnormalization_14[0][0]      
___________________________________________________________________________________________

In [187]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [188]:
with tf.device('/gpu:0'):
    model.fit(X, y_, validation_data=(V, yv_))

ValueError: Error when checking model input: expected batchnormalization_input_5 to have shape (None, 1953) but got array with shape (731, 1954)

In [None]:
with tf.device('/gpu:0'):
    pred = model.predict_classes(V)
    
print(accuracy_score(pred, valid_labels))

In [None]:
with tf.device('/gpu:0'):
    model.fit(V, yv_)
    pred = model.predict_classes(T)
    
save_result('ann', zip(test_csv['PassengerId'], pred))

### Decision tree

In [210]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

dtc = DecisionTreeClassifier()

parameter_grid = {
                 'criterion': ['gini','entropy'],
                 'splitter': ['best','random'],
                 'max_features': [None, 'auto', 'sqrt', 'log2'],
                 }

cross_validation = StratifiedKFold(train_labels, n_folds=5)

grid_search = GridSearchCV(dtc,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(X, train_labels)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.8166894664842681
Best parameters: {'splitter': 'best', 'criterion': 'gini', 'max_features': None}


In [204]:
model.fit(X, train_labels) 
pred = model.predict(V)
accuracy_score(valid_labels, pred)

0.8539325842696629

In [205]:
model.fit(V, valid_labels)
pred = ada.predict(T)
save_result('last', zip(test_csv['PassengerId'], pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')