In [1]:
# Import all the libraries I need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ignore Deprecation Warning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import GridSearchCV

import keras 
from keras.models import Sequential # intitialize the ANN
from keras.layers import Dense      # create layers


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [46]:

# load the data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Stack train on top of test so both parts receive identical preprocessing.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. sort=False keeps the column order of df_train.
df = pd.concat([df_train, df_test], ignore_index=True, sort=False)

# some quick inspections
df_train.shape, df_test.shape, df_train.columns.values

((891, 12),
 (418, 11),
 array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object))

## Cleaning

In [47]:
def preprocess(df, random_state=None):
    """Turn the raw, stacked passenger table into model-ready features.

    Steps, in order: extract a title from ``Name`` and one-hot encode it,
    encode ``Sex`` as 0/1, build a capped ``Family`` size, reduce ``Ticket``
    to its first character, fill missing ``Fare`` and bin it into quintiles,
    impute missing ``Age`` with a random forest and bin it, one-hot encode
    ``Ticket``, and finally drop the raw columns that were replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Name, Sex, Age, SibSp, Parch, Ticket, Fare,
        Cabin, Embarked and Pclass. Any other column passes through
        unchanged. The input frame is not modified.
    random_state : int or None, optional
        Seed forwarded to the age-imputation ``RandomForestRegressor``.
        The default ``None`` leaves the forest unseeded.

    Returns
    -------
    pandas.DataFrame
        A new frame in which Name, Cabin, Embarked, SibSp, Parch, Age, Fare,
        Title and Ticket are replaced by Master/Miss/Mr/Mrs/Others, Family,
        Fare-bin, Age-bin and one ``Ticket_<char>`` column per ticket group.
    """
    # Work on a copy so the caller's frame survives and the call can be repeated.
    df = df.copy()

    main_titles = ['Master', 'Miss', 'Mr', 'Mrs']
    title_columns = main_titles + ['Others']

    # "Braund, Mr. Owen Harris" -> "Mr"
    df['Title'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace(['Mme', 'Lady', 'Ms'], 'Mrs')
    # Assign through df.loc (not df.Title.loc[...]) so this is not a chained
    # assignment that may write to a temporary copy.
    df.loc[~df['Title'].isin(main_titles), 'Title'] = 'Others'

    # Reindex so all five title columns exist even if a category is absent
    # from the data; uint8 keeps 0/1 integers regardless of pandas' default.
    title_dummies = (
        pd.get_dummies(df['Title'], dtype='uint8')
        .reindex(columns=title_columns, fill_value=0)
    )
    df = pd.concat([df, title_dummies], axis=1).drop(labels=['Name'], axis=1)

    # map the two genders to 0 and 1
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # create a new feature "Family"; sizes above 4 are collapsed into group 0
    family_size = df['SibSp'] + df['Parch'] + 1
    df['Family'] = family_size.mask(family_size > 4, 0)

    # keep only the first character of the ticket string
    df['Ticket'] = df['Ticket'].str[0]

    # Fill missing fares with the median fare of third-class passengers who
    # embarked at 'S' with a ticket starting with '3'; fall back to the
    # overall median if that group has no known fare.
    fare_peers = (df['Ticket'] == '3') & (df['Pclass'] == 3) & (df['Embarked'] == 'S')
    guess_fare = df.loc[fare_peers, 'Fare'].median()
    if pd.isna(guess_fare):
        guess_fare = df['Fare'].median()
    df['Fare'] = df['Fare'].fillna(guess_fare)

    # bin Fare into five intervals with equal amount of people
    df['Fare-bin'] = pd.qcut(df['Fare'], 5, labels=[1, 2, 3, 4, 5]).astype(int)

    # Impute missing ages from the title dummies, fare bin and SibSp.
    # notice that instead of using Title, we use its corresponding dummy variables
    age_features = title_columns + ['Fare-bin', 'SibSp']
    missing_age = df['Age'].isna()
    if missing_age.any():
        # Only fit when there is something to predict; predicting on an
        # empty frame would raise.
        known_age = ~missing_age
        regressor = RandomForestRegressor(n_estimators=300, random_state=random_state)
        regressor.fit(df.loc[known_age, age_features], df.loc[known_age, 'Age'])
        predicted_age = regressor.predict(df.loc[missing_age, age_features])
        df.loc[missing_age, 'Age'] = np.round(predicted_age, 1)

    bins = [0, 4, 12, 18, 30, 50, 65, 100]  # This is somewhat arbitrary...
    age_index = [1, 2, 3, 4, 5, 6, 7]
    # ('baby','child','teenager','young','mid-age','over-50','senior')
    df['Age-bin'] = pd.cut(df['Age'], bins, labels=age_index).astype(int)

    # merge the listed ticket prefixes into group '4'
    df['Ticket'] = df['Ticket'].replace(['A', 'W', 'F', 'L', '5', '6', '7', '8', '9'], '4')

    # Cabin and Embarked are not used as features
    df = df.drop(labels=['Cabin', 'Embarked'], axis=1)

    # dummy encoding
    df = pd.get_dummies(df, columns=['Ticket'], dtype='uint8')

    df = df.drop(labels=['SibSp', 'Parch', 'Age', 'Fare', 'Title'], axis=1)
    return df

In [64]:
# Build the cleaned frame from the raw train/test tables. No other visible
# cell calls preprocess, so without this line `df` would still hold raw data
# after Restart & Run All. Starting from df_train/df_test (rather than `df`)
# keeps the cell safe to re-run.
df = preprocess(pd.concat([df_train, df_test], ignore_index=True, sort=False))
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Master,Miss,Mr,Mrs,Others,Family,Fare-bin,Age-bin,Ticket_1,Ticket_2,Ticket_3,Ticket_4,Ticket_C,Ticket_P,Ticket_S
0,1,0.0,3,0,0,0,1,0,0,2,1,4,0,0,0,1,0,0,0
1,2,1.0,1,1,0,0,0,1,0,2,5,5,0,0,0,0,0,1,0
2,3,1.0,3,1,0,1,0,0,0,1,2,4,0,0,0,0,0,0,1
3,4,1.0,1,1,0,0,0,1,0,2,5,5,1,0,0,0,0,0,0
4,5,0.0,3,0,0,0,1,0,0,1,2,5,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,0,0,0,1,0,0,1,2,4,0,0,0,1,0,0,0
1305,1306,,1,1,0,0,0,0,1,1,5,5,0,0,0,0,0,1,0
1306,1307,,3,0,0,0,1,0,0,1,1,5,0,0,0,0,0,0,1
1307,1308,,3,0,0,0,1,0,0,1,2,4,0,0,1,0,0,0,0


In [33]:
# Persist the cleaned train+test frame to disk
df.to_csv('fullData_clean.csv')
# Number of columns in the cleaned frame (shown as the cell output)
df.shape[1]

19

## Modeling and Prediction
Now we can drop the features we don't need and split the data into training and test sets

In [None]:

# df is df_train stacked on top of df_test, so the first len(df_train) rows
# are the labelled training rows (replaces the hard-coded 891).
n_train = len(df_train)
model_features = df.drop(['Survived', 'PassengerId'], axis=1)

y_train = df[:n_train]['Survived'].values
# Cast to float32 so mixed int/bool feature columns give a numeric matrix
# rather than an object array.
X_train = model_features[:n_train].values.astype('float32')
X_test  = model_features[n_train:].values.astype('float32')

(09/12/2017 update) Using a neural network gives better results than XGBoost or Random Forest.

In [None]:
# Initialising the NN
def getModel(input_dim=17):
    """Build and compile the feed-forward binary classifier.

    The network has three hidden ReLU layers (9, 9 and 5 units) followed by
    a single sigmoid output unit, and is compiled with the Adam optimizer,
    binary cross-entropy loss and accuracy as the reported metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 17, the value previously
        hard-coded here; pass ``X_train.shape[1]`` if the feature set changes.

    Returns
    -------
    A compiled, untrained Sequential model.
    """
    model = Sequential()

    # layers
    model.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu', input_dim = input_dim))
    model.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu'))
    model.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu'))
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

    # Compiling the ANN
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

# Train the ANN
# Instantiate the network first: no visible cell assigns `model`, so calling
# model.fit directly raises NameError on a fresh kernel.
model = getModel()
model.fit(X_train, y_train, batch_size = 32, epochs = 200)

We can now generate the predictions. I got a public score of 0.81339 using the output from my laptop (Python 2.7), which differs from what is generated here.

In [None]:
# Sigmoid outputs of the network for every test row
y_pred = model.predict(X_test)

# Threshold at 0.5 and flatten to one 0/1 label per test row
n_test = X_test.shape[0]
y_final = (y_pred > 0.5).astype(int).reshape(n_test)

# Assemble the submission table and write it without the index column
submission_columns = {'PassengerId': df_test['PassengerId'], 'Survived': y_final}
output = pd.DataFrame(submission_columns)
output.to_csv('prediction-ann.csv', index=False)