In [114]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

In [115]:

raw_train = pd.read_csv('../train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('../test.csv', index_col=0)
raw_test['is_test'] = 1


In [116]:
all_data = pd.concat((raw_train, raw_test), axis=0)

In [117]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [118]:
proc_data.head()
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [119]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [120]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=200, verbose=2)

Epoch 1/200
 - 1s - loss: 544.0146
Epoch 2/200
 - 0s - loss: 231.9199
Epoch 3/200
 - 0s - loss: 217.0381
Epoch 4/200
 - 0s - loss: 199.1561
Epoch 5/200
 - 0s - loss: 191.6565
Epoch 6/200
 - 0s - loss: 184.6117
Epoch 7/200
 - 0s - loss: 175.0564
Epoch 8/200
 - 0s - loss: 167.1187
Epoch 9/200
 - 0s - loss: 174.8353
Epoch 10/200
 - 0s - loss: 152.0505
Epoch 11/200
 - 0s - loss: 170.4900
Epoch 12/200
 - 0s - loss: 155.5197
Epoch 13/200
 - 0s - loss: 143.4313
Epoch 14/200
 - 0s - loss: 147.4448
Epoch 15/200
 - 0s - loss: 145.4654
Epoch 16/200
 - 0s - loss: 143.1226
Epoch 17/200
 - 0s - loss: 145.0243
Epoch 18/200
 - 0s - loss: 144.0261
Epoch 19/200
 - 0s - loss: 143.8194
Epoch 20/200
 - 0s - loss: 144.6121
Epoch 21/200
 - 0s - loss: 132.0100
Epoch 22/200
 - 0s - loss: 135.4638
Epoch 23/200
 - 0s - loss: 141.8305
Epoch 24/200
 - 0s - loss: 131.5050
Epoch 25/200
 - 0s - loss: 130.6822
Epoch 26/200
 - 0s - loss: 132.4122
Epoch 27/200
 - 0s - loss: 131.2360
Epoch 28/200
 - 0s - loss: 123.5163
E

<keras.callbacks.History at 0x1a2f337c18>

In [121]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,,0,1,1.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [122]:
np.shape(train_data['Age'].loc[train_data['Age'].isnull()].reshape(177,1))

  """Entry point for launching an IPython kernel.


(177, 1)

In [130]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p.reshape(177,)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [132]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p.reshape(86,)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [133]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [136]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [137]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [138]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [139]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
 - 2s - loss: 0.6788 - acc: 0.6139
Epoch 2/500
 - 0s - loss: 0.6672 - acc: 0.6162
Epoch 3/500
 - 0s - loss: 0.6662 - acc: 0.6162
Epoch 4/500
 - 0s - loss: 0.6304 - acc: 0.6162
Epoch 5/500
 - 0s - loss: 0.5658 - acc: 0.7104
Epoch 6/500
 - 0s - loss: 0.5969 - acc: 0.7868
Epoch 7/500
 - 0s - loss: 0.6569 - acc: 0.6588
Epoch 8/500
 - 0s - loss: 0.5399 - acc: 0.7767
Epoch 9/500
 - 0s - loss: 0.5266 - acc: 0.8092
Epoch 10/500
 - 0s - loss: 0.4835 - acc: 0.8238
Epoch 11/500
 - 0s - loss: 0.4704 - acc: 0.8418
Epoch 12/500
 - 0s - loss: 0.4622 - acc: 0.8283
Epoch 13/500
 - 0s - loss: 0.4431 - acc: 0.8350
Epoch 14/500
 - 0s - loss: 0.4603 - acc: 0.8485
Epoch 15/500
 - 0s - loss: 0.4607 - acc: 0.8474
Epoch 16/500
 - 0s - loss: 0.4735 - acc: 0.8328
Epoch 17/500
 - 0s - loss: 0.4725 - acc: 0.8249
Epoch 18/500
 - 0s - loss: 0.5209 - acc: 0.8002
Epoch 19/500
 - 0s - loss: 0.4912 - acc: 0.8294
Epoch 20/500
 - 0s - loss: 0.4524 - acc: 0.8462
Epoch 21/500
 - 0s - loss: 0.4626 - acc: 0.8597
E

Epoch 171/500
 - 0s - loss: 0.3464 - acc: 0.8788
Epoch 172/500
 - 0s - loss: 0.3126 - acc: 0.8911
Epoch 173/500
 - 0s - loss: 0.3110 - acc: 0.8900
Epoch 174/500
 - 0s - loss: 0.3199 - acc: 0.8878
Epoch 175/500
 - 0s - loss: 0.3308 - acc: 0.8822
Epoch 176/500
 - 0s - loss: 0.3119 - acc: 0.8911
Epoch 177/500
 - 0s - loss: 0.3112 - acc: 0.8878
Epoch 178/500
 - 0s - loss: 0.3080 - acc: 0.8878
Epoch 179/500
 - 0s - loss: 0.3126 - acc: 0.8900
Epoch 180/500
 - 0s - loss: 0.2956 - acc: 0.8923
Epoch 181/500
 - 0s - loss: 0.3051 - acc: 0.8990
Epoch 182/500
 - 0s - loss: 0.3236 - acc: 0.8799
Epoch 183/500
 - 0s - loss: 0.3158 - acc: 0.8777
Epoch 184/500
 - 0s - loss: 0.3596 - acc: 0.8620
Epoch 185/500
 - 0s - loss: 0.3698 - acc: 0.8552
Epoch 186/500
 - 0s - loss: 0.3150 - acc: 0.8889
Epoch 187/500
 - 0s - loss: 0.3555 - acc: 0.8676
Epoch 188/500
 - 0s - loss: 0.3439 - acc: 0.8956
Epoch 189/500
 - 0s - loss: 0.3174 - acc: 0.8934
Epoch 190/500
 - 0s - loss: 0.3002 - acc: 0.8855
Epoch 191/500
 - 0s 

 - 0s - loss: 0.2878 - acc: 0.8967
Epoch 339/500
 - 0s - loss: 0.2774 - acc: 0.8844
Epoch 340/500
 - 0s - loss: 0.2816 - acc: 0.8967
Epoch 341/500
 - 0s - loss: 0.2837 - acc: 0.8934
Epoch 342/500
 - 0s - loss: 0.2842 - acc: 0.8900
Epoch 343/500
 - 0s - loss: 0.2775 - acc: 0.8956
Epoch 344/500
 - 0s - loss: 0.2850 - acc: 0.8833
Epoch 345/500
 - 0s - loss: 0.2863 - acc: 0.8844
Epoch 346/500
 - 0s - loss: 0.2942 - acc: 0.8844
Epoch 347/500
 - 0s - loss: 0.2880 - acc: 0.8956
Epoch 348/500
 - 0s - loss: 0.2889 - acc: 0.8822
Epoch 349/500
 - 0s - loss: 0.2805 - acc: 0.8900
Epoch 350/500
 - 0s - loss: 0.2972 - acc: 0.8878
Epoch 351/500
 - 0s - loss: 0.2803 - acc: 0.8911
Epoch 352/500
 - 0s - loss: 0.2804 - acc: 0.8956
Epoch 353/500
 - 0s - loss: 0.2914 - acc: 0.8923
Epoch 354/500
 - 0s - loss: 0.2817 - acc: 0.8900
Epoch 355/500
 - 0s - loss: 0.3029 - acc: 0.8855
Epoch 356/500
 - 0s - loss: 0.2941 - acc: 0.8878
Epoch 357/500
 - 0s - loss: 0.2875 - acc: 0.8878
Epoch 358/500
 - 0s - loss: 0.2873

<keras.callbacks.History at 0x1a310e8908>

In [140]:
test_data.columns

Index(['Age', 'Parch', 'SibSp', 'Survived', 'is_test', 'Title_Capt',
       'Title_Col', 'Title_Don', 'Title_Dona', 'Title_Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [141]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)
