# h2o auto ml example- titanic data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os

In [3]:
# Input data files are available in the "data/" directory.
print(os.listdir("data"))

# Any results you write to the current directory are saved as output.

['titanic_test.csv', 'submission.csv', 'titanic_train.csv']


In [4]:
#data cleaning and feature engineering 
def get_name_prefix(data):
    prefix = pd.Series(np.ones(data.shape[0]), index=data.index)
    data['Prefix'] = prefix
    data.loc[data.Name.str.contains('Miss.', regex=False), 'Prefix'] = 2
    data.loc[data.Name.str.contains('Mrs.', regex=False), 'Prefix'] = 3
    data.loc[data.Name.str.contains('Mr.', regex=False), 'Prefix'] = 4

In [5]:
# https://stackoverflow.com/a/42523230
def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        del df[each]
        df = pd.concat([df, dummies], axis=1)
    return df

In [6]:
def normalize(df, mean, std):
    """
    @param df pandas DataFrame
    @param mean pandas Series of column values mean
    @param std pandas Series of column values standard deviation
    """
    for i in range(mean.size):
        df[mean.index[i]] = (df[mean.index[i]] - mean[0]) / std[0] 

In [7]:
def process_data(data):
    # get prefix data
    get_name_prefix(data)
    # remove name and ticket
    data.drop(['Ticket', 'Name'], inplace=True, axis=1)
    # sex
    data.loc[data.Sex != 'male', 'Sex'] = 0;
    data.loc[data.Sex == 'male', 'Sex'] = 1;
    # cabin
    data.Cabin.fillna('0', inplace=True)
    data.loc[data.Cabin.str[0] == 'A', 'Cabin'] = 1
    data.loc[data.Cabin.str[0] == 'B', 'Cabin'] = 2
    data.loc[data.Cabin.str[0] == 'C', 'Cabin'] = 3
    data.loc[data.Cabin.str[0] == 'D', 'Cabin'] = 4
    data.loc[data.Cabin.str[0] == 'E', 'Cabin'] = 5
    data.loc[data.Cabin.str[0] == 'F', 'Cabin'] = 6
    data.loc[data.Cabin.str[0] == 'G', 'Cabin'] = 7
    data.loc[data.Cabin.str[0] == 'T', 'Cabin'] = 8
    # embarked
    data.Embarked.fillna(0, inplace=True)
    data.loc[data.Embarked == 'C', 'Embarked'] = 1
    data.loc[data.Embarked == 'Q', 'Embarked'] = 2
    data.loc[data.Embarked == 'S', 'Embarked'] = 3
    data.fillna(-1, inplace=True)
    
    data = one_hot(data, ('Pclass', 'Sex', 'Cabin', 'Embarked', 'Prefix'))
    return data.astype(float)

In [10]:
#load data
train_raw = pd.read_csv('data/titanic_train.csv')
test_raw = pd.read_csv('data/titanic_test.csv')

train = process_data(train_raw)
test = process_data(test_raw)

data_mean = train[['Age','Fare','SibSp','Parch']].mean(axis=0)
data_std = train[['Age','Fare','SibSp','Parch']].std(axis=0)

normalize(train, data_mean, data_std)
normalize(test, data_mean, data_std)

test, train = test.align(train, axis=1, fill_value=0)

In [12]:
test.head()

Unnamed: 0,Age,Cabin_0,Cabin_1,Cabin_2,Cabin_3,Cabin_4,Cabin_5,Cabin_6,Cabin_7,Cabin_8,...,Pclass_2,Pclass_3,Prefix_1.0,Prefix_2.0,Prefix_3.0,Prefix_4.0,Sex_0,Sex_1,SibSp,Survived
0,0.61001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.32087,0
1,1.309605,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.264902,0
2,2.149118,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.32087,0
3,0.190254,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.32087,0
4,-0.089584,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.264902,0


In [13]:
train.head()

Unnamed: 0,Age,Cabin_0,Cabin_1,Cabin_2,Cabin_3,Cabin_4,Cabin_5,Cabin_6,Cabin_7,Cabin_8,...,Pclass_2,Pclass_3,Prefix_1.0,Prefix_2.0,Prefix_3.0,Prefix_4.0,Sex_0,Sex_1,SibSp,Survived
0,-0.089584,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.264902,0.0
1,0.805897,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.264902,1.0
2,0.134286,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.32087,1.0
3,0.637994,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.264902,1.0
4,0.637994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.32087,0.0


In [14]:
#start H2O 
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,7 mins 45 secs
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,11 days
H2O cluster name:,H2O_from_python_nishantgautam_prr15k
H2O cluster total nodes:,1
H2O cluster free memory:,3.844 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [15]:
#load data as h2o frames
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
#drop passengerId from data set
passId = test['PassengerId']
train = train.drop('PassengerId',axis =1)
test = test.drop('PassengerId',axis =1)

In [17]:
#identify predictors and labels
x = train.columns
y = 'Survived'
x.remove(y)

#for binary classification, lables should be a factor
train[y] = train[y].asfactor()

In [21]:
# Run AutoML
aml_ti = H2OAutoML(max_runtime_secs= 120,max_models= 10, seed= 7,nfolds= 10)
aml_ti.train(x = x, y = y,
          training_frame = train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [25]:
aml_ti.leaderboard

model_id,auc,logloss,mean_per_class_error,rmse,mse
XGBoost_2_AutoML_20190206_094457,0.872892,0.417868,0.178291,0.360345,0.129849
GBM_1_AutoML_20190206_094457,0.872032,0.413346,0.179968,0.356766,0.127282
XGBoost_1_AutoML_20190206_094421,0.871726,0.408789,0.178099,0.354974,0.126007
XGBoost_1_AutoML_20190206_094457,0.871726,0.408789,0.178099,0.354974,0.126007
GBM_4_AutoML_20190206_094457,0.867606,0.418072,0.178027,0.356861,0.12735
GBM_2_AutoML_20190206_094457,0.867108,0.41623,0.180376,0.35647,0.127071
XGBoost_3_AutoML_20190206_094457,0.865297,0.432741,0.198495,0.368463,0.135765
GBM_3_AutoML_20190206_094457,0.865223,0.420427,0.177644,0.357952,0.128129
GLM_grid_1_AutoML_20190206_094421_model_1,0.856344,0.442253,0.202449,0.373309,0.13936
GLM_grid_1_AutoML_20190206_094457_model_1,0.856344,0.442253,0.202449,0.373309,0.13936




In [26]:
aml_ti.leader.predict(test)

xgboost prediction progress: |████████████████████████████████████████████| 100%


predict,p0,p1
0,0.86949,0.13051
0,0.554862,0.445138
0,0.889299,0.110701
0,0.812071,0.187929
1,0.458133,0.541867
0,0.872917,0.127083
1,0.441523,0.558477
0,0.824695,0.175305
1,0.265742,0.734258
0,0.907703,0.0922972




In [23]:
#check the leaderboard
lb_ti = aml_ti.leaderboard
lb_ti

#prediction
pred = aml_ti.leader.predict(test)

#save predict results to submission form
pred_df = pred.as_data_frame()
pred_res = pred_df.predict
passId_df = passId.as_data_frame()
res_ti = pd.concat([passId_df,pred_res],axis=1,ignore_index = True)
res_ti.columns = ['PassengerId','Survived']
res_ti.to_csv('mypred.csv',index=False)

#http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

xgboost prediction progress: |████████████████████████████████████████████| 100%
