# Porto Seguro’s Safe Driver Prediction: H2O.ai


NB: The only way I managed to make the ensemble work was by grabbing the model IDs from the grid search outputs.

## Initializing H2O

In [None]:
import pandas as pd 
import json
import h2o
# Connect to H2O at a fixed address before any estimator is built.
# NOTE(review): the IP/port are hard-coded and environment-specific — presumably a
# private cluster; confirm and consider reading them from configuration instead.
h2o.init(ip="10.0.21.30", port=54321)
from h2o.automl import H2OAutoML
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
import numpy as np

In [None]:
import pandas as pd
import numpy as np

# Load the competition files into H2O frames.
path = '../input/'
train_data = h2o.import_file(path + 'train.csv')
test_data = h2o.import_file(path + 'test.csv')
# Reuse the frame that was just imported instead of parsing test.csv a second time.
test_id = test_data['id']
# Convert the label to a factor (categorical) column.
train_data["target"] = train_data["target"].asfactor()
# Deliberately small split to keep runs fast: 10% train, 10% valid, the rest is `test`.
train, valid, test = train_data.split_frame(ratios=[0.1, 0.1], seed=3)
y = 'target'
# Predictors are every column except the label and the row identifier:
# `id` is only a key for the submission file and must not be used as a feature.
x = [column for column in train_data.columns if column not in (y, 'id')]

def plot_perf(grid, test):
    """Print a short performance report for every model held by ``grid``.

    Despite the name, nothing is plotted: for each entry of ``grid.models`` the
    function prints its position, its ``model_id`` and the Gini and AUC obtained
    from ``model.model_performance(test)``.

    Args:
        grid: object exposing a ``models`` iterable (e.g. a trained grid search).
        test: frame passed unchanged to each model's ``model_performance``.

    Returns:
        None. The report is written to standard output.
    """
    print('best')
    position = 0
    for candidate in grid.models:
        print(f"model {position!s}")
        print(f"id {candidate.model_id!s}")
        # Score only after the header lines, so they appear even if scoring fails.
        metrics = candidate.model_performance(test)
        print(f"gini: {metrics.gini()!s}")
        print(f"auc: {metrics.auc()!s}")
        print('--------\n\n')
        position += 1

# Number of cross-validation folds, used by every base model of the ensemble.
nfolds = 2

# Every model must share the same random seed or the ensemble won't work.
# Each model also has to be created with
# 'keep_cross_validation_predictions': True and 'fold_assignment': "Modulo".
seed = 1

# Random search settings reused by all grids below.
search_criteria = dict(
    strategy="RandomDiscrete",
    max_models=2,
    seed=seed,
)


## Random Forest

In [None]:
# Random Forest search space sampled by the grid search.
hyper_parameters = dict(
    max_depth=[4],
    ntrees=[5],  # larger values tried previously: 300, 250, 350, 400, 500
    sample_rate=[0.8, 0.7, 0.6],
)

# Fixed settings shared by every Random Forest in the grid. Seed, fold assignment,
# fold count and kept CV predictions match the other grids so the models can be stacked.
params = dict(
    balance_classes=True,
    seed=seed,
    fold_assignment="Modulo",
    nfolds=nfolds,
    keep_cross_validation_predictions=True,
    stopping_rounds=10,
    stopping_metric='AUC',
)

grid_3 = H2OGridSearch(
    H2ORandomForestEstimator(**params),
    hyper_params=hyper_parameters,
    search_criteria=search_criteria,
)

# Fit on the training split, with `valid` as the validation frame.
grid_3.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_3.show()

# Report Gini/AUC of each grid model on the validation and held-out splits.
plot_perf(grid_3, valid)
plot_perf(grid_3, test)

## Neural Network (NN)

In [None]:
# Deep Learning search space.
# Other activations considered: "Rectifier", "Tanh", "Maxout",
# "RectifierWithDropout", "TanhWithDropout", "MaxoutWithDropout".
# A 'rate_annealing' axis ([1e-9, 1e-8, 1e-7]) was tried and is currently disabled.
# NOTE(review): H2O documents `rate` as only taking effect when `adaptive_rate`
# is disabled; confirm whether adaptive_rate=False was intended here.
hyper_parameters = dict(
    activation=["Maxout"],
    epochs=[12],
    hidden=[[10, 5]],
    input_dropout_ratio=[0.01, 0.05],
    rate=[0.01, 0.05],
)

# Fixed settings shared by every network in the grid. Seed, fold assignment,
# fold count and kept CV predictions match the other grids so the models can be stacked.
params = dict(
    score_interval=1,
    stopping_rounds=5,
    stopping_metric='AUC',
    balance_classes=True,
    nfolds=nfolds,
    seed=seed,
    keep_cross_validation_predictions=True,
    fold_assignment="Modulo",
)

grid_4 = H2OGridSearch(
    H2ODeepLearningEstimator(**params),
    hyper_params=hyper_parameters,
    search_criteria=search_criteria,
)

# Fit on the training split, with `valid` as the validation frame.
grid_4.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_4.show()

# Report Gini/AUC of each grid model on the validation and held-out splits.
plot_perf(grid_4, valid)
plot_perf(grid_4, test)

## Linear Model

In [None]:
# GLM search space: `alpha` and `lambda` candidates.
# A dict literal is required because `lambda` is a Python keyword.
hyper_parameters = {
    'alpha': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    'lambda': [1e-10, 1e-9, 1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0],
}

# Fixed settings shared by every GLM in the grid. Seed, fold assignment,
# fold count and kept CV predictions match the other grids so the models can be stacked.
params = dict(
    family='binomial',
    nfolds=nfolds,
    seed=seed,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

grid_5 = H2OGridSearch(
    H2OGeneralizedLinearEstimator(**params),
    hyper_params=hyper_parameters,
    search_criteria=search_criteria,
)

# Fit on the training split, with `valid` as the validation frame.
grid_5.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_5.show()

# Report Gini/AUC of each grid model on the validation and held-out splits.
plot_perf(grid_5, valid)
plot_perf(grid_5, test)

## Gradient Boosting

In [None]:
# Gradient Boosting search space sampled by the grid search.
hyper_parameters = dict(
    ntrees=[50],
    learn_rate=[0.05, 0.01],
    max_depth=[4],
    sample_rate=[0.8, 0.9, 0.7],
    col_sample_rate=[0.6, 0.7, 0.8],
)

# Fixed settings shared by every GBM in the grid. Seed, fold assignment,
# fold count and kept CV predictions match the other grids so the models can be stacked.
params = dict(
    balance_classes=True,
    seed=seed,
    fold_assignment="Modulo",
    nfolds=nfolds,
    keep_cross_validation_predictions=True,
    stopping_tolerance=0.0005,
    stopping_metric='AUC',
    stopping_rounds=10,
    score_each_iteration=True,
    score_tree_interval=50,
)

# Build the grid under an explicit grid id, then train it.
grid_1 = H2OGridSearch(
    model=H2OGradientBoostingEstimator(**params),
    grid_id="gbm_grid_binomial",
    hyper_params=hyper_parameters,
    search_criteria=search_criteria,
)
grid_1.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_1.show()

# Report Gini/AUC of each grid model on the validation and held-out splits.
plot_perf(grid_1, valid)
plot_perf(grid_1, test)

## Ensemble

In [None]:
# Stack the first model id reported by each grid (GBM, Random Forest, Deep Learning, GLM).
base_models = [grid.model_ids[0] for grid in (grid_1, grid_3, grid_4, grid_5)]
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_1", base_models=base_models)
ensemble.train(x=x, y=y, training_frame=train)

# Evaluate the stacked model on the held-out split.
perf_stack_test = ensemble.model_performance(test)
print(perf_stack_test.gini())
print(perf_stack_test.auc())

# Score the competition test set and pair each id with the predicted
# probability of the positive class (column 'p1').
test_pred = ensemble.predict(test_data)
submission = pd.concat(
    (h2o.as_list(test_id), h2o.as_list(test_pred['p1'])),
    axis=1,
    ignore_index=True,
)
submission.columns = ['id', 'target']

# Write id/target as ordinary columns. index=False drops the row index, so there is
# no need to move both columns into the index just to get them written.
submission.to_csv('submission_h2o_gbm.csv_3.gz', compression='gzip', index=False)