H2O documentation: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import h2o

In [None]:
# Absolute path of the input CSV.
DATA_PATH = "/work/jupyterhub/shared/benoit/data_h2o.csv"

# Load the dataset into pandas and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Count the rows in each "Legendary" class and draw the counts as a bar chart.
legendary_counts = df["Legendary"].value_counts()
legendary_counts.plot.bar()

# Preparation

In [None]:
# Settings passed to h2o.init for this notebook.
H2O_NTHREADS = 4
H2O_PORT = 54341

h2o.init(nthreads=H2O_NTHREADS, port=H2O_PORT)

In [None]:
# Name of the target column.
y = "Legendary"

# Convert the pandas DataFrame into an H2OFrame.
df_h2o = h2o.H2OFrame(df)

# Make the target categorical so H2O trains classifiers: the AUC and ROC
# reported further down only exist for classification models, and a numeric
# 0/1 response would otherwise be modelled as a regression target.
# asfactor() is harmless if the column is already categorical.
df_h2o[y] = df_h2o[y].asfactor()

# Split into roughly 70% training / 30% testing rows, with a fixed seed.
train, test = df_h2o.split_frame(ratios=[0.7], seed=42)

print("{} rows in training set".format(train.shape[0]))
print("{} rows in testing set".format(test.shape[0]))

# Predictors: every column except "num", "Name" and the target itself.
# Built as a fresh list so re-running the cell never mutates shared state.
excluded_columns = {"num", "Name", y}
features = [column for column in train.columns if column not in excluded_columns]

# Modeling

In [None]:
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Candidate values explored for each random-forest hyper-parameter.
gs_params1 = dict(
    ntrees=[20, 50, 100, 150],
    max_depth=[3, 5, 7, 9],
    sample_rate=[0.5, 0.8, 1.0],
)

# Random search over the candidates, capped at 5 models, with a fixed seed.
search_criteria = dict(strategy="RandomDiscrete", max_models=5, seed=42)

rf_grid1 = H2OGridSearch(
    model=H2ORandomForestEstimator,
    grid_id="rf_grid1",
    hyper_params=gs_params1,
    search_criteria=search_criteria,
)

# No validation frame is supplied; models are trained with nfolds=5 cross-validation.
# The trailing semicolon suppresses the cell's output.
rf_grid1.train(
    x=features,
    y=y,
    training_frame=train,
    seed=42,
    nfolds=5,
);

Re-running the grid search with the same `grid_id` adds new models to the existing grid, so you can continue the search instead of starting over.

In [None]:
# Fetch the grid with its models ranked by AUC, highest first, and display the ranking.
perfs = rf_grid1.get_grid(sort_by='auc', decreasing=True)
display(perfs)

In [None]:
# Select the best model by AUC. `rf_grid1.models` keeps the grid's default
# ordering rather than an AUC ranking, so take the first model of an
# AUC-sorted (descending) view of the grid instead.
model = rf_grid1.get_grid(sort_by='auc', decreasing=True).models[0]
model.summary()

In [None]:
# Score the selected model on the test frame and display the resulting metrics.
perf = model.model_performance(test)
perf

In [None]:
# Report AUC on the training data, under cross-validation, and on the test frame.
training_auc = model.auc()
print("Training AUC : {:.1%}".format(training_auc))

xval_auc = model.auc(xval=True)
print("Cross val AUC : {:.1%}".format(xval_auc))

testing_auc = perf.auc()
print("Testing AUC : {:.1%}".format(testing_auc))

In [None]:
# Plot the ROC curve of the performance object.
perf.plot(type="roc")

In [None]:
# Gains/lift table of the performance object, converted to a pandas DataFrame.
gains_lifts = perf.gains_lift()
pandas_gl = gains_lifts.as_data_frame()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(pandas_gl.cumulative_data_fraction, pandas_gl.cumulative_lift, label="model")
# Reference line at lift = 1 across the whole data fraction range.
ax.plot([0, 1], [1, 1], 'k-', label="baseline (lift = 1)")
ax.set_title("Cumulative lift curve")
ax.set_xlabel("Cumulative data fraction")
ax.set_ylabel("Cumulative Lift")
# Derive the upper y-limit from the data (10% headroom, never below the
# baseline) so the curve cannot be clipped by a hard-coded limit.
ax.set_ylim(0, 1.1 * max(1.0, pandas_gl.cumulative_lift.max()))
# The series carry labels, so render the legend to make them visible.
ax.legend()
plt.show()

# AutoML

H2O AutoML documentation: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

In [None]:
from h2o.automl import H2OAutoML

# AutoML settings: 3 folds, class balancing, run limits (300 s / 10 models),
# AUC as the stopping metric with 3 stopping rounds, and a fixed seed.
automl_settings = dict(
    nfolds=3,
    balance_classes=True,
    max_runtime_secs=300,
    max_models=10,
    stopping_metric="AUC",
    stopping_rounds=3,
    seed=42,
)

# Note: from here on `model` holds the AutoML run object, not a single fitted model.
model = H2OAutoML(**automl_settings)

model.train(x=features, y=y, training_frame=train)

# Show the ranking of the models produced by the run.
display(model.leaderboard)

In [None]:
# Take the leader of the AutoML run and score it on the test frame.
best_model = model.leader
perf = best_model.model_performance(test)

# Report AUC on the training data, under cross-validation, and on the test frame.
leader_training_auc = best_model.auc()
print("Training AUC : {:.1%}".format(leader_training_auc))

leader_xval_auc = best_model.auc(xval=True)
print("Cross val AUC : {:.1%}".format(leader_xval_auc))

leader_testing_auc = perf.auc()
print("Testing AUC : {:.1%}".format(leader_testing_auc))