In [None]:
import h2o
from h2o.automl import H2OAutoML, get_leaderboard

In [None]:
import pandas as pd
import json

from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Initialise H2O with default arguments (local cluster). This cell must run
# before any other h2o call in the notebook, e.g. h2o.import_file below.
# NOTE(review): presumably attaches to an already-running local instance if
# one exists, otherwise starts a new one — confirm against the h2o.init docs.
h2o.init()

In [None]:
# Read the processed training CSV straight into an H2O frame
main_frame = h2o.import_file(path='data/processed/train.csv')

# Persist the frame's column types as JSON so the test set can be parsed with
# matching types at prediction time
with open('data/processed/train_col_types.json', 'w') as fp:
    fp.write(json.dumps(main_frame.types))

In [None]:
# Label column; every other column of the frame is used as a predictor
target = 'Response'
predictors = [col for col in main_frame.col_names if col != target]

# Cast the target to a factor so AutoML treats the task as classification
# rather than regression
main_frame[target] = main_frame[target].asfactor()

# Preview the frame (displayed as the cell output)
main_frame.head()

In [None]:
# Configure the AutoML run; training itself is started in a later cell
aml = H2OAutoML(
    # Cap the run at n base models
    max_models=2,
    # Fixed random seed
    seed=42,
    # Target classes are imbalanced, so class balancing is switched on.
    # NOTE(review): with the exclusions below XGBoost looks like the only
    # algorithm left — confirm that it actually honours balance_classes.
    balance_classes=True,
    # Sort models by logloss (main metric for multi-class classification)
    sort_metric='logloss',
    # Emit info-level progress output
    verbosity='info',
    # Algorithms AutoML must skip
    exclude_algos=['GBM','GLM', 'DRF','StackedEnsemble','DeepLearning'],
)

In [None]:
# Run the AutoML search on the full training frame
aml.train(
    x=predictors,
    y=target,
    training_frame=main_frame,
)

In [None]:
# Fetch the event log recorded by the AutoML run and keep it in `log`
log = aml.event_log
# Bare expression as the last line so the notebook renders the log as cell output
log

In [None]:
# Display the leader (best) model found by the AutoML run.
# NOTE(review): "best" is presumably judged by the configured sort_metric
# ('logloss' in the H2OAutoML setup above) — confirm against the leaderboard.
aml.leader