In [1]:
import h2o
from h2o.automl import H2OAutoML, get_leaderboard

In [2]:
import pandas as pd
import json

from sklearn.metrics import f1_score, accuracy_score

In [3]:
# Start the H2O cluster (locally)
# Per the captured output of this cell, the call first looks for an instance at
# http://localhost:54321 and, when none is found, launches a local H2O server
# and connects to it.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "22.0.1" 2024-04-16; Java(TM) SE Runtime Environment (build 22.0.1+8-16); Java HotSpot(TM) 64-Bit Server VM (build 22.0.1+8-16, mixed mode, sharing)
  Starting server from /home/sounak/anaconda3/envs/mlops/lib/python3.11/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpewfvjlg0
  JVM stdout: /tmp/tmpewfvjlg0/h2o_sounak_started_from_python.out
  JVM stderr: /tmp/tmpewfvjlg0/h2o_sounak_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,4 days
H2O_cluster_name:,H2O_from_python_sounak_j9yx4c
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.395 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [None]:
# Read the training CSV directly into an H2O frame
train_csv_path = 'data/processed/train.csv'
main_frame = h2o.import_file(path=train_csv_path)

# Persist the frame's column types as JSON so the test set can be matched
# against the same types at prediction time
col_types_path = 'data/processed/train_col_types.json'
with open(col_types_path, 'w') as types_file:
    json.dump(main_frame.types, types_file)

In [None]:
# Column to predict; every other column in the frame is used as a predictor
target = 'Response'
predictors = [col for col in main_frame.col_names if col != target]

# Turn the target into a factor so that AutoML treats this as a classification
# problem (instead of regression)
target_as_factor = main_frame[target].asfactor()
main_frame[target] = target_as_factor

# Show the first rows of the H2O frame to check its structure
main_frame.head()

In [None]:
# Algorithms AutoML must skip during this run
excluded_algos = ['GBM', 'GLM', 'DRF', 'StackedEnsemble', 'DeepLearning']

# Configure the AutoML run (training itself happens in a later cell)
aml = H2OAutoML(
    max_models=2,              # cap on the number of base models to build
    seed=42,                   # fixed seed for the run
    balance_classes=True,      # target classes are imbalanced, so balancing is enabled
    sort_metric='logloss',     # rank models by logloss (main metric for multi-classification)
    verbosity='info',          # emit verbose info messages
    exclude_algos=excluded_algos,
)

In [None]:
# Train AutoML on the full frame; only a training frame is supplied here
aml.train(
    x=predictors,
    y=target,
    training_frame=main_frame,
)

In [None]:
# Get AutoML event log
# The log is kept in `log`, and the bare name on the last line makes the
# notebook display it as the cell's output.
log = aml.event_log
log

In [None]:
# Leader (best) model stored here
# NOTE(review): "best" is presumably judged by the sort_metric given to
# H2OAutoML ('logloss') — confirm against the leaderboard.
aml.leader