# H2O AutoML 


## Setup

In [None]:
import sys
import os

# Put the parent of the working directory on sys.path (presumably so the
# project's `src` package, imported further down, resolves from this notebook).
current_working_directory = os.getcwd()

# Resolve to a normalised absolute path instead of leaving a literal ".."
# segment, so the sys.path entry is readable and comparable.
parent_directory = os.path.abspath(
    os.path.join(current_working_directory, os.pardir)
)

# Re-running this cell must not pile up duplicate sys.path entries.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

# Last expression of the cell: displays the working directory in the notebook.
os.getcwd()

In [None]:
# Install the h2o Python package into the environment used by this kernel.
%pip install h2o

In [None]:
# Load IPython's autoreload extension so that the %autoreload magic is available.
%load_ext autoreload

In [None]:
# Autoreload mode 2: re-import modules before each cell execution, so edits to
# the project modules are picked up without restarting the kernel.
%autoreload 2

from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

from src.ml_service import prepare_data, prepare_test_data, save_predictions
from src.config import TARGET_FEATURE

## Load data

In [None]:
# Split the project data; the two validation outputs are discarded because
# validation_size=0 is requested.
x_train, _, x_test, y_train, _, y_test = prepare_data(validation_size=0, test_size=0.1)

# An H2OFrame lives on the H2O cluster, so a connection must exist before the
# conversion below. h2o.init() attaches to an already running local cluster
# when there is one, so calling it again in a later cell is harmless.
h2o.init()

# Join features and target column-wise, then convert the pandas DataFrame to
# an H2O Frame.
training_data = h2o.H2OFrame(pd.concat([x_train, y_train], axis=1))

# The notebook evaluates with classification metrics, so make sure H2O treats
# the target as categorical; a numeric response would be trained as regression.
# NOTE(review): assumes TARGET_FEATURE is the name of the y_train column
# (it is used as the response column when training) -- confirm.
training_data[TARGET_FEATURE] = training_data[TARGET_FEATURE].asfactor()

## Train model

In [None]:
# Connect to a local H2O cluster, starting one if none is running yet
h2o.init()

# AutoML run capped at 20 base models; the fixed seed keeps runs comparable
predictor = H2OAutoML(
    max_models=20,
    seed=1,
)

# Every column of the training feature frame is used as a predictor
training_features: list[str] = x_train.columns.to_list()

In [None]:
# Fit AutoML on the H2O training frame
predictor.train(
    x=training_features,
    y=TARGET_FEATURE,
    training_frame=training_data,
)

# Show the full AutoML leaderboard: pass the row count explicitly because
# head() would otherwise stop at its default of 10 rows.
lb = predictor.leaderboard
lb.head(rows=lb.nrows)



## Make predictions

In [None]:
# Evaluate on the held-out test split.
# H2O models score H2OFrames, not pandas DataFrames, so convert the features.
test_frame = h2o.H2OFrame(x_test)

# predict() returns an H2OFrame; for classification its "predict" column holds
# the predicted label and the remaining columns the per-class probabilities.
# scikit-learn metrics need a 1-D label sequence, so keep only that column.
# NOTE(review): confirm the label dtype round-trips to match y_test.
y_test_pred = predictor.predict(test_frame).as_data_frame()["predict"]

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy: ", test_accuracy)
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

## Save predictions and inspect models

In [None]:
x_final_test = prepare_test_data()

# H2O models only score H2OFrames; convert unless the helper already
# returned one.
if isinstance(x_final_test, h2o.H2OFrame):
    final_test_frame = x_final_test
else:
    final_test_frame = h2o.H2OFrame(x_final_test)

# Keep only the predicted label column ("predict"); the other columns of the
# prediction frame are per-class probabilities.
# NOTE(review): assumes save_predictions expects a 1-D sequence of labels --
# confirm against src.ml_service.
final_predictions = predictor.predict(final_test_frame).as_data_frame()["predict"]

In [None]:
# Save the final predictions under the name 'h2o_automl' (the output format is
# decided by save_predictions). A plain string is used: the original f-string
# had no placeholders.
save_predictions(final_predictions, 'h2o_automl')

In [None]:
# Get the best model using the default sort metric
m = predictor.leader
# this is equivalent to
m = predictor.get_best_model()

# Get the best model using a non-default metric
m = predictor.get_best_model(criterion="logloss")

# Get the best XGBoost model using the default sort metric
xgb = predictor.get_best_model(algorithm="xgboost")

# Get the best XGBoost model, ranked by logloss
xgb = predictor.get_best_model(algorithm="xgboost", criterion="logloss")

In [None]:
# View the parameter names of the XGBoost model selected above.
# get_best_model() returns None when the leaderboard holds no model of the
# requested algorithm (for example when XGBoost is unavailable on this
# platform), so guard against that instead of failing with AttributeError.
xgb.params.keys() if xgb is not None else None

## Explaining
See the H2O model explainability documentation: <https://docs.h2o.ai/h2o/latest-stable/h2o-docs/explain.html>
