In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O
h2o.init()

# Load the dataset into a dataframe
df = pd.read_csv("data.csv")
print(f"Number of columns: {len(df.columns)}")
print(f"Number of rows: {len(df.index)}")

# Map "M" to 1 and "B" to 0.
# Series.map produces the integer column directly; Series.replace on an
# object column relies on implicit downcasting, which pandas >= 2.2 deprecates.
df["diagnosis"] = df["diagnosis"].map({"M": 1, "B": 0})
# map() turns every label other than "M"/"B" into NaN, so fail loudly here
# rather than train on a silently corrupted target.
if df["diagnosis"].isna().any():
    raise ValueError('Unexpected value in "diagnosis": only "M" and "B" are supported')

# Drop the columns that are not used as features or target.
# errors="ignore" keeps the cell runnable on copies of the CSV that lack one
# of these columns (presumably 'Unnamed: 32' comes from a trailing delimiter
# in the file -- TODO confirm).
df = df.drop(columns=['id', 'Unnamed: 32'], errors="ignore")
print(f"Number of columns after dropping: {len(df.columns)}")

selected_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean']

X = df[selected_features]
y = df['diagnosis']

# Stratified 70/30 split so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition (no information leaks from test data).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def to_h2o_frame(features, target):
    """Combine a scaled feature array and its target Series into an H2OFrame.

    The target index is reset so it lines up row-by-row with the fresh
    RangeIndex of the feature DataFrame, and the 'diagnosis' column is
    converted to a factor so H2O treats the task as classification.
    """
    frame = h2o.H2OFrame(
        pd.concat(
            [pd.DataFrame(features, columns=selected_features), target.reset_index(drop=True)],
            axis=1,
        )
    )
    frame['diagnosis'] = frame['diagnosis'].asfactor()
    return frame


# Convert training data to H2O Frame
train_h2o = to_h2o_frame(X_train, y_train)

x = selected_features  # Features
# NOTE: from here on `y` is the target column *name* (a string), no longer the
# target Series defined above; the rebinding is kept for backward compatibility.
y = 'diagnosis'

aml = H2OAutoML(max_runtime_secs=300, seed=7654321, balance_classes=True)

# Train AutoML
aml.train(x=x, y=y, training_frame=train_h2o)

# Display the leaderboard
lb = aml.leaderboard
print(lb.head())

best_model = aml.leader

# Score the leader on the held-out test split, which the original cell
# created but never used; leaderboard metrics alone are not an independent
# estimate of generalization.
test_h2o = to_h2o_frame(X_test, y_test)
test_perf = best_model.model_performance(test_data=test_h2o)
print(f"Held-out test AUC: {test_perf.auc():.4f}")
print(f"Held-out test logloss: {test_perf.logloss():.4f}")

# Persist the fitted scaler: the saved model only accepts inputs scaled with
# exactly these statistics, so it is unusable for inference without it.
joblib.dump(scaler, "scaler.joblib")

# Save the best model (kept as the last expression so the cell still displays
# the path returned by h2o.save_model)
h2o.save_model(best_model, path="best_h2o_model")



Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,15 mins 35 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_Purushotham_Kilari_5ia8on
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.501 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Number of columns: 33
Number of rows: 569
Number of columns after dropping: 31
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
11:44:40.212: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse        mse
DeepLearning_grid_1_AutoML_2_20240914_114440_model_6     0.986243   0.144007  0.981652               0.0654324  0.207931  0.0432355
DeepLearning_grid_1_AutoML_2_20240914_114440_model_1     0.985676   0.161634  0.978857               0.0694324  0.219512  0.0481854
DeepLearning_grid_1_AutoML_2_20240914_114440_model_7     0.985486   0.219227  0.978878               0.0612973  0.234035  0.0547725
DeepLearning_grid_2_AutoML_2_20240914_114440_model_1     0.985432   0.173977  0.9788                 0.0708108  0.228871  0.0523819
DeepLear

'E:\\projects\\Mlops-Assignment-2\\best_h2o_model\\DeepLearning_grid_1_AutoML_2_20240914_114440_model_6'