In [18]:

!pip install boto3
!pip install h2o


Collecting h2o
  Downloading h2o-3.46.0.6.tar.gz (265.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.8/265.8 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.46.0.6-py2.py3-none-any.whl size=265859786 sha256=98575453beae511aa11bd6f2dc9131654d12d795f93205b99f5bca22bbdccda9
  Stored in directory: /root/.cache/pip/wheels/0c/95/eb/b449c73f478dbc4557b80170cf7f2aa6db15862877e9a63536
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.46.0.6


Importing Libraries

In [19]:
import boto3
import pandas as pd
from io import StringIO
import os
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split

## Data Import from S3

Credentials for access

In [23]:

# Initialize AWS session
session = boto3.Session(
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name='us-east-1'
)

s3 = session.client('s3')

Accessing the data from S3

In [24]:
# Step 1: Load the CSV file from S3
bucket = 'newscatayerhs'
key = 'Customer-Churn-Records.csv'

obj = s3.get_object(Bucket=bucket, Key=key)
csv_string = obj['Body'].read().decode('utf-8')

# Step 2: Read the CSV into a pandas DataFrame
data = StringIO(csv_string)
df = pd.read_csv(data)

print("Data Loaded Successfully:")
print(df.head())

Data Loaded Successfully:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  Complain  Satisfaction Score Card Type  \
0        101348.88       1         1                   2   DIA

## H2O for Model building

In [39]:
# Step 3: Start the H2O cluster
h2o.init()

# Step 4: Define significant features and the target column
significant_features = [
    'Gender', 'Geography', 'IsActiveMember', 'Age', 'CreditScore',
    'Balance', 'Satisfaction Score', 'Point Earned', 'Tenure',
    'EstimatedSalary', 'NumOfProducts', 'Card Type', 'HasCrCard','Complain'
]
target = 'Exited'

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,19 mins 32 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,"7 days, 20 hours and 35 minutes"
H2O_cluster_name:,H2O_from_python_unknownUser_ur1gyx
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.073 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


### Test Train split and preprocessing

In [40]:

# Step 5: Preprocess the data (ensure target column is categorical)
# Convert categorical columns using Label Encoding
categorical_columns = ['Gender', 'Geography', 'Card Type']
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Split the data into training and test sets
X = df[significant_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Combine training data for H2O
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# Step 6: Convert pandas DataFrames to H2OFrames
h2o_data = h2o.H2OFrame(df_train)
h2o_test = h2o.H2OFrame(df_test)

# Ensure the target column is treated as categorical
h2o_data[target] = h2o_data[target].asfactor()
h2o_test[target] = h2o_test[target].asfactor()


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### AutoMl Model


In [42]:
# Step 7: Initialize H2O AutoML
aml = H2OAutoML(max_models=20, seed=42, verbosity="info")


### Training


In [43]:

# Step 8: Train the AutoML model on the training set
aml.train(x=significant_features, y=target, training_frame=h2o_data)

AutoML progress: |
00:52:12.62: Project: AutoML_2_20241110_05212
00:52:12.62: 5-fold cross-validation will be used.
00:52:12.65: Setting stopping tolerance adaptively based on the training frame: 0.011180339887498949
00:52:12.65: Build control seed: 42
00:52:12.66: training frame: Frame key: AutoML_2_20241110_05212_training_py_6_sid_a487    cols: 15    rows: 8000  chunks: 1    size: 152556  checksum: -3697312703151767348
00:52:12.66: validation frame: NULL
00:52:12.66: leaderboard frame: NULL
00:52:12.66: blending frame: NULL
00:52:12.66: response column: Exited
00:52:12.66: fold column: null
00:52:12.66: weights column: null
00:52:12.66: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (7g, 10w)]}, {DeepLearning : [

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,88.0,88.0,75871.0,7.0,7.0,7.0,27.0,99.0,63.886364

Unnamed: 0,0,1,Error,Rate
0,6355.0,0.0,0.0,(0.0/6355.0)
1,0.0,1645.0,0.0,(0.0/1645.0)
Total,6355.0,1645.0,0.0,(0.0/8000.0)

metric,threshold,value,idx
max f1,0.7610675,1.0,228.0
max f2,0.7610675,1.0,228.0
max f0point5,0.7610675,1.0,228.0
max accuracy,0.7610675,1.0,228.0
max precision,0.9998097,1.0,0.0
max recall,0.7610675,1.0,228.0
max specificity,0.9998097,1.0,0.0
max absolute_mcc,0.7610675,1.0,228.0
max min_per_class_accuracy,0.7610675,1.0,228.0
max mean_per_class_accuracy,0.7610675,1.0,228.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.9996833,4.8632219,4.8632219,1.0,0.9997277,1.0,0.9997277,0.0486322,0.0486322,386.3221884,386.3221884,0.0486322
2,0.02,0.9996231,4.8632219,4.8632219,1.0,0.9996516,1.0,0.9996897,0.0486322,0.0972644,386.3221884,386.3221884,0.0972644
3,0.03,0.9995793,4.8632219,4.8632219,1.0,0.9996036,1.0,0.999661,0.0486322,0.1458967,386.3221884,386.3221884,0.1458967
4,0.04,0.9995124,4.8632219,4.8632219,1.0,0.9995406,1.0,0.9996309,0.0486322,0.1945289,386.3221884,386.3221884,0.1945289
5,0.05,0.9994343,4.8632219,4.8632219,1.0,0.9994766,1.0,0.9996,0.0486322,0.2431611,386.3221884,386.3221884,0.2431611
6,0.1,0.9990084,4.8632219,4.8632219,1.0,0.9992354,1.0,0.9994177,0.2431611,0.4863222,386.3221884,386.3221884,0.4863222
7,0.15,0.9982505,4.8632219,4.8632219,1.0,0.9986844,1.0,0.9991733,0.2431611,0.7294833,386.3221884,386.3221884,0.7294833
8,0.2,0.9940503,4.8632219,4.8632219,1.0,0.997203,1.0,0.9986807,0.2431611,0.9726444,386.3221884,386.3221884,0.9726444
9,0.3,0.0004737,0.2735562,3.3333333,0.05625,0.0574346,0.6854167,0.684932,0.0273556,1.0,-72.6443769,233.3333333,0.8811959
10,0.4,0.0003406,0.0,2.5,0.0,0.0003978,0.5140625,0.5137984,0.0,1.0,-100.0,150.0,0.7553108

Unnamed: 0,0,1,Error,Rate
0,6346.0,9.0,0.0014,(9.0/6355.0)
1,3.0,1642.0,0.0018,(3.0/1645.0)
Total,6349.0,1651.0,0.0015,(12.0/8000.0)

metric,threshold,value,idx
max f1,0.8590504,0.9963592,247.0
max f2,0.8590504,0.9974487,247.0
max f0point5,0.8590504,0.9952722,247.0
max accuracy,0.8590504,0.9985,247.0
max precision,0.9999139,1.0,0.0
max recall,0.0004702,1.0,344.0
max specificity,0.9999139,1.0,0.0
max absolute_mcc,0.8590504,0.9954173,247.0
max min_per_class_accuracy,0.8590504,0.9981763,247.0
max mean_per_class_accuracy,0.8590504,0.99838,247.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.9997796,4.8632219,4.8632219,1.0,0.9998283,1.0,0.9998283,0.0486322,0.0486322,386.3221884,386.3221884,0.0486322
2,0.02,0.9996974,4.8632219,4.8632219,1.0,0.9997368,1.0,0.9997826,0.0486322,0.0972644,386.3221884,386.3221884,0.0972644
3,0.03,0.9996243,4.8632219,4.8632219,1.0,0.9996655,1.0,0.9997435,0.0486322,0.1458967,386.3221884,386.3221884,0.1458967
4,0.04,0.9995191,4.8632219,4.8632219,1.0,0.9995702,1.0,0.9997002,0.0486322,0.1945289,386.3221884,386.3221884,0.1945289
5,0.05,0.9994201,4.8632219,4.8632219,1.0,0.9994713,1.0,0.9996544,0.0486322,0.2431611,386.3221884,386.3221884,0.2431611
6,0.1,0.9988074,4.8510638,4.8571429,0.9975,0.9991559,0.99875,0.9994052,0.2425532,0.4857143,385.106383,385.7142857,0.4855569
7,0.15,0.9975511,4.8632219,4.8591692,1.0,0.9982826,0.9991667,0.999031,0.2431611,0.7288754,386.3221884,385.91692,0.728718
8,0.2,0.9885469,4.8024316,4.8449848,0.9875,0.9954949,0.99625,0.998147,0.2401216,0.968997,380.2431611,384.4984802,0.9680528
9,0.3,0.0006252,0.3039514,3.331307,0.0625,0.0634388,0.685,0.6865776,0.0303951,0.9993921,-69.6048632,233.1306991,0.8804306
10,0.4,0.0004256,0.006079,2.5,0.00125,0.000508,0.5140625,0.5150602,0.0006079,1.0,-99.3920973,150.0,0.7553108

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9985,0.0009479,0.999375,0.9975,0.9975,0.999375,0.99875
aic,,0.0,,,,,
auc,0.9995234,0.0004800,0.9995788,0.9994800,0.9987419,0.9999776,0.9998390
err,0.0015,0.0009479,0.000625,0.0025,0.0025,0.000625,0.00125
err_count,2.4,1.5165751,1.0,4.0,4.0,1.0,2.0
f0point5,0.9953013,0.0038736,0.9976176,0.9902201,0.9922527,0.9993573,0.9970588
f1,0.9963716,0.0022860,0.9985097,0.993865,0.9940299,0.9983948,0.9970588
f2,0.9974493,0.0012898,0.9994034,0.9975370,0.9958134,0.9974343,0.9970588
lift_top_group,4.8677797,0.1683653,4.776119,4.9382715,4.790419,5.1282053,4.7058825
loglikelihood,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2024-11-10 00:53:06,11.623 sec,0.0,0.4041576,0.5081021,0.5,0.205625,1.0,0.794375
,2024-11-10 00:53:06,11.749 sec,5.0,0.2711825,0.2759509,0.9998579,0.9995668,4.8632219,0.001
,2024-11-10 00:53:06,11.869 sec,10.0,0.1770524,0.1632038,0.999968,0.9998797,4.8632219,0.001
,2024-11-10 00:53:06,11.991 sec,15.0,0.1180252,0.1018138,0.9999745,0.9999073,4.8632219,0.00075
,2024-11-10 00:53:06,12.118 sec,20.0,0.0806634,0.065334,0.9999974,0.9999901,4.8632219,0.000375
,2024-11-10 00:53:06,12.238 sec,25.0,0.0576807,0.043066,0.9999974,0.9999901,4.8632219,0.000375
,2024-11-10 00:53:06,12.318 sec,30.0,0.0420894,0.0269713,0.9999974,0.9999901,4.8632219,0.000375
,2024-11-10 00:53:07,12.396 sec,35.0,0.0348928,0.0196127,0.9999974,0.9999901,4.8632219,0.000375
,2024-11-10 00:53:07,12.469 sec,40.0,0.0291907,0.0135421,0.9999976,0.9999908,4.8632219,0.000375
,2024-11-10 00:53:07,12.555 sec,45.0,0.0250901,0.0100578,0.9999983,0.9999934,4.8632219,0.000375

variable,relative_importance,scaled_importance,percentage
Complain,5104.534668,1.0,0.8182994
Age,572.5467529,0.1121643,0.091784
NumOfProducts,261.487793,0.0512266,0.0419187
IsActiveMember,103.7399063,0.0203231,0.0166304
Balance,68.6955109,0.0134577,0.0110125
Geography,30.7040462,0.0060151,0.0049221
EstimatedSalary,23.9019184,0.0046825,0.0038317
Point Earned,21.6610413,0.0042435,0.0034724
Card Type,12.5476036,0.0024581,0.0020115
Tenure,11.8227377,0.0023161,0.0018953


In [44]:
# Step 9: View the leaderboard for the best models
lb = aml.leaderboard
print("H2O AutoML Leaderboard:")
print(lb.head(rows=lb.nrows))



H2O AutoML Leaderboard:
model_id                                                     auc     logloss     aucpr    mean_per_class_error       rmse         mse
GBM_2_AutoML_2_20241110_05212                           0.999599  0.00909586  0.998451              0.00161996  0.0384731  0.00148018
GBM_3_AutoML_2_20241110_05212                           0.999579  0.00923378  0.998412              0.00161996  0.0385214  0.0014839
GBM_5_AutoML_2_20241110_05212                           0.999571  0.00904848  0.998502              0.00161996  0.0385466  0.00148584
StackedEnsemble_BestOfFamily_1_AutoML_2_20241110_05212  0.999478  0.008662    0.998734              0.00161996  0.038334   0.0014695
DeepLearning_grid_3_AutoML_2_20241110_05212_model_1     0.999478  0.00953911  0.997465              0.00161996  0.0390039  0.0015213
GBM_4_AutoML_2_20241110_05212                           0.999455  0.00924641  0.998014              0.00161996  0.0385376  0.00148514
XGBoost_2_AutoML_2_20241110_05212        

### Evaluate on Test data

In [45]:
# Step 10: Evaluate the best model on the test set
best_model = aml.leader
perf = best_model.model_performance(h2o_test)
print("Model Performance:")
print(perf)


Model Performance:
ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0009888358994221808
RMSE: 0.03144576123139939
LogLoss: 0.006614214068769838
Mean Per-Class Error: 0.0015834033989337362
AUC: 0.9995677308720912
AUCPR: 0.9989392509447373
Gini: 0.9991354617441823

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9329197367399474
       0     1    Error    Rate
-----  ----  ---  -------  ------------
0      1606  1    0.0006   (1.0/1607.0)
1      1     392  0.0025   (1.0/393.0)
Total  1607  393  0.001    (2.0/2000.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.93292      0.997455  206
max f2                       0.93292      0.997455  206
max f0point5                 0.93292      0.997455  206
max accuracy                 0.93292      0.999     206
max precision                0.999811     1         0
ma

### Store in Mojo format

In [46]:

# Step 11: Save the best model to a local file
model_path = best_model.download_mojo(path="/tmp")
model_filename = model_path.split('/')[-1]

print(f"Model saved locally as {model_filename}")

Model saved locally as GBM_2_AutoML_2_20241110_05212.zip


### Save it to S3


In [47]:

# Step 12: Upload the model to S3
s3.upload_file(
    Filename=model_path,
    Bucket=bucket,
    Key=f"h2o_models/{model_filename}"
)

print(f"Model uploaded to S3 at s3://{bucket}/h2o_models/{model_filename}")

Model uploaded to S3 at s3://newscatayerhs/h2o_models/GBM_2_AutoML_2_20241110_05212.zip


### Check if saved

In [48]:
response = s3.list_objects_v2(Bucket=bucket, Prefix="h2o_models/")
for obj in response.get('Contents', []):
    print(obj['Key'])


h2o_models/GBM_2_AutoML_2_20241110_05212.zip
