In [1]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

In [3]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "18.0.1.1" 2022-04-22; Java(TM) SE Runtime Environment (build 18.0.1.1+2-6); Java HotSpot(TM) 64-Bit Server VM (build 18.0.1.1+2-6, mixed mode, sharing)
  Starting server from /opt/homebrew/Caskroom/miniforge/base/envs/ML/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/lf/7qfqd7h11kbb3jmk2tcyjwch0000gn/T/tmpwzqn_npy
  JVM stdout: /var/folders/lf/7qfqd7h11kbb3jmk2tcyjwch0000gn/T/tmpwzqn_npy/h2o_dhruvnagill_started_from_python.out
  JVM stderr: /var/folders/lf/7qfqd7h11kbb3jmk2tcyjwch0000gn/T/tmpwzqn_npy/h2o_dhruvnagill_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.3
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_dhruvnagill_iuw18s
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
df = pd.read_excel('address_data-1.xlsx')


In [5]:
print("Columns in the dataset:")
for col in df.columns:
    print(f"- {col}")

Columns in the dataset:
- FDR ID
- Organization Name
- Physical Street
- Physical City
- Physical State
- Physical Country
- Physical Postal1
- Physical Subdivision
- Ref URL


In [6]:
h2o_df = h2o.H2OFrame(df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [27]:
h2o_df.types

{'FDR ID': 'int',
 'Organization Name': 'string',
 'Physical Street': 'string',
 'Physical City': 'enum',
 'Physical State': 'enum',
 'Physical Country': 'enum',
 'Physical Postal1': 'int',
 'Physical Subdivision': 'enum',
 'Ref URL': 'string'}

In [28]:
numeric_cols = [col for col in h2o_df.columns if h2o_df.types[col] in ["numeric", "int"]]
categorical_cols = [col for col in h2o_df.columns if h2o_df.types[col] in ["enum", "string"]]


In [29]:
print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


Numeric columns: ['FDR ID', 'Physical Postal1']
Categorical columns: ['Organization Name', 'Physical Street', 'Physical City', 'Physical State', 'Physical Country', 'Physical Subdivision', 'Ref URL']


In [30]:
potential_targets = categorical_cols
print("\nPotential target columns:", potential_targets)


Potential target columns: ['Organization Name', 'Physical Street', 'Physical City', 'Physical State', 'Physical Country', 'Physical Subdivision', 'Ref URL']


In [31]:
def run_automl(target):
    features = [col for col in h2o_df.columns if col != target]
    aml = H2OAutoML(max_runtime_secs=300, seed=42)
    aml.train(x=features, y=target, training_frame=h2o_df)
    return aml

In [32]:
type(potential_targets)

list

In [33]:
results = {}
if(type(potential_targets) == str):
    target = potential_targets
    print(f"\nRunning AutoML for target: {target}")
    try:
        aml = run_automl(target)
        performance = aml.leader.model_performance(h2o_df)
        results[target] = {
            'model': aml.leader,
            'performance': performance.auc()
        }
    except Exception as e:
        print(f"Error occurred for target {target}: {str(e)}")
else:
    for target in potential_targets:
        print(f"\nRunning AutoML for target: {target}")
        try:
            aml = run_automl(target)
            performance = aml.leader.model_performance(h2o_df)
            results[target] = {
                'model': aml.leader,
                'performance': performance.auc()
            }
        except Exception as e:
            print(f"Error occurred for target {target}: {str(e)}")



Running AutoML for target: Organization Name
AutoML progress: |
23:18:51.49: AutoML: XGBoost is not available; skipping it.
23:18:51.54: _train param, Dropping bad and constant columns: [Physical Street, Ref URL]
23:18:51.54: _response_column param, Use numerical, categorical or time variable. Currently used String
23:18:51.56: _train param, Dropping bad and constant columns: [Physical Street, Ref URL]
23:18:51.56: _response_column param, Use numerical, categorical or time variable. Currently used String
23:18:51.57: _train param, Dropping bad and constant columns: [Physical Street, Ref URL]
23:18:51.57: _response_column param, Use numerical, categorical or time variable. Currently used String
23:18:51.58: _train param, Dropping bad and constant columns: [Physical Street, Ref URL]
23:18:51.58: _response_column param, Use numerical, categorical or time variable. Currently used String
23:18:51.58: _train param, Dropping bad and constant columns: [Physical Street, Ref URL]
23:18:51.58: _

: 

: 

In [None]:
if results:
  
    best_target = max(results, key=lambda x: results[x]['performance'])
    best_model = results[best_target]['model']

    print(f"\nBest performing model predicts: {best_target}")
    print(f"Model performance (AUC): {results[best_target]['performance']}")

    
    def predict(data):
        h2o_data = h2o.H2OFrame(data)
        predictions = best_model.predict(h2o_data)
        return predictions.as_data_frame()


    print("\nExample prediction:")
    example_data = h2o_df[:1]  
    print(predict(example_data))
else:
    print("No successful models were trained.")


In [None]:
print(f"Best Model is : {best_model}")
print(f"Best target is : {best_target}")

In [None]:
h2o.shutdown()