## Precondition
To run H2O, download the latest version and use Java 11. Example launch command:
```shell
path-to-java-11.exe -jar -Xmx16g -Xms8g  path-to-h2o.jar > NUL 2>&1
```

In [3]:
from sklearn.metrics import accuracy_score
from keras.src.metrics.accuracy_metrics import accuracy
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# %%
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from ass3.AutoMLClassifier import AutoMLClassifier
from ass3.BigDaddyWrapper import BigDaddyWrapper


def load_data(dataset_path, column_types_path):
    """Read a CSV dataset together with its pickled column description.

    Args:
        dataset_path: Location of the CSV file, parsed with ``pd.read_csv``.
        column_types_path: Location of the pickle file that is loaded as the
            feature structure.

    Returns:
        A ``(data, feature_structure)`` tuple: the parsed DataFrame and the
        unpickled object.
    """
    frame = pd.read_csv(dataset_path)

    # NOTE: pickle executes arbitrary code on load; only point this at
    # trusted, locally produced metadata files.
    with open(column_types_path, 'rb') as handle:
        column_types = pickle.load(handle)

    return frame, column_types


def prepare_data(data, feature_structure):
    """Prepare data for training and testing: preprocess features and split data."""

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.model_selection import train_test_split

    # Extract features and target
    feature_columns = (
            feature_structure['bin'] +
            feature_structure['cat'] +
            feature_structure['cont'] +
            feature_structure['ord']
    )
    X = data[feature_columns]
    y = data[feature_structure['target']]

    # Check for missing values and clean the data
    for col in feature_structure['bin'] + feature_structure['cat']:
        # Ensure proper dtype conversion to 'category'
        data[col] = data[col].astype("category")
        # Replace missing values with placeholder (e.g., "Unknown")
        data[col] = data[col].cat.add_categories("Unknown").fillna("Unknown")

    for col in feature_structure['cont']:
        # Fill missing continuous values with column mean (or another strategy)
        data[col] = data[col].fillna(data[col].mean())

    for col in feature_structure['ord']:
        # Ensure ordinal columns are numeric and have no missing values
        data[col] = pd.to_numeric(data[col], errors="coerce")  # Convert to numeric
        data[col] = data[col].fillna(data[col].median())  # Fill missing with median

    # Define preprocessing pipeline
    binary_and_cat_columns = feature_structure['bin'] + feature_structure['cat']
    continuous_columns = feature_structure['cont']
    ordinal_columns = feature_structure['ord']

    preprocessor = ColumnTransformer(
        transformers=[
            ("binary_cat", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse_output=False),
             binary_and_cat_columns),
            ("continuous", StandardScaler(), continuous_columns),
            ("ordinal", "passthrough", ordinal_columns)
        ]
    )

    # Apply preprocessing
    X_processed = preprocessor.fit_transform(data[feature_columns])

    # Convert the processed data back to DataFrame
    processed_feature_names = preprocessor.get_feature_names_out()
    X_processed_df = pd.DataFrame(X_processed, columns=processed_feature_names)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed_df, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test




def train_and_evaluate(automl_classifier, X_train, X_test, y_train, y_test):
    """Fit the TPOT and H2O backends in turn and print each one's accuracy.

    Args:
        automl_classifier: Object exposing ``train_tpot``, ``predict_tpot``,
            ``train_h2o``, ``predict_h2o`` and ``evaluate``.
        X_train: Training features handed to both ``train_*`` methods.
        X_test: Test features handed to both ``predict_*`` methods.
        y_train: Training labels handed to both ``train_*`` methods.
        y_test: Reference labels handed to ``evaluate``.

    Returns:
        None. The scores are only printed.
    """

    def report(banner, predicted):
        # Score the predictions against the held-out labels and print them.
        print(f"{banner}{automl_classifier.evaluate(y_test, predicted)}")

    # TPOT first ...
    automl_classifier.train_tpot(X_train, y_train)
    report("[+] TPOT Accuracy: ", automl_classifier.predict_tpot(X_test))

    # ... then H2O AutoML.
    automl_classifier.train_h2o(X_train, y_train)
    report("[+] H2O AutoML Accuracy: ", automl_classifier.predict_h2o(X_test))

def train_and_evaluate_big_daddy(data, feature_structure):
    """Train the Big Daddy wrapper and print its accuracy on the test split.

    Args:
        data: DataFrame passed to ``BigDaddyWrapper`` and to ``prepare_data``.
        feature_structure: Column description passed to both as well.

    Returns:
        None. The accuracy is only printed.
    """
    # NOTE(review): the wrapper is built from the full frame and trained
    # before the split below is made; whether it holds out the test rows
    # internally cannot be seen from here - confirm.
    big_daddy = BigDaddyWrapper(data, feature_structure)
    big_daddy.train_model()

    # Only the held-out part of the split is needed for scoring.
    _, X_test, _, y_test = prepare_data(data, feature_structure)
    score = accuracy_score(y_test, big_daddy.predict(X_test))
    print(f"[+] Big Daddy Accuracy: {score}")


In [None]:
# Each entry pairs a CSV dataset with the pickle file describing its columns.
datasets = [
    {
        "dataset_path": "./data/congress_voting/CongressionalVotingID.shuf.lrn.csv",
        "column_types_path": "./data/congress_voting/congressional-voting.pkl",
    },
    {
        "dataset_path": "./data/breast_cancer/breast-cancer-diagnostic.shuf.lrn.csv",
        "column_types_path": "./data/breast_cancer/breast-cancer_column_types.pkl",
    },
    {
        "dataset_path": "./data/alzheimer/alzheimers_prediction_dataset.csv",
        "column_types_path": "./data/alzheimer/alzheimer_dataset.pkl",
    },
    {
        "dataset_path": "./data/placement/placementdata.csv",
        "column_types_path": "./data/placement/placement_metadata.pkl",
    },
]

# One classifier instance is reused for every dataset.
automl_classifier = AutoMLClassifier()

for dataset in datasets:
    data, feature_structure = load_data(dataset["dataset_path"], dataset["column_types_path"])
    # Hand preprocessing its own copy so the raw frame passed to the
    # Big Daddy run below cannot be altered by it.
    X_train, X_test, y_train, y_test = prepare_data(data.copy(), feature_structure)

    print(f"\n Training on dataset: {dataset['dataset_path']}")

    train_and_evaluate(automl_classifier, X_train, X_test, y_train, y_test)
    # The function is defined as train_and_evaluate_big_daddy; the previous
    # camel-case spelling raised NameError after the AutoML runs finished.
    train_and_evaluate_big_daddy(data, feature_structure)



 Training on dataset: ./data/congress_voting/CongressionalVotingID.shuf.lrn.csv
[*] Training TPOT Classifier
[+] Finished training TPOT Classifier
[*] Best pipeline configuration found by TPOT:
Pipeline(steps=[('extratreesclassifier',
                 ExtraTreesClassifier(criterion='entropy',
                                      max_features=0.35000000000000003,
                                      min_samples_leaf=18, min_samples_split=9,
                                      random_state=42))])
[*] Making predictions with TPOT
[+] TPOT Accuracy: 1.0
[*] Training H2O AutoML...
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,6 mins 05 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,3 months and 22 days
H2O_cluster_name:,H2O_from_python_Dmytro_3r8fig
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,14.57 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
17:23:00.277: AutoML: XGBoost is not available; skipping it.
17:23:00.365: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 174.0.

████████████████████████████████████████████████████