In [1]:
## configuration

# Imported here because this configuration cell runs before the notebook's main import cell.
from sklearn.tree import DecisionTreeClassifier

# Search-space description for the decision-tree classifier.
# For each hyperparameter, the min and max of 'percentage_splits' are used as the
# sampling bounds by the Optuna objective. A 'dependency' entry names the dataset
# property the sampled fraction is multiplied by (and truncated to an int >= 1)
# before the value is handed to the model.
DECISION_TREE_CL_CONFIG = {"name": "DecisionTreeClassifier",
                           "model": DecisionTreeClassifier(random_state=42),
                           'metric': 'roc_auc',
                           'param_config':{
                               'max_depth': {'percentage_splits': [0.25, 0.50, 0.70, 0.8, 0.9, 0.999], 'param_type':"int", 'dependency':'n_samples'},
                               'min_samples_split': {'percentage_splits': [0.005, 0.01, 0.02, 0.05, 0.10], 'param_type':"float"},
                               'min_samples_leaf': {'percentage_splits': [0.005, 0.01, 0.02, 0.05, 0.10], 'param_type':"float"},
                               'max_features': {'percentage_splits': [0.50, 0.70, 0.8, 0.9, 0.99], 'param_type':"float"}
                           }
                          }


# Active configuration used by the rest of the notebook.
# (Previously pointed at DECISION_TREE_CL_CONFIG_4PARAM, which is not defined anywhere
# in this notebook and raised NameError.)
ml_config = DECISION_TREE_CL_CONFIG
print(ml_config)

In [2]:

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
import openml
import random
import joblib
import os

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Experiment identifier; every artefact of this run is written below ./output/<exp_id>/.
exp_id = '20240812' #'20240323' #'20240216'
# The trailing slash matters: later cells build file names as f"{output_root}<file name>".
output_root = f'./output/{exp_id}/'
# Create the output directory up front; an already existing directory is not an error.
os.makedirs(output_root, exist_ok=True)


# core knowledge base functions
def fetch_open_ml_data(dataset_id):
    """Fetch an OpenML dataset and return it as one DataFrame with a 'target' column.

    :param dataset_id. identifier passed to ``openml.datasets.get_dataset``
    :return tuple ``(df, 'target', dataset_name)``: the features plus the default
            target attribute stored in the column named 'target', the name of that
            column, and the dataset name reported by OpenML
    """
    oml_dataset = openml.datasets.get_dataset(dataset_id)
    print(f'Dataset name: {oml_dataset.name}')

    # The categorical indicator is returned by OpenML but not needed here.
    features, labels, _categorical_mask, column_names = oml_dataset.get_data(
        target=oml_dataset.default_target_attribute,
        dataset_format="array",
    )

    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels
    return frame, 'target', oml_dataset.name

def prepare_data(df, target_name):
    """Split a dataset into features and target and fill missing feature values.

    :param df. pandas dataframe containing dataset
    :param target_name. the name of the target variable column
    :return tuple ``(X, y)``: X is ``df`` without the target column and with missing
            values replaced by 0, y is the (unmodified) target column. ``df`` itself
            is left untouched.
    """
    target = df[target_name]
    features = df.drop(columns=target_name).fillna(0)
    return features, target

def _calculate_dataset_size(X):
    return {"n_samples": X.shape[0], "n_features": X.shape[1]}

def _calculate_class_imbalance_ratio(y):
    """
    Calculate the class imbalance ratio of a dataset.

    Parameters:
    y: array-like, shape (n_samples,)
       Target values (class labels).

    Returns:
    float: The ratio of the majority class size to the minority class size.
    """
    # Count the occurrences of each class
    class_counts = np.bincount(y)

    # Find the counts of majority and minority classes
    majority_class_count = np.max(class_counts)
    minority_class_count = np.min(class_counts)

    # Calculate the imbalance ratio
    imbalance_ratio = majority_class_count / minority_class_count

    return {"imbalance_ratio": imbalance_ratio}

def _calculate_correlation_metrics(X, y, correlation_cutoff=0.1):
    """
    Calculates and returns correlation metrics between features in X and the target variable y,
    filtering for features that have a correlation above a specified cutoff.

    Parameters:
    - X (array-like, DataFrame): The input features, where rows represent samples and columns represent features.
    - y (array-like, Series): The target variable for which correlations with features in X are computed.
    - correlation_cutoff (float, optional): The minimum absolute correlation value for a feature to be considered
      informative with respect to the target variable. Defaults to 0.1.

    Returns:
    - dict: A dictionary containing the following key-value pairs:
        - 'n_highly_target_corr': The number of features that have an absolute correlation with the target
          variable greater than the specified cutoff.
        - 'avg_target_corr': The average absolute correlation of all features with the target variable.
        - 'var_target_corr': The variance of the absolute correlations of all features with the target variable.

    Note:
    - The function uses Pearson correlation by default but can be adjusted to use 'spearman' or 'kendall'
      for non-linear relationships by modifying the `corr` method call.
    """
    df = pd.DataFrame(X.copy())
    df['target'] = pd.Series(y.copy())
    correlation_matrix = df.corr(method='pearson')  # Use 'spearman' or 'kendall' for non-linear relationships
    correlations_with_target = abs(correlation_matrix['target'])

    informative_features = correlations_with_target[correlations_with_target > correlation_cutoff].sort_values(ascending=False)
    n_informative = len(informative_features) - 1

    return {'n_highly_target_corr': n_informative  # Number of highly target correlated features
            , 'avg_target_corr' : correlations_with_target.mean() # Avg target correlation
            , 'var_target_corr' : correlations_with_target.var() # Variance of target correlation
           }

def _calculate_feature_moments_and_variances(X):
    """
    Calculate the first four statistical moments (mean, variance, skewness, kurtosis), their averages,
    and variances for each feature in the dataset.

    Parameters:
    X (DataFrame): DataFrame containing the feature set.

    Returns:
    DataFrame: A DataFrame with the first four moments, their averages, and variances for each feature.
    """
    # Converting to DataFrame if not already (for compatibility)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Calculate first moment (mean) for each col
    moment_1 = X.apply(lambda x: x.mean(), axis=0)

    # Calculate second moment (variance) for each col
    moment_2 = X.apply(lambda x: x.var(), axis=0)

    # Calculate third moment (skewness) for each col
    moment_3 = X.apply(lambda x: skew(x.dropna()), axis=0)

    # Calculate fourth moment (kurtosis) for each col
    moment_4 = X.apply(lambda x: kurtosis(x.dropna()), axis=0)

    # Calculate and add the averages and variances of all moments
    moments = {'avg_feature_m1': moment_1.mean()  # Average Mean
              , 'var_feature_m1': moment_1.var()   # Variance of Mean
              , 'avg_feature_m2': moment_2.mean()  # Average Variance
              , 'var_feature_m2': moment_2.var()   # Variance of Variance
              , 'avg_feature_m3': moment_3.mean()  # Average Skewness
              , 'var_feature_m3': moment_3.var()   # Variance of Skewness
              , 'avg_feature_m4': moment_4.mean()  # Average Kurtosis
              , 'var_feature_m4': moment_4.var()   # Variance of Kurtosis
              }

    return moments

def _calculate_row_moments_and_variances(X):
    """
    Calculate the first four statistical moments (mean, variance, skewness, kurtosis), their averages,
    and variances for each row in the dataset.

    Parameters:
    X (DataFrame): DataFrame containing the feature set.

    Returns:
    DataFrame: A DataFrame with the first four moments, their averages, and variances for each row.
    """
    # Converting to DataFrame if not already (for compatibility)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Calculate first moment (mean) for each row
    moment_1 = X.apply(lambda x: x.mean(), axis=1)

    # Calculate second moment (variance) for each row
    moment_2 = X.apply(lambda x: x.var(), axis=1)

    # Calculate third moment (skewness) for each row
    moment_3 = X.apply(lambda x: skew(x.dropna()), axis=1)

    # Calculate fourth moment (kurtosis) for each row
    moment_4 = X.apply(lambda x: kurtosis(x.dropna()), axis=1)

    # Calculate and add the averages and variances of all moments
    moments = {'avg_row_m1': moment_1.mean()  # Average Mean
              , 'var_row_m1': moment_1.var()   # Variance of Mean
              , 'avg_row_m2': moment_2.mean()  # Average Variance
              , 'var_row_m2': moment_2.var()   # Variance of Variance
              , 'avg_row_m3': moment_3.mean()  # Average Skewness
              , 'var_row_m3': moment_3.var()   # Variance of Skewness
              , 'avg_row_m4': moment_4.mean()  # Average Kurtosis
              , 'var_row_m4': moment_4.var()   # Variance of Kurtosis
              }

    return moments

def _calculate_skewness_kurtosis_stats(X):
    """
    Calculate the skewness and kurtosis for each numerical feature in the dataset.

    Parameters:
    X (DataFrame): DataFrame containing the feature set.

    Returns:
    DataFrame: A DataFrame with skewness and kurtosis for each feature.
    """
    # Converting to DataFrame if not already (for compatibility)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Calculate skewness and kurtosis for each feature
    skewness = X.apply(lambda x: skew(x.dropna()), axis=0)
    kurtosis_values = X.apply(lambda x: kurtosis(x.dropna()), axis=0)

    # Calculate average and standard deviation of skewness and kurtosis
    skewness_kurtosis_stats = {
        'average_skewness': skewness.mean(),
        'std_dev_skewness': skewness.std(),
        'average_kurtosis': kurtosis_values.mean(),
        'std_dev_kurtosis': kurtosis_values.std()
    }

    return skewness_kurtosis_stats


def calculate_dataset_meta_parameters(X, y):
    """Collect all dataset meta-features of (X, y) into one flat dict.

    The result merges, in this order, the dataset size, the class imbalance ratio,
    the target-correlation metrics (cutoff 0.10), the per-feature moment summary and
    the per-row moment summary.
    """
    return {
        **_calculate_dataset_size(X),
        **_calculate_class_imbalance_ratio(y),
        **_calculate_correlation_metrics(X, y, correlation_cutoff=0.10),
        **_calculate_feature_moments_and_variances(X),
        **_calculate_row_moments_and_variances(X),  # experimental
    }


def _relative2absolute_dict(param_config, dataset_properties, param_dict):
    # Create a copy of the param_dict to avoid modifying the original
    absolute_param_dict = param_dict.copy()

    params_with_dependency = [param for param, details in param_config.items() if 'dependency' in details]
    for p in params_with_dependency:
        dependency_col = param_config[p]['dependency']
        dependency_value = dataset_properties[dependency_col]
        absolute_param_dict[p] = max(int(dependency_value * absolute_param_dict[p]), 1)

    return absolute_param_dict


ModuleNotFoundError: No module named 'openml'

In [None]:
# HPO training functions

import optuna
from optuna.trial import Trial


# Model names evaluate_model knows how to instantiate.
_SUPPORTED_MODEL_NAMES = ("DecisionTreeClassifier", "RandomForestClassifier", "XGBClassifier")


def _build_classifier(model_name, seed, hyperparams):
    """Instantiate the classifier registered under ``model_name`` with the given seed."""
    if model_name == "DecisionTreeClassifier":
        return DecisionTreeClassifier(random_state=seed, **hyperparams)
    if model_name == "RandomForestClassifier":
        # Imported lazily: the notebook's import cell only brings in RandomForestRegressor.
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(random_state=seed, **hyperparams)
    if model_name == "XGBClassifier":
        # NOTE(review): XGBClassifier is not imported anywhere in the visible notebook;
        # it must be imported before this branch can be used.
        return XGBClassifier(random_state=seed, **hyperparams)
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {_SUPPORTED_MODEL_NAMES}"
    )


def evaluate_model(X, y, model_name, hyperparams, random_seed=42, n_folds=3, n_seeds=20):
    """Score a hyperparameter setting by repeated stratified cross-validation.

    For each of ``n_seeds`` seeds (``random_seed``, ``random_seed + 1``, ...) a fresh
    model is built and evaluated with a shuffled ``n_folds``-fold StratifiedKFold using
    ROC AUC; the fold scores are averaged per seed.

    Parameters:
    X, y: features and class labels passed to ``cross_val_score``.
    model_name (str): one of "DecisionTreeClassifier", "RandomForestClassifier", "XGBClassifier".
    hyperparams (dict): keyword arguments for the model constructor.
    random_seed (int): first seed; used for both the model and the CV shuffling.
    n_folds (int): number of CV folds.
    n_seeds (int): number of seeds to average over.

    Returns:
    tuple: ``(final_score, seed_scores)`` - the mean over seeds and the list of per-seed means.

    Raises:
    ValueError: if ``model_name`` is not supported. (Previously an unknown name left
        ``model`` unassigned and failed later with an UnboundLocalError.)
    """
    # Fail fast, before any expensive work, on names no branch would handle
    if model_name not in _SUPPORTED_MODEL_NAMES:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {_SUPPORTED_MODEL_NAMES}"
        )

    seed_scores = []
    for offset in range(n_seeds):
        seed = random_seed + offset
        model = _build_classifier(model_name, seed, hyperparams)

        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
        seed_scores.append(np.mean(scores))

    final_score = np.mean(seed_scores)
    return final_score, seed_scores

def _optuna_objective(trial: Trial, X, y, param_config, meta_params, dataset_meta_params, random_seed=42):
    """Optuna objective: sample relative hyperparameters, convert them and score the model.

    Parameters:
    trial (Trial): the Optuna trial used to sample values.
    X, y: dataset passed on to ``evaluate_model``.
    param_config (dict): per-parameter config; every entry with 'percentage_splits' is
        sampled uniformly between the smallest and largest listed split.
    meta_params: not used by this function; kept for signature compatibility.
    dataset_meta_params (dict): dataset properties used to resolve 'dependency' entries.
    random_seed (int): first seed handed to ``evaluate_model``.

    Returns:
    float: mean cross-validated score of the sampled configuration.
    """
    # Generate hyperparameters based on the trial
    hyperparams = {}
    for param, config in param_config.items():
        if 'percentage_splits' in config:
            splits = config['percentage_splits']
            # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
            hyperparams[param] = trial.suggest_float(param, min(splits), max(splits))
        # Add other parameter types (e.g., suggest_int, log-scaled floats) as needed

    predicted_hyperparams = _relative2absolute_dict(param_config, dataset_meta_params, hyperparams)

    # "DecisionTreeClassifier" is the name evaluate_model dispatches on; the former
    # "DecisionTreeClassifier4Param" matched none of its branches.
    score, _ = evaluate_model(X, y, "DecisionTreeClassifier", predicted_hyperparams, random_seed)
    return score

def optuna_perf_wrapper(X, y, meta_params, zerotune_warmstart=None, random_seed=42, n_trials=100):
    """Run an Optuna (TPE) study for the configured model on one dataset.

    Parameters:
    X, y: dataset to optimise on.
    meta_params (list): names of the dataset meta-features to extract for this run.
    zerotune_warmstart (dict, optional): hyperparameters enqueued as the first trial.
        Presumably expressed in the same relative space and under the same names the
        objective samples - confirm with the caller.
    random_seed (int): seed for the sampler and for all model evaluations.
    n_trials (int): number of Optuna trials.

    Returns:
    dict: 'best_hyperparams' (absolute values), 'best_perf' (mean score over 20 seeds),
          'n_seed_scores' (per-seed scores) and 'df_trials' (the study's trials DataFrame).
    """
    param_config = ml_config["param_config"]

    all_dataset_meta_params = calculate_dataset_meta_parameters(X, y)
    # Properties needed to resolve 'dependency' entries of the search space (e.g. n_samples) ...
    dependency_keys = {details['dependency'] for details in param_config.values() if 'dependency' in details}
    dataset_meta_params_inc_dependencies = {key: all_dataset_meta_params[key] for key in dependency_keys}
    # ... plus the meta-features requested by the caller
    dataset_meta_params_inc_dependencies.update({key: all_dataset_meta_params[key] for key in meta_params})

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=random_seed))

    if zerotune_warmstart:
        # Previously referenced an undefined name instead of the warm-start argument
        print("ZeroTune warm start:", zerotune_warmstart)
        # Enqueue the ZeroTune hyperparameters trial
        study.enqueue_trial(zerotune_warmstart)

    study.optimize(
        lambda trial: _optuna_objective(
            trial, X, y, param_config, meta_params,
            dataset_meta_params_inc_dependencies, random_seed=random_seed,
        ),
        n_trials=n_trials,
    )

    # Convert the best relative values with the same properties the objective used
    best_hyperparams = _relative2absolute_dict(
        param_config, dataset_meta_params_inc_dependencies, study.best_params
    )

    best_perf, n_seed_scores = evaluate_model(
        X, y, "DecisionTreeClassifier", best_hyperparams, random_seed=random_seed, n_seeds=20
    )

    return {"best_hyperparams":best_hyperparams, "best_perf": best_perf,"n_seed_scores": n_seed_scores
            , "df_trials":study.trials_dataframe() }

In [3]:
## Train ZeroTune predictor

def train_zeroshot_hpo(df, dataset_features, targets, condition_column=None, n_iter=100):
    """
    Trains a zero-shot hyperparameter predictor: a multi-output Random Forest regressor tuned
    with randomized search, with the option to use groups for cross-validation.

    Args:
    df (pandas.DataFrame): The DataFrame containing the dataset.
    dataset_features (list): A list of column names in df to be used as model features.
    targets (list): A list of target column names in df.
    condition_column (str, optional): Column name to be used for defining groups in cross-validation
        (GroupKFold with 4 splits). Defaults to None, which uses plain 4-fold cross-validation.
    n_iter (int, optional): Number of random search iterations. Defaults to 100.

    Returns:
    RandomForestRegressor: the best configuration found, fitted on all rows of df.
    float: The best cross-validation score (negative mean squared error, so closer to 0 is better).
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV, GroupKFold
    from scipy.stats import randint as sp_randint

    X = df[dataset_features]
    y = df[targets]

    param_dist = {
        'n_estimators': sp_randint(100, 300),
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': sp_randint(2, 11),
        'min_samples_leaf': sp_randint(1, 5),
        # 1.0 (= all features) is what 'auto' meant for regressors; the string 'auto'
        # has been removed from scikit-learn and is rejected by current versions.
        'max_features': [1.0, 'sqrt'],
        'bootstrap': [True, False]
    }

    # Initialize the regressor
    regressor = RandomForestRegressor(random_state=42)

    # Choose the cross-validation strategy
    fit_kwargs = {}
    if condition_column is None:
        cv_strategy = 4
    else:
        cv_strategy = GroupKFold(n_splits=4)
        # Groups keep all rows of one condition (e.g. one dataset) in the same fold
        fit_kwargs['groups'] = df[condition_column]

    # Set up RandomizedSearchCV with the chosen cross-validation strategy
    hpo_search = RandomizedSearchCV(regressor, param_distributions=param_dist, cv=cv_strategy,
                                    scoring='neg_mean_squared_error', n_jobs=4, n_iter=n_iter, random_state=42)

    # Fit the search (groups are only passed when GroupKFold is used)
    hpo_search.fit(X, y, **fit_kwargs)

    best_score = hpo_search.best_score_

    print(f'Zero-shot predictor mse: {best_score}')

    # The search refits the best configuration on the entire dataset by default
    # (refit=True, same random_state=42), so a second manual fit is not needed.
    best_regressor = hpo_search.best_estimator_

    return best_regressor, best_score

def predict_hyperparameters(model, X, target_columns):
    """Predict hyperparameters for X and return the first prediction as a dict.

    Only the first row of ``model.predict(X)`` is used; its values are paired, in
    order, with the names in ``target_columns``.
    """
    first_prediction = model.predict(X)[0]
    return dict(zip(target_columns, first_prediction))

def remove_param_prefix(param_dict, prefix='params_'):
    """Return a copy of ``param_dict`` with a leading ``prefix`` removed from its keys.

    Only a prefix at the very start of a key is removed; keys that do not start with
    it are kept as they are. (The previous ``str.replace`` removed every occurrence,
    so e.g. 'params_n_params_total' became 'n_total'.)

    :param param_dict. mapping of (possibly prefixed) parameter names to values
    :param prefix. the key prefix to strip, defaults to 'params_'
    :return a new dict; the input mapping is not modified
    """
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in param_dict.items()
    }


In [3]:
## Build a HPO knowledge base using binary classification datasets

verbose = False

dataset_id_list = [
    31, 38, 44, 52, 151, 179, 298, 846, 917,
    1049, 1053, 1111, 1112, 1120, 1128, 1220, 1464,
    1494, 1510, 1558, 4534, 23381, 40536, 40900, 45038
]

# Per-dataset feature files go to the checkpoint directory, using the same directory
# and file naming as the knowledge-base cell further down. The path used to be read
# from `dataset_features_file`, which is not defined before this cell runs.
checkpoint_dir = os.path.join(output_root, 'checkpoint')
os.makedirs(checkpoint_dir, exist_ok=True)

dataset_features_list = []
for dataset_id in tqdm(dataset_id_list, desc="Processing datasets"):
    # Fetch and prepare the dataset
    dataset_df, target_name, dataset_name = fetch_open_ml_data(dataset_id)
    X, y = prepare_data(dataset_df, target_name)

    # Step 1: Generate dataset meta parameters
    dataset_parameters = calculate_dataset_meta_parameters(X, y)
    if verbose:
        print(f'\nDataset id {dataset_id}, Dataset properties: {dataset_parameters}')
    dataset_parameters['Dataset'] = dataset_id
    dataset_parameters['DatasetName'] = dataset_name
    dataset_features_list.append(dataset_parameters)

    # Save dataset parameters to file
    dataset_features_file = os.path.join(checkpoint_dir, f'dataset_{dataset_id}_features.pkl')
    pd.to_pickle(dataset_parameters, dataset_features_file)

# One row per dataset
df_dataset_features_realworld = pd.DataFrame(dataset_features_list)

df_dataset_features_realworld.to_pickle(os.path.join(output_root, 'dataset_features_all.pkl'))
df_dataset_features_realworld.to_csv(f"{output_root}realworld_dataset_features.csv", index=False)

In [4]:
## Build knowledge base


# Number of independently seeded Optuna runs per dataset, and trials per run.
N_SEEDS = 50
N_TRIALS = 25

# OpenML dataset ids that make up the knowledge base.
dataset_id_list = [
    31, 38, 44, 52, 151, 179, 298, 846, 917,
    1049, 1053, 1111, 1112, 1120, 1128, 1220, 1464,
    1494, 1510, 1558, 4534, 23381, 40536, 40900, 45038
]

verbose = False
# All intermediate results are pickled here so an interrupted run can be resumed.
checkpoint_dir = os.path.join(output_root, 'checkpoint')
os.makedirs(checkpoint_dir, exist_ok=True)

# Initialize lists to store the results
dataset_features_list = []
optuna_trials_df_list = []

for dataset_id in tqdm(dataset_id_list, desc="Processing datasets"):
    # Define file paths for checkpointing in the checkpoint directory
    dataset_features_file = os.path.join(checkpoint_dir, f'dataset_{dataset_id}_features.pkl')
    optuna_trials_file = os.path.join(checkpoint_dir, f'dataset_{dataset_id}_optuna_trials.pkl')

    # Check if results for this dataset already exist
    # (both files are required; a features file alone does not skip the dataset)
    if os.path.exists(dataset_features_file) and os.path.exists(optuna_trials_file):
        print(f"Dataset {dataset_id} already processed. Loading results.")
        # Load existing results
        dataset_parameters = pd.read_pickle(dataset_features_file)
        dataset_features_list.append(dataset_parameters)
        optuna_trials = pd.read_pickle(optuna_trials_file)
        optuna_trials_df_list.append(optuna_trials)
        continue

    # Fetch and prepare the dataset
    dataset_df, target_name, dataset_name = fetch_open_ml_data(dataset_id)
    X, y = prepare_data(dataset_df, target_name)

    # Step 1: Generate dataset meta parameters
    dataset_parameters = calculate_dataset_meta_parameters(X, y)
    if verbose:
        print(f'\nDataset id {dataset_id}, Dataset properties: {dataset_parameters}')
    # The identifier columns are added to the same dict, so they are pickled with the features.
    dataset_parameters['Dataset'] = dataset_id
    dataset_parameters['DatasetName'] = dataset_name
    dataset_features_list.append(dataset_parameters)

    # Save dataset parameters to file
    pd.to_pickle(dataset_parameters, dataset_features_file)

    # Step 2: Perform Optuna HPO - only 'n_samples', 'n_features' are really required for fractional HP representation
    dataset_features = ['n_samples', 'n_features', 'n_highly_target_corr']

    # Process seeds individually with checkpointing
    per_seed_results = []
    for seed in range(N_SEEDS):
        seed_trials_file = os.path.join(checkpoint_dir, f'dataset_{dataset_id}_seed_{seed}_optuna_trials.pkl')
        if os.path.exists(seed_trials_file):
            print(f"Dataset {dataset_id}, Seed {seed} already processed. Loading results.")
            seed_trials = pd.read_pickle(seed_trials_file)
        else:
            # Run Optuna HPO for the current seed
            # NOTE(review): optuna_perf_wrapper_inc_seeds is not defined in the visible part of
            # this notebook (only optuna_perf_wrapper is) - confirm where it comes from.
            # It is expected to return a dict with a "combined_trials_df" DataFrame.
            optuna_output = optuna_perf_wrapper_inc_seeds(
                X, y, dataset_features,
                n_trials=N_TRIALS, n_seeds=1, seed=seed
            )
            seed_trials = optuna_output["combined_trials_df"]
            # Tag every trial with its dataset and seed so results can be merged later
            seed_trials['Dataset'] = dataset_id
            seed_trials['Seed'] = seed
            # Save individual seed results
            pd.to_pickle(seed_trials, seed_trials_file)
        per_seed_results.append(seed_trials)

    # Combine seed results for the current dataset
    optuna_trials = pd.concat(per_seed_results, ignore_index=True)
    optuna_trials_df_list.append(optuna_trials)
    # Save combined Optuna trials for the dataset
    pd.to_pickle(optuna_trials, optuna_trials_file)

# Combine all dataset features and Optuna trials
df_dataset_features_realworld = pd.DataFrame(dataset_features_list)
df_optuna_trials_all_realworld = pd.concat(optuna_trials_df_list, ignore_index=True)

# Save the combined dataset features and Optuna trials
os.makedirs(output_root, exist_ok=True)
df_dataset_features_realworld.to_pickle(os.path.join(output_root, 'dataset_features_all.pkl'))
df_optuna_trials_all_realworld.to_pickle(os.path.join(output_root, 'optuna_trials_all.pkl'))

# Save csv results
df_dataset_features_realworld.to_csv(f"{output_root}realworld_dataset_features.csv", index=False)
df_optuna_trials_all_realworld.to_csv(f"{output_root}realworld_kb_optuna_trials_all.csv", index=False)

In [4]:
## Train the ZeroTune model, using recursive feature elimination to automatically select the best meta parameters.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

# Define target hyperparameters and dataset features
targets = ['params_max_depth', 'params_max_features', 'params_min_samples_leaf', 'params_min_samples_split']

# Use the frames built by the knowledge-base cell. The names df_dataset_features and
# df_kb_optuna_trials were referenced here without being defined in this notebook.
df_dataset_features = df_dataset_features_realworld
# Work on a copy so adding the 'rank' column does not modify the knowledge-base frame
df_kb_optuna_trials = df_optuna_trials_all_realworld.copy()

# Candidate model features: every meta-feature column except the identifier columns
# ('DatasetName' is a string and cannot be fed to the regressor).
dataset_features = df_dataset_features.columns.difference(['Dataset', 'DatasetName'])

training_dataset_list = [
31, 38, 44, 52, 151, 179, 298, 846, 917,
1049, 1053, 1111, 1112, 1120, 1128, 1220, 1464,
1494, 1510, 1558, 4534, 23381, 40536, 40900, 45038
]

print(training_dataset_list)

# Dense rank of each trial within its dataset (1 = highest objective value)
df_kb_optuna_trials['rank'] = df_kb_optuna_trials.groupby('Dataset')['value'].rank(method='dense', ascending=False)
# Attach the dataset meta-features to every trial of that dataset
df = pd.merge(df_kb_optuna_trials, df_dataset_features_realworld, on='Dataset', how='inner')
train_df = df[df['Dataset'].isin(training_dataset_list)]

print(f'Number of unique datasets used for model training: {len(train_df.Dataset.unique())}')
print(f'Number of data points used for model training: {len(train_df)}')

# Feature matrix (X) and target matrix (y)
X = train_df[dataset_features]
y = train_df[targets]

# Function to perform RFE with hyperparameter tuning
def select_best_features_with_tuning(X, y):
    """Tune a RandomForestRegressor, then select features with cross-validated RFE.

    Steps:
    1. Grid-search the forest's hyperparameters (3-fold CV, negative MSE).
    2. Run RFECV (one feature removed per step, 5-fold KFold, negative MSE) with the
       tuned model as estimator.

    Parameters:
    X (DataFrame): candidate feature columns.
    y (DataFrame): target columns.

    Returns:
    tuple: ``(best_features, mean_test_scores)`` - the column labels RFECV kept and
           RFECV's mean cross-validation test scores (``cv_results_['mean_test_score']``).
    """
    # Hyperparameter tuning for RandomForestRegressor
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                               cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # Best model from the grid search
    best_model = grid_search.best_estimator_

    # RFECV with the tuned model using KFold cross-validation
    rfecv = RFECV(estimator=best_model, step=1, cv=KFold(n_splits=5), scoring='neg_mean_squared_error')

    # Fit the selector directly. The former Pipeline additionally trained a final
    # regressor on the selected features whose result was never used.
    rfecv.fit(X, y)

    best_features = X.columns[rfecv.support_]
    return best_features, rfecv.cv_results_['mean_test_score']

# Select the best features
best_features, scores = select_best_features_with_tuning(X, y)

print('Best features for the multi-output regression model:')
print(best_features)

print('Cross-validation scores:')
print(scores)


# Train the final predictor on the selected features; trials are grouped by dataset
# for cross-validation.
targets = ['params_max_depth', 'params_max_features', 'params_min_samples_leaf', 'params_min_samples_split']
model, error_score = train_zeroshot_hpo(train_df, best_features, targets, condition_column='Dataset')

# Save ZeroTune model
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.join(output_root, 'pretrained_models'), exist_ok=True)
# NOTE(review): KB_TYPE is not defined anywhere in the visible notebook - it has to be
# set (e.g. in the configuration cell) before this line can run.
joblib.dump(model, f"{output_root}pretrained_models/ZeroTune_{ml_config['name']}_{KB_TYPE}_kb.joblib")