In [None]:
# Enable or disable GPU

# #os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

In [None]:
# import shutil
# import os
# directory = r'HFE_GA_experiments'

# if os.path.exists(directory):
#     shutil.rmtree(directory)
#     print(f"The directory '{directory}' has been successfully removed.")
# else:
#     print(f"The directory '{directory}' does not exist.")


In [None]:
import ipywidgets as ipw
# Output widget that the experiment loop clears at the start of every
# iteration (output.clear_output(wait=True)).
# NOTE(review): nothing visible in this notebook displays the widget or
# writes into it — confirm it is still needed.
output = ipw.Output()

#### Setup logs

In [None]:
import logging
from ml_grid.util.logger_setup import setup_logger

# Basic logger for the cells that run before the experiment cell replaces
# `logger` with one that logs into the run-specific directory.
logger = setup_logger()

# Logger used by matplotlib's font manager.
mpl_logger = logging.getLogger('matplotlib.font_manager')

# Raise its threshold to INFO so its DEBUG-level messages are suppressed.
mpl_logger.setLevel(logging.INFO)

In [None]:
import os

# Function to recursively remove a directory
def remove_directory(path):
    """Recursively delete the directory at ``path`` if it exists.

    The outcome (removed / not present) is reported through the notebook-level
    ``logger``.

    Args:
        path: Path of the directory to delete.

    Raises:
        OSError: If the tree cannot be removed, e.g. ``path`` is a regular
            file or a symbolic link, or a permission error occurs.
    """
    # Function-scope import so the cell needs no additional top-level import.
    import shutil

    if not os.path.exists(path):
        logger.info(f"Directory '{path}' does not exist.")
        return

    # shutil.rmtree unlinks symbolic links found inside the tree instead of
    # calling rmdir on them; the previous manual os.walk/os.rmdir loop raised
    # as soon as the tree contained a link pointing at a directory.
    shutil.rmtree(path)
    logger.info(f"Directory '{path}' removed successfully.")

# Directory to delete before the run (presumably left over from earlier runs).
# NOTE(review): this name is hard-coded, while the experiment cell takes its
# output location from global_params.base_project_dir (config.yml) — confirm
# both refer to the same directory.
directory_path = 'HFE_GA_experiments'

# Delete it; if the directory is absent the function only logs a message.
remove_directory(directory_path)


In [None]:
"""Orchestrates a GA grid search and evaluates ensemble models.

This script manages a complete pipeline for conducting a genetic algorithm (GA)
grid search using the `ml_grid` framework. It automates feature selection,
hyperparameter optimization, and the evaluation of various ensemble machine
learning models for reproducible experiments.

Attributes:
    global_params.base_project_dir (str): The base path under which all
        experiment outputs are saved; loaded from `config.yml`.
    global_params.input_csv_path (str): The file path for the input dataset
        (CSV format).
    global_params.n_iter (int): The total number of grid search iterations
        to perform.
    global_params.model_list (list): A list of model generator functions that
        serve as the base learners for the ensemble methods; passed to the
        pipeline under the `modelFuncList` key.

Workflow:
    1.  **Initialization**:
        - Imports necessary modules, including model generators (e.g., logistic
          regression, random forest, XGBoost), utilities, and logging tools.
        - Establishes a global project directory and creates a unique,
          timestamped subdirectory for the current experiment run.
        - Initializes a logger to capture and save experiment logs.

    2.  **Configuration**:
        - Sets the path to the input dataset CSV file.
        - Initializes a `project_score_save_class` instance to log the
          scores of each experiment iteration to a central CSV file.
        - Defines the list of base learners (`modelFuncList`) and the number
          of grid search iterations (`n_iter`).
        - Instantiates a grid iterator (`grid_iter_obj`) to supply
          hyperparameter combinations for each run.

    3.  **Main Experiment Loop**:
        - Iterates through the specified number of grid search trials.
        - In each iteration, it fetches a new set of hyperparameters.
        - An `ml_grid_object` is created to handle data loading, preprocessing,
          and overall experiment configuration for the current trial.
        - The core genetic algorithm pipeline (`main_ga.run().execute()`) is
          executed to:
            - Evolve ensembles of base learners.
            - Evaluate each ensemble against the dataset.
            - Log performance metrics and configurations to the experiment
              directory.

Key Features:
    - Supports a wide range of base learners for flexible ensemble creation.
    - Fully automates the hyperparameter search and model evaluation process.
    - Organizes all results, logs, and artifacts in a structured,
      timestamped directory to ensure reproducibility.
    - Easily adaptable for both synthetic and real-world datasets.
    - Modular design allows for the simple addition of new models or search
      parameters.

This pipeline is ideal for large-scale, systematic benchmarking of ensemble
methods and GA-based feature selection strategies.
"""

In [None]:
import ml_grid
import pathlib
import datetime
from tqdm import tqdm
from ml_grid.model_classes_ga.dummy_model import DummyModelGenerator
from ml_grid.util import grid_param_space_ga
import pandas as pd

from ml_grid.util.project_score_save import project_score_save_class
from ml_grid.util.global_params import global_parameters

# 1. Load the global parameters from config.yml.
global_params = global_parameters(config_path='config.yml')

# 2. Derive a timestamped directory for this run underneath the base
#    directory from the config, and create it (parents included).
timestamp = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"
run_specific_dir = os.path.join(global_params.base_project_dir, timestamp)
pathlib.Path(run_specific_dir).mkdir(exist_ok=True, parents=True)

# 3. Replace the basic logger with one that logs into the run directory.
logger = setup_logger(log_folder_path=run_specific_dir)
logger.info(f"Experiment outputs will be saved in: {run_specific_dir}")

# 4. Point global_params at the run directory so that everything reading
#    base_project_dir from it afterwards targets the unique folder.
global_params.base_project_dir = run_specific_dir

# 5. Initialise the CSV that stores each local project's results.
project_score_save_class(global_params.base_project_dir)

# 6. Build the GA settings grid and keep a handle on its iterator; the
#    experiment loop pulls one settings entry from it per iteration.
grid = grid_param_space_ga.Grid(
    global_params=global_params,
    test_grid=global_params.testing,
    config_path='config.yml',
)
grid_iter_obj = grid.settings_list_iterator

# Imported once, ahead of the loop, instead of on every iteration.
from ml_grid.pipeline import main_ga

# Sentinel that distinguishes "iterator exhausted" from any real settings value.
_GRID_EXHAUSTED = object()

for i in tqdm(range(global_params.n_iter)):
    output.clear_output(wait=True)

    # Next settings entry from the grid iterator. If the grid yields fewer
    # entries than n_iter, stop cleanly with a warning rather than letting a
    # bare StopIteration escape from the cell.
    local_param_dict = next(grid_iter_obj, _GRID_EXHAUSTED)
    if local_param_dict is _GRID_EXHAUSTED:
        logger.warning(
            f"Settings grid exhausted after {i} of {global_params.n_iter} "
            "iterations; stopping early."
        )
        break

    # Pass the list of model classes to the pipeline
    config_dict = {"modelFuncList": global_params.model_list}

    # Create the pipeline object for this iteration from the settings
    ml_grid_object = ml_grid.pipeline.data.pipe(
        # Path to the input CSV file
        file_name=global_params.input_csv_path,
        global_params=global_params,
        # Terms to drop from the data if found in columns (none here)
        drop_term_list=[],
        # Local parameters for this iteration (e.g. hyperparameters)
        local_param_dict=local_param_dict,
        # Base directory of the project (the run-specific directory)
        base_project_dir=global_params.base_project_dir,
        # Additional naming convention for the output files
        additional_naming='',
        # Number of samples to use for testing
        test_sample_n=global_params.test_sample_n,
        # Number of columns to sample from the data
        column_sample_n=global_params.column_sample_n,
        # Index of the parameter space entry: the loop counter
        param_space_index=i,
        # Configuration settings (base learner list)
        config_dict=config_dict,
        # Whether to use the smaller test grid for GA params
        testing=global_params.testing,
        # Multiprocessing for ensemble methods is switched off
        multiprocessing_ensemble=False,
    )

    ml_grid_object.verbose = 0

    # NOTE(review): dummy_generator is not referenced again in this notebook;
    # kept in case constructing it has side effects — confirm.
    dummy_generator = DummyModelGenerator(ml_grid_object, local_param_dict)

    # Pass the object to be evaluated and write results to csv
    res = main_ga.run(
        ml_grid_object,
        global_params=global_params,
        local_param_dict=local_param_dict,
    ).execute()

In [None]:
#from ml_grid.util.logger_setup import restore_stdout

#restore_stdout()

In [None]:
# Show all columns when a DataFrame is displayed (None lifts the column limit).
pd.set_option("display.max_columns", None)


In [None]:
# Load the score log for this run into `df` and display it.
# NOTE(review): the path is built by plain string concatenation, so it only
# points inside the directory if base_project_dir ends with a path
# separator — confirm against how the log file is written.
score_log_path = ml_grid_object.base_project_dir + "final_grid_score_log.csv"
df = pd.read_csv(score_log_path)

df

In [None]:
# original_feature_names is taken from the ml_grid_object left over from the
# last iteration of the experiment loop.
original_feature_names = ml_grid_object.original_feature_names

In [None]:
from ml_grid.util.GA_results_explorer import GA_results_explorer

In [None]:
# Display the score log DataFrame loaded earlier.
df

In [None]:
# Import the class under an alias so the instance can keep the name
# `GA_results_explorer` (the plotting cells call methods on it) without
# overwriting the class. Binding both to one name meant that re-running this
# cell tried to call the previously created instance.
from ml_grid.util.GA_results_explorer import GA_results_explorer as _GAResultsExplorer

# Re-read the score log so the explorer is built from the file's current contents.
df = pd.read_csv(ml_grid_object.base_project_dir + "final_grid_score_log.csv")

# Explorer over the score log and the feature names from the last run.
GA_results_explorer = _GAResultsExplorer(df, original_feature_names)

GA_results_explorer

In [None]:
# Feature co-occurrence plot with 'auc' as the performance metric.
GA_results_explorer.plot_feature_cooccurrence(
    performance_metric='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Performance-vs-size plot with 'auc' as the performance metric.
GA_results_explorer.plot_performance_vs_size(
    performance_metric='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Algorithm-distribution-in-ensembles plot.
GA_results_explorer.plot_algorithm_distribution_in_ensembles(
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Feature stability plot: metric 'auc', top_percent 10.0, feature type
# 'base_learner'.
GA_results_explorer.plot_feature_stability(
    performance_metric='auc',
    top_percent=10.0,
    feature_type='base_learner',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Convergence plot over the 'generation_progress_list' history column,
# metric 'auc', with highlight_best switched on.
GA_results_explorer.plot_all_convergence(
    history_column='generation_progress_list',
    performance_metric='auc',
    highlight_best=True,
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Interaction heatmap of 'pop_val' against 'run_time', metric 'auc'.
GA_results_explorer.plot_interaction_heatmap(
    param1='pop_val',
    param2='run_time',
    performance_metric='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Performance trade-off plot: 'auc' against the 'run_time' cost metric,
# with 'pop_val' as the hue parameter.
GA_results_explorer.plot_performance_tradeoff(
    performance_metric='auc',
    cost_metric='run_time',
    hue_parameter='pop_val',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Ensemble feature diversity plot with 'auc' as the outcome variable.
GA_results_explorer.plot_ensemble_feature_diversity(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Parameter distributions for the 'run_details' parameter type.
GA_results_explorer.plot_parameter_distributions(
    param_type='run_details',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Parameter distributions for the 'config' parameter type.
GA_results_explorer.plot_parameter_distributions(
    param_type='config',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Base learner feature importance plot with 'auc' as the outcome variable.
GA_results_explorer.plot_base_learner_feature_importance(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Initial feature importance plot with 'auc' as the outcome variable.
GA_results_explorer.plot_initial_feature_importance(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Combined ANOVA feature importances plot with 'auc' as the outcome variable.
GA_results_explorer.plot_combined_anova_feature_importances(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Run-details ANOVA feature importances plot with 'auc' as the outcome variable.
GA_results_explorer.plot_run_details_anova_feature_importances(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Config ANOVA feature importances plot with 'auc' as the outcome variable.
GA_results_explorer.plot_config_anova_feature_importances(
    outcome_variable='auc',
    plot_dir=ml_grid_object.base_project_dir,
)

In [None]:
# Sanity check: the score log loaded into `df` must hold at least one row.
assert len(df) > 0, "final_grid_score_log.csv was loaded but contains no rows"

In [None]:
# Evaluate the ensembles recorded in the score log on the test set and on the
# hold-out validation set, for each weighting method listed below. Failures
# are logged (with traceback) rather than raised out of the cell.
if __name__ == "__main__":
    from ml_grid.util.evaluate_ensemble_methods import EnsembleEvaluator
    import pandas as pd

    # Set your paths and parameters
    input_csv_path = global_params.input_csv_path  # Path to your input data CSV
    results_csv_path = ml_grid_object.base_project_dir + "final_grid_score_log.csv"  # Path to your results DataFrame (CSV or PKL)
    outcome_variable = "outcome_var_1"
    initial_param_dict = {"resample": None}

    try:
        evaluator = EnsembleEvaluator(
            input_csv_path=input_csv_path,
            outcome_variable=outcome_variable,
            initial_param_dict=initial_param_dict,
            debug=False
        )

        weighting_methods_to_test = ["unweighted", "de", "ann"]

        # Load the results DataFrame; the reader is chosen by file extension.
        try:
            if results_csv_path.endswith(".csv"):
                results_df = pd.read_csv(results_csv_path)
            elif results_csv_path.endswith(".pkl"):
                # Unpickling can execute arbitrary code: only load trusted files.
                results_df = pd.read_pickle(results_csv_path)
            else:
                # Without this branch results_df stayed unbound and the failure
                # only surfaced later as a NameError.
                raise ValueError(
                    "Unsupported results file extension (expected .csv or .pkl): "
                    f"{results_csv_path}"
                )
        except Exception as e:
            logger.error(f"Could not load results DataFrame: {e}")
            raise

        test_results_df = evaluator.evaluate_on_test_set_from_df(
            results_df, weighting_methods_to_test
        )
        logger.info("\n--- Results on TEST SET ---")
        if not test_results_df.empty:
            display(test_results_df)

        validation_results_df = evaluator.validate_on_holdout_set_from_df(
            results_df, weighting_methods_to_test
        )
        logger.info("\n--- Results on VALIDATION (HOLD-OUT) SET ---")
        if not validation_results_df.empty:
            display(validation_results_df)

    except (FileNotFoundError, ImportError) as e:
        # exc_info=True keeps the traceback in the log; the cell still does not raise.
        logger.critical(f"\nExecution stopped due to a critical error: {e}", exc_info=True)
    except Exception as e:
        logger.error(
            f"\nAn unexpected error occurred during the evaluation process: {e}",
            exc_info=True,
        )

In [None]:
# Re-read the score log from disk and display it; the result is not assigned.
pd.read_csv(ml_grid_object.base_project_dir + "final_grid_score_log.csv")

In [None]:
# Display `df` as held in memory (not re-read by the previous cell).
df

In [None]:
# Print pandas' summary of `df` (DataFrame.info()).
df.info()