In [None]:
import pandas as pd
import logging
from IPython.display import display

# --- Make sure your notebook's working directory is the project root ---
# (e.g., /home/samorah/_data/ml_binary_classification_gridsearch_hyperOpt/)
# so that the ml_grid imports below work correctly.
from pathlib import Path
from ml_grid.util.synthetic_data_generator import generate_synthetic_data
from ml_grid.util.impute_data_for_pipe import save_missing_percentage, mean_impute_dataframe

# --- 1. Setup Logging ---
# INFO-level logging surfaces the progress messages emitted by the generator
# and by the later steps.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# --- 2. Generate Synthetic Data and the Ground-Truth Map ---
logging.info("Generating a sample synthetic dataset using the importable function...")
_generator_settings = dict(
    n_rows=200,
    n_features=30,
    n_outcome_vars=3,
    feature_strength=0.7,
    percent_important_features=0.2,
    verbose=True,
)
synthetic_df, important_feature_map = generate_synthetic_data(**_generator_settings)

print("\n--- Generated DataFrame Info (Before Imputation) ---")
print(f"Shape: {synthetic_df.shape}")
print(f"Total NaNs present: {synthetic_df.isnull().sum().sum()}")
print("Sample of data with missing values:")
display(synthetic_df.head())
print("----------------------------------------------------")


# --- 3. Calculate and Save the Percentage of Missing Values ---
missing_pickle_filename = "percent_missing_synthetic_data_generated.pkl"
print(f"\nCalculating missing value percentages and saving to '{missing_pickle_filename}'...")
save_missing_percentage(synthetic_df, output_file=missing_pickle_filename)
print("✅ Missing value pickle file saved.")


# --- 4. Perform Mean Imputation ---
print("\nPerforming mean imputation on the dataset...")
# The keys of the ground-truth map are the outcome columns; they are passed as
# y_vars so they are excluded from imputation.
outcome_columns = [*important_feature_map.keys()]
# Impute a copy so the raw synthetic frame stays available for inspection.
_imputation_input = synthetic_df.copy()
imputed_df = mean_impute_dataframe(data=_imputation_input, y_vars=outcome_columns)
print(f"Imputation complete. NaNs present after imputation: {imputed_df.isnull().sum().sum()}")
print("✅ Mean imputation successful.")


# --- 5. Save the Imputed Data to the Final CSV File ---
output_csv_filename = "synthetic_data_generated.csv"
imputed_df.to_csv(output_csv_filename, index=False)
print(f"\nImputed data saved to '{output_csv_filename}'")
print("✅ Final CSV file saved.")

print("\n--- Final Imputed DataFrame ---")
display(imputed_df.head())
print("-------------------------------")



## Load Global Configuration

Load `config_hyperopt.yml` once at the beginning to ensure all subsequent cells can access global settings and paths.

In [None]:
import yaml

CONFIG_HYPEROPT_PATH = Path("../config_hyperopt.yml")

# Fail fast, reporting the absolute location that was checked.
if not CONFIG_HYPEROPT_PATH.exists():
    raise FileNotFoundError(f"Hyperopt configuration file not found at {CONFIG_HYPEROPT_PATH.resolve()}")

# 'config' is the main settings variable used by the hyperopt search cells.
with CONFIG_HYPEROPT_PATH.open('r') as _config_file:
    config = yaml.safe_load(_config_file)
print("  Hyperopt configuration loaded successfully.")

# The current working directory is treated as the project root.
project_root = Path.cwd()
print(f"Project root set to: {project_root}")

In [None]:
"""
Minimal standalone script to instantiate and test the ml_grid.pipeline.data.pipe class.

This script provides a clear example of the minimum setup required to create
an `ml_grid_object`. It is intended for debugging the data pipeline in isolation.
"""

import shutil
from pathlib import Path
import os
import yaml

# --- Essential imports from your ml_grid project ---
# Ensure your project is installed (e.g., `pip install -e .`) or the path is configured
# so these imports can be found.
try:
    from ml_grid.pipeline.data import pipe
    from ml_grid.util.global_params import global_parameters
    from ml_grid.util.create_experiment_directory import create_experiment_directory
except ImportError as e:
    print("Could not import ml_grid components.")
    print("Please ensure you are running this script from the project root directory,")
    print("and that the project has been installed (e.g., using 'pip install -e .').")
    print(f"Error: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() ends the
    # kernel session and discards all state, whereas raising only halts this cell
    # and keeps the original traceback visible.
    raise

# =============================================================================
# 1. LOAD CONFIGURATION
# =============================================================================
CONFIG_PATH = project_root / "../config_single_run.yml"
# Report the file that is actually read; the message used to name 'config.yml'.
print(f"1. Loading configuration from {CONFIG_PATH.name}...")

# Fail fast, reporting the absolute location that was checked.
if not CONFIG_PATH.exists():
    raise FileNotFoundError(f"Configuration file not found at {CONFIG_PATH.resolve()}")

with open(CONFIG_PATH, 'r') as f:
    config_single_run = yaml.safe_load(f)
print("  Configuration loaded successfully.")

# =============================================================================
# 2. SETUP ENVIRONMENT & PATHS FROM CONFIG
# =============================================================================
print("\n2. Setting up environment and paths...")

# project_root was set to the current working directory when config_hyperopt.yml
# was loaded in an earlier cell; pipe receives it as a plain string.
base_project_dir = str(project_root)
print(f"  Project Root: {project_root}")

# Load paths from config
input_csv_path = project_root / config_single_run['data']['file_path']
if not input_csv_path.exists():
    # Raise instead of calling exit(): exit() would end the notebook kernel
    # session, while an exception halts only this cell. This also matches how
    # the missing-config case is reported.
    raise FileNotFoundError(
        f"Data file not found at '{input_csv_path}'. Please make sure the path is correct."
    )
print(f"  Input CSV: {input_csv_path.resolve()}")

# Create experiment directory from config
experiments_base_dir = project_root / config_single_run['experiment']['experiments_base_dir']
experiment_dir = create_experiment_directory(
    base_dir=experiments_base_dir,
    additional_naming=config_single_run['experiment']['additional_naming']
)
experiment_dir = Path(experiment_dir) # Ensure it's a Path object
print(f"  Experiment Directory: {experiment_dir}")

# Configure global parameters from config
global_parameters.verbose = config_single_run['global_params']['verbose']
global_parameters.error_raise = config_single_run['global_params']['error_raise']

# =============================================================================
# 3. DEFINE PIPELINE PARAMETERS FROM CONFIG
# =============================================================================
print("\n3. Defining pipeline parameters...")

_data_section = config_single_run['data']

drop_term_list = _data_section['drop_term_list']
print(f"  Drop terms: {drop_term_list}")

model_class_dict = config_single_run['models']
# Only models whose config value is truthy are listed as enabled.
_enabled_models = [name for name, is_enabled in model_class_dict.items() if is_enabled]
print(f"  Enabled models: {_enabled_models}")

# .get() yields None when no override is configured.
outcome_var = _data_section.get('outcome_var_override')
print(f"  Outcome variable override: '{outcome_var}'")

local_param_dict = config_single_run['run_params']
print("  Local parameter dictionary configured.")

# Index identifying this parameter combination (useful when iterating).
param_space_index = 0


# =============================================================================
# 4. INSTANTIATE THE PIPE CLASS
# =============================================================================
print("\n4. Instantiating the 'pipe' class...")

_rule = "=" * 50

try:
    # Constructing `pipe` runs the entire data pipeline; the result is kept as
    # `ml_grid_object` for the cells that follow.
    _pipe_kwargs = dict(
        file_name=str(input_csv_path.resolve()),
        drop_term_list=drop_term_list,
        local_param_dict=local_param_dict,
        base_project_dir=base_project_dir,
        experiment_dir=experiment_dir,
        test_sample_n=0,  # Use 0 to process the full dataset
        param_space_index=param_space_index,
        model_class_dict=model_class_dict,
        outcome_var_override=outcome_var,
    )
    ml_grid_object = pipe(**_pipe_kwargs)

    print("\n" + _rule)
    print("SUCCESS: 'ml_grid_object' created successfully.")
    print(_rule)

    # =========================================================================
    # 5. INSPECT THE RESULTS
    # =========================================================================
    print("\n5. Inspecting the final object attributes:")
    print(f"  - Outcome Variable Used: {ml_grid_object.outcome_variable}")
    print(f"  - X_train shape: {ml_grid_object.X_train.shape}")
    print(f"  - y_train shape: {ml_grid_object.y_train.shape}")
    print(f"  - X_test shape: {ml_grid_object.X_test.shape}")
    print(f"  - y_test shape: {ml_grid_object.y_test.shape}")
    print(f"  - Number of final features: {len(ml_grid_object.final_column_list)}")
    print(f"  - Number of available models: {len(ml_grid_object.model_class_list)}")
    print("\nFinal X_train columns sample:")
    print(ml_grid_object.X_train.columns.to_list()[:10])

except Exception as pipe_error:
    # The failure is reported (message plus traceback) but not re-raised, so the
    # cell itself completes; later cells still need a valid `ml_grid_object`.
    import traceback

    print("\n" + _rule)
    print("ERROR: Failed to instantiate the 'pipe' class.")
    print(_rule)
    print(f"\nAn error of type '{type(pipe_error).__name__}' occurred: {pipe_error}")
    print("\nFull Traceback:")
    traceback.print_exc()
    



In [None]:
# Preview the first rows of the X_train attribute stored on ml_grid_object.
ml_grid_object.X_train.head()

In [None]:
# Preview the first rows of the X_test attribute stored on ml_grid_object.
ml_grid_object.X_test.head()

In [None]:
import traceback
from typing import Any, Dict, List, Tuple

print("\n4. Initializing and executing the model training run...")

# --- Essential imports ---
# Make sure your project is installed or the path is configured.
try:
    from ml_grid.pipeline.main import run
    from ml_grid.util.global_params import global_parameters
except ImportError as e:
    print("Could not import ml_grid components for the 'run' step.")
    print("Please ensure you are running this from the project root directory,")
    print("and that the project has been installed (e.g., using 'pip install -e .').")
    print(f"Error: {e}")
    # Let the error show in the notebook: swallowing it here would only defer
    # the failure to a confusing NameError on `run` further down. Raising halts
    # this cell without terminating the kernel (unlike exit()).
    raise

# Check if ml_grid_object exists from the previous cell's execution
if 'ml_grid_object' not in locals() or 'local_param_dict' not in locals():
    print("\nERROR: 'ml_grid_object' or 'local_param_dict' not found.")
    print("Please ensure you have successfully run the previous cell (the 'pipe' instantiation script) first.")
else:
    # There is deliberately no top-level try...except here: any exception raised
    # while training halts the cell.

    print("  Instantiating the 'run' class with the data object...")
    # Instantiate the 'run' class with the object from the data pipeline
    run_instance = run(
        ml_grid_object=ml_grid_object,
        local_param_dict=local_param_dict
    )

    print("  Executing the hyperparameter search and cross-validation...")
    # Execute the hyperparameter search and cross-validation for all models.
    # If an error occurs here, it will stop the notebook execution.
    model_errors, highest_score = run_instance.execute()

    print("\n" + "="*50)
    print("SUCCESS: Model training and evaluation complete.")
    print("="*50)

    print("\n5. Inspecting the training results:")
    print(f"  - Highest score achieved across all models: {highest_score:.4f}")

    if model_errors:
        print(f"\n  - {len(model_errors)} model(s) encountered errors during training:")
        for position, error_info in enumerate(model_errors, start=1):
            try:
                # Try to get a meaningful name for the model
                model_name = error_info[0].__class__.__name__
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed while listing errors.
                model_name = "Unknown Model"
            error_exception = error_info[1]
            print(f"    {position}. Model: {model_name}, Error: {error_exception}")
    else:
        print("\n  - All configured models ran without critical errors.")



In [None]:
# Clean up test results directory.
# shutil is used instead of the `!rm -r` shell escape so the cell also works
# where `rm` is unavailable; a missing directory is simply ignored.
import shutil

shutil.rmtree('experiments', ignore_errors=True)

In [None]:
from hyperopt import hp
import yaml
from pathlib import Path

# =============================================================================
# LOAD HYPEROPT CONFIGURATION
# =============================================================================
print("Loading configuration for Hyperopt search from config_hyperopt.yml...")
CONFIG_HYPEROPT_PATH = project_root / "../config_hyperopt.yml"

# Fail fast, reporting the absolute location that was checked.
if not CONFIG_HYPEROPT_PATH.exists():
    raise FileNotFoundError(f"Hyperopt configuration file not found at {CONFIG_HYPEROPT_PATH.resolve()}")

# 'config' is the main settings variable for the search.
with CONFIG_HYPEROPT_PATH.open('r') as _hyperopt_config_file:
    config = yaml.safe_load(_hyperopt_config_file)
print("  Hyperopt configuration loaded successfully.")

# =============================================================================
# BUILD HYPEROPT SEARCH SPACE FROM CONFIG
# =============================================================================
def build_hyperopt_space(config_space):
    """Dynamically build a hyperopt search space from the config dictionary.

    Args:
        config_space: Mapping of search-space options. Its optional ``data``
            entry is a nested mapping of ``name -> options``; every other entry
            except ``hyperopt_settings`` maps a parameter name to its options.

    Returns:
        dict: ``{'data': {name: hp.choice(name, options)}, ...}`` plus one
        ``hp.choice`` per remaining top-level key. A legacy ``corr`` key is
        exposed under the name ``correlation_threshold``.
    """
    # An empty `data:` key in YAML loads as None; treat it like a missing section
    # instead of failing on `.items()`.
    data_options = config_space.get('data') or {}
    space = {
        'data': {name: hp.choice(name, options) for name, options in data_options.items()}
    }

    # All other keys are simple choices, excluding the settings key.
    for key, value in config_space.items():
        if key in ('data', 'hyperopt_settings'):
            continue
        # The key 'corr' in old notebooks is now 'correlation_threshold'; mapping
        # it keeps an old search space usable.
        label = 'correlation_threshold' if key == 'corr' else key
        space[label] = hp.choice(label, value)

    return space

# Build the hyperopt space object
space = build_hyperopt_space(config['hyperopt_search_space'])

print("\nHyperopt search space built from config:")
print(space)

# =============================================================================
# LOAD OTHER PARAMETERS FROM CONFIG
# =============================================================================
model_class_dict = config['models']
print("\nModels to be used in Hyperopt search:")
_enabled_model_names = [name for name, is_enabled in model_class_dict.items() if is_enabled]
print(_enabled_model_names)

# Build paths relative to project_root (set when config_hyperopt.yml was first loaded).
# Note that base_project_dir here is the experiments base directory.
_experiment_section = config['experiment']
base_project_dir = project_root / _experiment_section['experiments_base_dir']
experiment_name = _experiment_section['additional_naming']
_data_section = config['data']
multiple_outcomes_example = _data_section['multiple_outcomes']
input_csv_path = project_root / _data_section['file_path']
drop_term_list = _data_section['drop_term_list']
max_evals = config['hyperopt_settings']['max_evals']

print(f"\nExperiment settings:")
print(f"  Base Directory: {base_project_dir.resolve()}")
print(f"  Experiment Name: {experiment_name}")
print(f"  Input CSV: {input_csv_path.resolve()}")
print(f"  Multiple Outcomes: {multiple_outcomes_example}")
print(f"  Max Evals per Outcome: {max_evals}")

In [None]:
import os
import shutil
import time
import traceback
import warnings
from typing import Any, Dict, List, Union
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output, display
from numpy import absolute, mean, std
from pandas.testing import assert_index_equal
from sklearn import metrics
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_validate,
)
from sklearn.preprocessing import MinMaxScaler
from xgboost.core import XGBoostError

# --- Essential imports from your ml_grid project ---
# Ensure your project is installed or the path is configured.
try:
    from ml_grid.pipeline.data import pipe
    from ml_grid.pipeline.hyperparameter_search import HyperparameterSearch
    from ml_grid.util.bayes_utils import calculate_combinations
    from ml_grid.util.create_experiment_directory import create_experiment_directory
    from ml_grid.util.global_params import global_parameters
    from ml_grid.util.project_score_save import project_score_save_class
    from ml_grid.util.validate_parameters import validate_parameters_helper
except ImportError as e:
    print("Could not import ml_grid components.")
    print("Please ensure you are running this from the project root directory,")
    print("and that the project has been installed (e.g., using 'pip install -e .').")
    print(f"Error: {e}")
    # Let the error show in the notebook: swallowing it would only defer the
    # failure to a NameError on the first ml_grid name used below. Raising halts
    # this cell without terminating the kernel (unlike exit()).
    raise

# =============================================================================
# STANDALONE SCRIPT TO DEBUG `grid_search_crossvalidate` INTERNALS
# =============================================================================

# NOTE(review): base_project_dir was last assigned in the hyperopt-config cell as
# project_root / experiments_base_dir, so the value printed under the label
# "Project Root" is that experiments path -- confirm this is intended.
print(f"   Project Root: {base_project_dir}")

# --- Define the path to your input data ---
# IMPORTANT: Update this path to the correct location of your file.
# Now relative to the project_root
# NOTE(review): this rebinds the notebook-level input_csv_path that the
# hyperopt-config cell derived from config['data']['file_path']; later cells
# that read input_csv_path see this value instead.
input_csv_path = project_root / "synthetic_data_generated.csv"

# A missing file is only reported here; execution continues past this check.
if not input_csv_path.exists():
    print(f"   ERROR: Data file not found at '{input_csv_path.resolve()}'")
    print("   Please make sure the path is correct.")
    # exit()
else:
    print(f"   Input CSV: {input_csv_path.resolve()}")

# Create a directory for this specific experiment run's logs and outputs
experiments_base_dir = project_root / config['experiment']['experiments_base_dir']
experiment_dir = create_experiment_directory(
    base_dir=experiments_base_dir,
    additional_naming="GSCV_Internals_Debug"
)
experiment_dir = Path(experiment_dir) # Ensure it's a Path object
print(f"   Experiment Directory: {experiment_dir.resolve()}")


# --- 2. Configure Parameters ---
print("\n2. Configuring parameters...")
# Global parameters: set on the shared global_parameters object, so they stay
# in effect for cells executed afterwards.
for _attribute, _setting in (
    ('verbose', 1),
    ('error_raise', True),
    ('bayessearch', False),
    ('random_grid_search', True),
    ('sub_sample_param_space_pct', 0.2),
):
    setattr(global_parameters, _attribute, _setting)

# Data-source toggles for the pipeline, listed in the order they appear in the dict.
_enabled_data_sources = [
    'age', 'sex', 'bmi', 'ethnicity',
    'bloods', 'diagnostic_order', 'drug_order',
    'annotation_n', 'meta_sp_annotation_n',
    'annotation_mrc_n', 'meta_sp_annotation_mrc_n',
    'core_02', 'bed', 'vte_status',
    'hosp_site', 'core_resus', 'news',
]
_disabled_data_sources = ['date_time_stamp', 'appointments']

# Local parameters for the data pipeline, configured for your dataset
local_param_dict = {
    'outcome_var_n': 1,
    'param_space_size': 'xsmall',
    'scale': True,
    'feature_n': 90,
    'use_embedding': False,
    'percent_missing': 98,
    'correlation_threshold': 0.95,
    'test_size': 0.2,
    'random_state': 42,
    'data': {
        **dict.fromkeys(_enabled_data_sources, True),
        **dict.fromkeys(_disabled_data_sources, False),
    },
}
print("   Parameters configured.")

# --- Main Execution Block ---
# Runs the data pipeline, a hyperparameter search for the first available model,
# cross-validation, test-set scoring and score logging in one pass. Any error is
# printed with its traceback and re-raised; the `finally` clause always removes
# the files and directories listed in step 6.
ml_grid_object = None
try:
    # --- 3. Run Data Pipeline to Get `ml_grid_object` ---
    print("\n3. Initializing data pipeline (`pipe`) to prepare data...")
    # NOTE(review): base_project_dir is whatever an earlier cell last assigned;
    # the standalone pipe cell passes str(project_root) while the hyperopt-config
    # cell assigns a Path under the experiments base dir -- confirm which form
    # `pipe` expects.
    ml_grid_object = pipe(
        file_name=str(input_csv_path),
        drop_term_list=config['data']['drop_term_list'],
        experiment_dir=str(experiment_dir),
        base_project_dir=base_project_dir,
        local_param_dict=local_param_dict,
        param_space_index=0,
        model_class_dict=config['models'],
        outcome_var_override='outcome_var_1'
    )
    print("   Data pipeline finished.")

    # =========================================================================
    # --- 4. EXPLICIT `grid_search_crossvalidate` INTERNAL LOGIC ---
    # =========================================================================
    print("\n4. Executing `grid_search_crossvalidate` internal logic...")
    start_time_main = time.time()

    # --- 4a. Select a model and extract its properties ---
    # Only the first entry of model_class_list is exercised in this debug run.
    model_to_test = ml_grid_object.model_class_list[0]
    algorithm_implementation = model_to_test.algorithm_implementation
    parameter_space = model_to_test.parameter_space
    method_name = model_to_test.method_name
    print(f"   - Model for debugging: {method_name}")

    # --- 4b. Initialize variables from `grid_search_crossvalidate.__init__` ---
    # Each warning category is silenced with its own filter.
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)

    grid_n_jobs = global_parameters.grid_n_jobs
    # Models whose method name mentions keras, xgb or catboost are run with a
    # single job, and TensorFlow GPU memory growth is enabled on a best-effort basis.
    if "keras" in method_name.lower() or "xgb" in method_name.lower() or "catboost" in method_name.lower():
        grid_n_jobs = 1
        try:
            gpu_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in gpu_devices:
                tf.config.experimental.set_memory_growth(device, True)
        except Exception as e:
            print(f"   - Could not configure GPU for TensorFlow: {e}")

    # Extract data from the ml_grid_object
    X_train, y_train = ml_grid_object.X_train, ml_grid_object.y_train
    X_test, y_test = ml_grid_object.X_test, ml_grid_object.y_test

    # --- 4c. Prepare for Hyperparameter Search ---
    # n_iter is the configured fraction of the full grid size, raised to at
    # least 2 and then capped at max_param_space_iter_value.
    max_param_space_iter_value = global_parameters.max_param_space_iter_value
    param_grid_size = len(ParameterGrid(parameter_space))
    sub_sample_parameter_val = int(global_parameters.sub_sample_param_space_pct * param_grid_size)
    n_iter_v = max(2, sub_sample_parameter_val)
    n_iter_v = min(n_iter_v, max_param_space_iter_value)
    print(f"   - Hyperparameter search iterations (n_iter): {n_iter_v}")

    # Instantiate the HyperparameterSearch class
    search = HyperparameterSearch(
        algorithm=algorithm_implementation,
        parameter_space=parameter_space,
        method_name=method_name,
        global_params=global_parameters,
        max_iter=n_iter_v,
        ml_grid_object=ml_grid_object
    )

    # --- 4d. Run the Hyperparameter Search ---
    print("   - Running HyperparameterSearch.run_search()...")
    # This is the core search step (e.g., RandomizedSearchCV.fit())
    best_estimator = search.run_search(X_train, y_train)
    print(f"   - Best estimator found: {best_estimator.get_params()}")

    # --- 4e. Fit the Final Model and Evaluate ---
    print("   - Fitting the best estimator on the full training data...")
    # Use numpy arrays for final fitting
    best_estimator.fit(X_train.values, y_train.values)

    # --- 4f. Run Cross-Validation on the Best Model ---
    print("   - Running cross_validate on the best estimator...")
    # 3 folds repeated twice with a fixed seed.
    cv_splitter = RepeatedKFold(n_splits=3, n_repeats=2, random_state=1)

    try:
        # error_score='raise' makes a failing fold raise, which is handled by
        # the except branch below.
        scores = cross_validate(
            best_estimator,
            X_train.values,
            y_train.values,
            scoring=global_parameters.metric_list,
            cv=cv_splitter,
            n_jobs=grid_n_jobs,
            error_score='raise'
        )
        failed = False
        print("   - Cross-validation successful.")
        for metric, values in scores.items():
            print(f"     - {metric}: {np.mean(values):.4f}")

    except Exception as e:
        # Cross-validation errors do not abort the run: placeholder scores of
        # 0.5 are substituted and the result is flagged as failed.
        print(f"   - Cross-validation failed: {e}")
        scores = {metric: [0.5] for metric in global_parameters.metric_list} # Default scores
        failed = True

    # --- 4g. Predict on the Test Set and Calculate Final Score ---
    print("   - Predicting on the test set...")
    # Use .values to ensure numpy array for prediction
    best_pred_orig = best_estimator.predict(X_test.values)

    # The final score to be optimized/reported; it is computed from the output
    # of predict() rather than from predict_proba().
    final_auc_score = roc_auc_score(y_test, best_pred_orig)

    score_saver = project_score_save_class(experiment_dir=experiment_dir)


    # --- 4h. Log the results (emulating project_score_save_class) ---
    score_saver.update_score_log(
        ml_grid_object=ml_grid_object,
        scores=scores,
        best_pred_orig=best_pred_orig,
        current_algorithm=best_estimator,
        method_name=method_name,
        pg=param_grid_size,
        start=start_time_main,
        n_iter_v=n_iter_v,
        failed=failed
    )
    print("   - Results logged.")

    # =========================================================================
    # --- 5. Display the Final Results ---
    # =========================================================================
    print("\n" + "="*60)
    print("SUCCESS: Standalone internal logic run complete.")
    print("="*60)
    print(f"\n   - Model Tested: {method_name}")
    print(f"   - Final Reported AUC Score on Test Set: {final_auc_score:.4f}")
    print(f"   - Total execution time: {time.time() - start_time_main:.2f} seconds")

except Exception as e:
    # Report the failure, then re-raise so the cell stops with the error.
    print("\n" + "="*60)
    print("A CRITICAL ERROR OCCURRED DURING EXECUTION")
    print("="*60)
    print(f"Error Type: {type(e).__name__}")
    print(f"Error Message: {e}")
    print("\nFull Traceback:")
    traceback.print_exc()
    raise e

finally:
    # --- 6. Cleanup ---
    # Runs on success and on failure. Each removal is attempted independently;
    # a missing target is ignored and any other error is only printed.
    try:
        os.remove('final_grid_score_log.csv')

    except FileNotFoundError:
        pass # File might not exist if no runs completed
    except Exception as e:
        print(f"Error during cleanup of final_grid_score_log.csv: {e}")

    # NOTE(review): this deletes the whole experiments base directory, not just
    # this run's experiment_dir -- confirm no other results need to be kept.
    try:
        shutil.rmtree(experiments_base_dir)
    except FileNotFoundError:
        pass # Directory might not exist
    except Exception as e:
        print(f"Error during cleanup of experiment directory: {e}")

    try:
        shutil.rmtree('run_0')
    except FileNotFoundError:
        pass # Directory might not exist
    except Exception as e:
        print(f"Error during cleanup of run_0: {e}")
    print("\n6. Clean up complete.")


In [None]:
from ml_grid.util.global_params import global_parameters

# Dump every attribute currently set on the shared global_parameters object.
print(vars(global_parameters))

_extra_logging_enabled = global_parameters.debug_level > 1
if _extra_logging_enabled:
    print("Debug Mode: Additional logging enabled.")

# To override global parameters, uncomment and adjust:
#global_parameters.update_parameters(debug_level=0, grid_n_jobs = -1, error_raise = True, max_param_space_iter_value=1 )

In [None]:
from ml_grid.util.global_params import global_parameters

# Print all attributes of global_parameters and their current values.

print(vars(global_parameters))

In [None]:
import ml_grid
from pathlib import Path
import datetime
from tqdm import tqdm
import random
from IPython.display import clear_output
import pandas as pd
from hyperopt import STATUS_FAIL
from ml_grid.model_classes.h2o_classifier_class import H2OAutoMLConfig
from ml_grid.util.create_experiment_directory import create_experiment_directory
from ml_grid.util.project_score_save import project_score_save_class
from ml_grid.pipeline.data import NoFeaturesError, pipe
from ml_grid.util.param_space import ParamSpace

# Seed Python's RNG so the random trial ids drawn in `objective` repeat across runs.
random.seed(1234)

# --- Setup Experiment Directory and Logging ---
# Main directory for the entire Hyperopt search; base_project_dir and
# experiment_name come from the cell that loaded the hyperopt configuration.
experiment_dir = Path(
    create_experiment_directory(
        base_dir=base_project_dir,
        additional_naming=experiment_name,
    )
)
print(f"Main experiment directory: {experiment_dir.resolve()}")

# Initialize the project-level score log within the main experiment directory
project_score_save_class(experiment_dir)

def objective(local_param_dict, outcome_var=None):
    """The objective function that Hyperopt will minimize.

    Builds the data pipeline object for one sampled parameter set, runs the
    modelling pipeline on it and converts the best score into a loss.

    Args:
        local_param_dict: Parameter combination sampled from the search space.
        outcome_var: Outcome column name, forwarded to ``pipe`` as
            ``outcome_var_override``.

    Returns:
        dict: ``{'loss': 1 - highest_score, 'status': 'ok'}`` on success, or
        ``{'status': STATUS_FAIL, 'loss': inf}`` when a ``NoFeaturesError`` is
        raised.

    Raises:
        Exception: Any other error is printed with its traceback and re-raised.
    """
    import traceback

    clear_output(wait=True)
    print(f"Evaluating for outcome: {outcome_var}")
    print(f"Params: {local_param_dict}")

    # A unique ID for this specific trial
    trial_idx = random.randint(0, 9999999999)

    try:
        ml_grid_object = pipe(
            file_name=str(input_csv_path.resolve()),
            drop_term_list=drop_term_list,
            local_param_dict=local_param_dict,
            base_project_dir=base_project_dir,
            experiment_dir=experiment_dir,
            test_sample_n=0,
            param_space_index=trial_idx,
            model_class_dict=model_class_dict,
            outcome_var_override=outcome_var
        )

        from ml_grid.pipeline import main

        # Run the modeling pipeline and get the best score for this trial
        _errors, highest_score = main.run(ml_grid_object, local_param_dict=local_param_dict).execute()

        return {
            'loss': 1 - float(highest_score), # Hyperopt minimizes, so we use 1 - AUC
            'status': 'ok'
        }
    except NoFeaturesError as e:
        print(f"Skipping trial due to NoFeaturesError: {e}")
        return {'status': STATUS_FAIL, 'loss': float('inf')}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        traceback.print_exc()
        # Bare `raise` keeps the original traceback intact. The STATUS_FAIL
        # return that used to follow this statement was unreachable and has
        # been removed.
        raise


In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Create the hyperopt Trials object that is handed to fmin to store results.
trials = Trials()

In [None]:
from functools import partial

In [None]:
#%%prun
if multiple_outcomes_example == False:  # noqa: E712

    # The single outcome column this search optimises for
    single_outcome_var = 'outcome_var_1'

    # Fresh Trials object so results of this search are stored on their own
    trials = Trials()

    # Bind outcome_var up front; fmin then only supplies the sampled parameters
    objective_with_outcome = partial(objective, outcome_var=single_outcome_var)

    # Run the optimization
    fmin_settings = dict(
        fn=objective_with_outcome,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        verbose=1,
    )
    best = fmin(**fmin_settings)

    print("Best hyperparameters:", best)

In [None]:
if multiple_outcomes_example == False:  # noqa: E712
    # A bare expression nested inside an `if` block is not auto-rendered by
    # the notebook (only a trailing top-level expression is), so display it
    # explicitly.
    display(best)

In [None]:
if multiple_outcomes_example == False:  # noqa: E712
    # The score log is expected directly inside the current experiment_dir
    score_log_path = experiment_dir / 'final_grid_score_log.csv'
    results_df = pd.read_csv(score_log_path)

In [None]:
if multiple_outcomes_example == False:  # noqa: E712
    # Show only the single best row (highest AUC) of the score log
    if 'results_df' not in locals():
        print("results_df not found. Please run the previous cells.")
    else:
        ranked_results = results_df.sort_values('auc', ascending=False)
        display(ranked_results.iloc[0])

In [None]:
if multiple_outcomes_example == False:  # noqa: E712
    # Show the whole score log, best AUC first
    if 'results_df' not in locals():
        print("results_df not found. Please run the previous cells.")
    else:
        ranked_results = results_df.sort_values('auc', ascending=False)
        display(ranked_results)

In [None]:
if multiple_outcomes_example == True:  # noqa: E712

    # Read a single row only: enough to inspect the column layout cheaply.
    dft = pd.read_csv(input_csv_path.resolve(), nrows=1)
    # A bare `dft` nested inside the `if` block would not be rendered by the
    # notebook, so display it explicitly.
    display(dft)

In [None]:
# Collect outcome variables: every column whose name contains "outcome_var_"

if multiple_outcomes_example == True:  # noqa: E712
    # Iterate over the column labels directly instead of indexing by position.
    outcome_var_list = [col for col in dft.columns if "outcome_var_" in col]

    # A bare expression nested inside the `if` block would not be rendered by
    # the notebook, so display it explicitly.
    display(outcome_var_list)

In [None]:
# %%prun
if multiple_outcomes_example:
    import multiprocessing
    from datetime import datetime
    from functools import partial
    from hyperopt import fmin, tpe, Trials, STATUS_OK, hp
    from joblib import Parallel, delayed
    import traceback
    import sys
    import pandas as pd
    from pathlib import Path

    # --- Imports from your ml_grid project ---
    from ml_grid.pipeline.data import pipe
    from ml_grid.pipeline.main import run
    from ml_grid.util.create_experiment_directory import create_experiment_directory
    # Assuming config_parser is in notebooks/ or a discoverable path
    from config_parser import load_config

    # 1. --- Load Configuration from YAML ---
    # This assumes 'config_hyperopt.yml' is in a location accessible from the notebook
    config = load_config('config_hyperopt.yml')

    # 2. --- Set up Experiment Environment ---
    project_root = Path.cwd() # Or specify the correct project root path
    experiments_base_dir = project_root / config['experiment']['experiments_base_dir']
    # Normalise to a Path, matching how the single-outcome setup treats
    # experiment_dir.
    experiment_dir = Path(
        create_experiment_directory(
            base_dir=experiments_base_dir,
            additional_naming=config['experiment']['additional_naming']
        )
    )

    # 3. --- Dynamically Build Hyperopt Search Space from Config ---
    # Every entry becomes an hp.choice over the values listed in the config.
    space = {}
    for key, value in config['hyperopt_search_space'].items():
        if key == 'data':
            # Handle nested 'data' dictionary for feature groups; labels are
            # prefixed with 'data_' to keep them distinct from top-level keys.
            space['data'] = {k: hp.choice(f'data_{k}', v) for k, v in value.items()}
        else:
            space[key] = hp.choice(key, value)

    # 4. --- Get Hyperopt Settings from Config ---
    max_evals = config['hyperopt_settings']['max_evals']

    # 5. --- Determine Outcome Variables ---
    outcome_var_list = []
    if config['data'].get('multiple_outcomes', False):
        # Use an absolute path if the file is not in the current working directory
        data_file_path = config['data']['file_path']
        if not Path(data_file_path).is_absolute():
            data_file_path = project_root / data_file_path
        
        # Only the header is needed to discover the outcome columns, so read
        # zero data rows instead of loading the whole file.
        # Let FileNotFoundError propagate naturally
        df = pd.read_csv(data_file_path, nrows=0)
        outcome_var_list = [col for col in df.columns if 'outcome_var_' in col]
        
        if not outcome_var_list:
            raise ValueError(f"No outcome variables found with 'outcome_var_' prefix in {data_file_path}")
        
        print(f"Found {len(outcome_var_list)} outcome variables to process.")
    else:
        # Use the default from the config if multiple_outcomes is false
        outcome_var_list = config['hyperopt_search_space']['outcome_var_n']

    # 6. --- Define the Objective Function for Hyperopt ---
    def objective(params, outcome_var):
        """
        Evaluate one hyperopt sample for a single outcome variable.

        Args:
            params: Hyperparameters sampled by hyperopt; used as the
                pipeline's local_param_dict.
            outcome_var: Outcome column this trial is run against.

        Returns:
            dict: On success, the negated highest score as 'loss' together
            with STATUS_OK and the sampled params. On any exception, an
            infinite loss with status 'fail' and the error message, so that
            hyperopt can move on to other parameter sets.
        """
        try:
            # A fresh pipeline object is built for every trial and configured
            # with the sampled parameters.
            pipe_kwargs = dict(
                file_name=config['data']['file_path'],
                drop_term_list=config['data']['drop_term_list'],
                model_class_dict=config['models'],
                local_param_dict=params,
                base_project_dir=project_root,
                experiment_dir=experiment_dir,
                param_space_index=0,  # Index is less relevant for hyperopt
                outcome_var_override=outcome_var,
            )
            trial_pipeline = pipe(**pipe_kwargs)

            # execute() yields (errors, highest score); only the score is used.
            trial_runner = run(local_param_dict=params, ml_grid_object=trial_pipeline)
            _trial_errors, highest_score = trial_runner.execute()

            # Hyperopt minimizes 'loss', hence the sign flip on the score.
            return {'loss': -highest_score, 'status': STATUS_OK, 'params': params}

        except Exception as e:
            # Report the failure, including the traceback, on stderr.
            print(
                f"ERROR in objective for {outcome_var} with params {params}: {e}\n{traceback.format_exc()}",
                file=sys.stderr,
            )

            # Mark the trial as failed instead of raising.
            return {'loss': float('inf'), 'status': 'fail', 'message': str(e)}

    # 7. --- Define the Worker Function for Parallel Processing ---
    def process_single_outcome(outcome_var):
        """
        Run a complete fmin search for one outcome variable.

        Intended to be executed by a parallel worker. Exceptions raised by
        fmin are not caught here and propagate to the caller.

        Args:
            outcome_var: Outcome column to optimise for.

        Returns:
            tuple: (outcome_var, best) where best is the value returned by fmin.
            NOTE(review): for hp.choice dimensions fmin is expected to report
            the chosen index rather than the value - confirm, and consider
            hyperopt.space_eval if actual values are wanted.
        """
        started_at = datetime.now()
        print(f"[{started_at}] Starting optimization for outcome: {outcome_var}", flush=True)

        # Each outcome gets its own Trials store, and an objective with the
        # outcome already bound.
        search_history = Trials()
        best = fmin(
            fn=partial(objective, outcome_var=outcome_var),
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=search_history,
            verbose=0,
        )

        finished_at = datetime.now()
        trial_results = search_history.results
        n_failed = sum(1 for outcome in trial_results if outcome['status'] == 'fail')

        print(f"[{finished_at}] Finished {outcome_var} (Duration: {finished_at - started_at})", flush=True)
        print(f"  -> Best param set for this outcome: {best}", flush=True)
        print(f"  -> Trials summary: {n_failed}/{len(trial_results)} failed.", flush=True)

        return (outcome_var, best)

    # 8. --- Main Execution Block ---
    # Leave two cores free, but always use at least one.
    start_total = datetime.now()
    num_cores = max(1, multiprocessing.cpu_count() - 2)
    print(f"Starting all optimizations at {start_total} using {num_cores} cores.")

    # One job per outcome variable; exceptions raised in a worker surface here.
    worker_pool = Parallel(n_jobs=num_cores, verbose=10)
    results = worker_pool(
        delayed(process_single_outcome)(outcome) for outcome in outcome_var_list
    )

    # --- Process and display final results ---
    banner = "=" * 70
    print("\n" + banner)
    print("Hyperparameter Optimization Summary")
    print(banner)
    for outcome_var, best_params in results:
        print(f"\n✅ Success for '{outcome_var}':")
        print(f"   Best parameter combination found: {best_params}")

    end_total = datetime.now()
    print(f"\nCompleted all optimizations at {end_total} (Total duration: {end_total - start_total})")

In [None]:
import pandas as pd 
import os
from datetime import datetime

# Define the parent directory from config
parent_dir = experiment_dir # Use the path from the run

# First choice: the score log sits directly in the parent directory
csv_path = os.path.join(parent_dir, 'final_grid_score_log.csv')

if os.path.exists(csv_path):
    print("CSV found directly in parent directory")
else:
    # Fallback: use the newest timestamped sub-folder of the parent directory
    folders = [
        f
        for f in os.listdir(parent_dir)
        if os.path.isdir(os.path.join(parent_dir, f))
    ]

    def parse_date(folder_name: str):
        """
        Read the leading 'YYYY-MM-DD_HH-MM-SS' timestamp of a folder name.

        Returns the parsed datetime, or None when the first 19 characters
        do not match that format.
        """
        try:
            return datetime.strptime(folder_name[:19], '%Y-%m-%d_%H-%M-%S')
        except (ValueError, IndexError):
            return None

    # Keep only (folder, timestamp) pairs whose name parsed successfully
    folders_with_dates = [
        (name, stamp)
        for name, stamp in ((f, parse_date(f)) for f in folders)
        if stamp is not None
    ]

    if not folders_with_dates:
        raise FileNotFoundError("No timestamped folders found and CSV not in parent directory")

    # Most recent folder by parsed timestamp
    latest_folder = max(folders_with_dates, key=lambda pair: pair[1])[0]
    print("latest_folder", latest_folder)

    csv_path = os.path.join(parent_dir, latest_folder, 'final_grid_score_log.csv')

# Load the score log, best AUC first
df = pd.read_csv(csv_path).sort_values(by='auc', ascending=False)

print(f"Total rows: {len(df)}")

# The frame is sorted by AUC, so the first row of each group is that outcome's best run
df_best = df.groupby('outcome_variable').apply(lambda group: group.iloc[0])

# Display the result
df_best.head()

In [None]:
# Count how often each algorithm implementation appears among the per-outcome best rows.
df_best['algorithm_implementation'].value_counts()

In [None]:
import pandas as pd
from datetime import datetime
from pathlib import Path

# --- Configuration ---
# Experiments directory exactly as written in the config (it may be a relative path);
# this rebinds the module-level name `experiments_base_dir` used by the helpers below.
experiments_base_dir = Path(config['experiment']['experiments_base_dir'])

# --- Find the CSV file (try multiple locations) ---

def find_csv_file():
    """
    Search for final_grid_score_log.csv in multiple locations:
    1. Project root (parent of experiments_base_dir)
    2. Directly in experiments_base_dir
    3. In the latest timestamped subfolder

    Reads the module-level ``experiments_base_dir``.

    Returns:
        Path to the first existing log file, or None when none of the
        locations contains one.
    """
    log_name = 'final_grid_score_log.csv'

    # Location 1: Project root (where the notebook is run from)
    project_root = experiments_base_dir.parent
    csv_path = project_root / log_name
    if csv_path.exists():
        print(f"✓ CSV found in project root: {csv_path.resolve()}")
        return csv_path
    
    # Location 2: Directly in experiments directory.
    # experiments_base_dir already carries its parent components, so it must
    # not be prefixed with project_root again: for a relative path such as
    # 'a/b' that would yield 'a/a/b'.
    csv_path = experiments_base_dir / log_name
    if csv_path.exists():
        print(f"✓ CSV found in experiments directory: {csv_path.resolve()}")
        return csv_path
    
    # Location 3: In latest timestamped subfolder
    latest_folder = find_latest_experiment_folder()
    if latest_folder:
        csv_path = latest_folder / log_name
        if csv_path.exists():
            print(f"✓ CSV found in latest experiment folder: {csv_path.resolve()}")
            return csv_path
    
    return None


def parse_date(folder_name: str):
    """
    Extract the timestamp that prefixes an experiment folder name.

    Args:
        folder_name: Name expected to start with 'YYYY-MM-DD_HH-MM-SS'
            (19 characters), optionally followed by further text.

    Returns:
        The parsed datetime, or None when the first 19 characters do not
        match the expected format.
    """
    timestamp_length = 19  # len('YYYY-MM-DD_HH-MM-SS')
    try:
        parsed = datetime.strptime(folder_name[:timestamp_length], '%Y-%m-%d_%H-%M-%S')
    except (ValueError, IndexError):
        return None
    return parsed


def find_latest_experiment_folder(base_dir=None):
    """Find the most recent timestamped experiment folder.

    Args:
        base_dir: Directory to scan. Defaults to the module-level
            ``experiments_base_dir`` when omitted.

    Returns:
        Path of the sub-folder whose name carries the latest parseable
        timestamp (see ``parse_date``), or None when the directory is missing
        or contains no such sub-folder.
    """
    search_dir = Path(experiments_base_dir if base_dir is None else base_dir)

    # is_dir() is already False for a non-existent path.
    if not search_dir.is_dir():
        # Report the directory that was actually checked; the previous message
        # referenced `project_root`, which is not defined in this function.
        print(f"⚠ Experiments directory not found: {search_dir.resolve()}")
        return None
    
    # Pair every sub-folder with its parsed timestamp, dropping unparseable names.
    dated_folders = []
    for entry in search_dir.iterdir():
        if not entry.is_dir():
            continue
        stamp = parse_date(entry.name)
        if stamp is not None:
            dated_folders.append((entry, stamp))
    
    if not dated_folders:
        print("⚠ No valid timestamped experiment folders found.")
        return None

    latest_folder = max(dated_folders, key=lambda pair: pair[1])[0]
    print(f"Latest experiment folder: {latest_folder.name}")
    return latest_folder


# --- Main Execution ---

# Find the CSV file
log_file_path = find_csv_file()

if log_file_path and log_file_path.exists():
    # Load the CSV file
    df = pd.read_csv(log_file_path)
    
    # Sort by AUC in descending order
    df_sorted = df.sort_values(by='auc', ascending=False)
    
    print(f"\n✓ Successfully loaded {len(df_sorted)} records from the log file.")
    
    # Keep the whole top-AUC row for each outcome variable.
    # GroupBy.first() is avoided on purpose: it takes the first non-null value
    # of each column separately, so a "best" row could mix values from
    # different runs whenever the top row contains NaNs.
    top_results_by_outcome = (
        df_sorted
        .dropna(subset=['outcome_variable'])  # groupby ignored NaN keys as well
        .drop_duplicates(subset='outcome_variable', keep='first')
        .sort_values(by='outcome_variable')
        .reset_index(drop=True)
    )
    
    print(f"\nTop results by outcome variable ({len(top_results_by_outcome)} outcomes):\n")
    
    # Display the result
    display(top_results_by_outcome)
    
else:
    # List the locations that were searched. Built from experiments_base_dir
    # only, because `project_root` is not defined by this cell.
    print("\n✗ Error: Could not find 'final_grid_score_log.csv' in any expected location:")
    print(f"  - {(experiments_base_dir.parent / 'final_grid_score_log.csv').resolve()}")
    print(f"  - {(experiments_base_dir / 'final_grid_score_log.csv').resolve()}")
    print(f"  - In any timestamped subfolder within {experiments_base_dir.resolve()}")

In [None]:
import pandas as pd
import numpy as np

# Load the data
data_path = 'test_data_hfe_1yr_m_small_multiclass.csv'
data = pd.read_csv(data_path)

# --- Overview: shape, column names and a preview of the first rows ---
print("=== Dataset Information ===")
print(f"Shape of the dataset: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print("\nFirst 5 rows of the dataset:")
print(data.head())

# --- Missing values: per-column null counts, showing only affected columns ---
print("\n=== Missing Values ===")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# --- Constant features: columns with exactly one distinct value ---
print("\n=== Constant Features ===")
constant_features = [col for col in data.columns if data[col].nunique() == 1]
print(f"Constant features: {constant_features}")

# --- Low variance: float64/int64 columns whose standard deviation is below 0.01 ---
print("\n=== Low Variance Features ===")
low_variance_features = []
for col in data.columns:
    is_numeric = data[col].dtype in [np.float64, np.int64]
    if is_numeric and data[col].std() < 0.01:
        low_variance_features.append(col)
print(f"Low variance features: {low_variance_features}")

# --- Duplicate rows ---
print("\n=== Duplicate Rows ===")
duplicate_rows = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# --- Class distribution (only when a 'target' column exists) ---
if 'target' in data.columns:  # Replace 'target' with your actual target column name
    print("\n=== Class Distribution ===")
    print(data['target'].value_counts())

# --- High cardinality: object/category columns with more than 100 distinct values ---
print("\n=== High Cardinality Categorical Features ===")
categorical_features = data.select_dtypes(include=['object', 'category']).columns
high_cardinality_features = [
    col for col in categorical_features if data[col].nunique() > 100
]
print(f"High cardinality categorical features: {high_cardinality_features}")

# --- Outliers: rows outside 1.5 * IQR of the quartiles, per numeric column ---
print("\n=== Outliers in Numeric Features ===")
numeric_features = data.select_dtypes(include=[np.float64, np.int64]).columns
for col in numeric_features:
    Q1, Q3 = data[col].quantile(0.25), data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    if len(outliers) > 0:
        print(f"Outliers in {col}: {len(outliers)} rows")

# --- Summary: one line per detected issue, or a single all-clear line ---
print("\n=== Summary of Issues ===")
issues = []
if missing_values.any():
    issues.append(f"- Missing values found in {missing_values[missing_values > 0].index.tolist()}")
if constant_features:
    issues.append(f"- Constant features found: {constant_features}")
if low_variance_features:
    issues.append(f"- Low variance features found: {low_variance_features}")
if duplicate_rows > 0:
    issues.append(f"- Duplicate rows found: {duplicate_rows}")
if high_cardinality_features:
    issues.append(f"- High cardinality categorical features found: {high_cardinality_features}")
if not issues:
    issues.append("- No major issues found in the dataset.")
for issue_line in issues:
    print(issue_line)

In [None]:
# Remove the column display limit so that wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your DataFrame is named 'df'

# Row with the highest AUC for each outcome_variable, ordered from best to worst
best_row_labels = df.groupby('outcome_variable')['auc'].idxmax()
top_auc_per_outcome = df.loc[best_row_labels].sort_values(by='auc', ascending=False)

# Set plot style, then create the figure and axes explicitly
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 8))

# Horizontal bars: one per outcome_variable, coloured by nb_size
sns.barplot(
    data=top_auc_per_outcome,
    x='auc',
    y='outcome_variable',
    hue='nb_size',
    dodge=False,
    palette='viridis',
    ax=ax,
)

# Add titles and labels
ax.set_title('Top AUC for Each Outcome Variable')
ax.set_xlabel('AUC')
ax.set_ylabel('Outcome Variable')
ax.legend(title='num features')

# Display the plot
plt.show()


In [None]:
# Marker cell: prints once execution has reached this point of the notebook.
print("done")

In [None]:
# Import the necessary classes
from ml_grid.results_processing.core import ResultsAggregator
from ml_grid.results_processing.plot_master import MasterPlotter
import pandas as pd

# 1. Aggregate every run found under the experiments folder.
#    Replace the paths with your own results folder and feature names file.
#    The feature_names_csv is optional but required for feature-related plots.
try:
    aggregator_settings = {
        'root_folder': config['experiment']['experiments_base_dir'],
        'feature_names_csv': config['data']['file_path'],
    }
    aggregator = ResultsAggregator(**aggregator_settings)
    results_df = aggregator.aggregate_all_runs()

    # 2. Hand the aggregated results to the MasterPlotter
    master_plotter = MasterPlotter(results_df)

    # 3. Generate all visualizations; the primary metric and other options
    #    can be customized here.
    master_plotter.plot_all(metric='auc', stratify_by_outcome=True)

except (ValueError, FileNotFoundError) as e:
    # Only these two error types are reported; anything else propagates.
    print(f"An error occurred: {e}")
    print("Please ensure your results folder path is correct and contains valid run data.")



In [None]:
# Show the loaded configuration object for reference.
display(config)