In [None]:
"""
Minimal standalone script to instantiate and test the ml_grid.pipeline.data.pipe class.

This script provides a clear example of the minimum setup required to create
an `ml_grid_object`. It is intended for debugging the data pipeline in isolation.
"""

import shutil
import pandas as pd
from pathlib import Path
import os

# --- Essential imports from your ml_grid project ---
# Ensure your project is installed (e.g., `pip install -e .`) or the path is configured
# so these imports can be found.
try:
    from ml_grid.pipeline.data import pipe
    from ml_grid.util.global_params import global_parameters
    from ml_grid.util.create_experiment_directory import create_experiment_directory
except ImportError as e:
    print("Could not import ml_grid components.")
    print("Please ensure you are running this script from the project root directory,")
    print("and that the project has been installed (e.g., using 'pip install -e .').")
    print(f"Error: {e}")
    # Nothing below can work without these imports, so stop here with a
    # non-zero status. `exit()` is a helper that the `site` module installs
    # for interactive sessions; it is not meant to be used in programs and
    # would report success (status 0).
    raise SystemExit(1)


# =============================================================================
# 1. SETUP ENVIRONMENT & PATHS
# =============================================================================
print("1. Setting up environment and paths...")

# The script is expected to be launched from the project's root directory,
# so the resolved working directory is used as the project root.
project_root = Path().resolve()
base_project_dir = str(project_root)
print(f"  Project Root: {project_root}")

# Input data, relative to the working directory. It is kept as a Path object
# (not a plain string) so that `.exists()` can be called on it below.
input_csv_path = Path("test_data_hfe_1yr_m_small_multiclass.csv")

if not input_csv_path.exists():
    print(f"  ERROR: Data file not found at '{input_csv_path}'")
    print("  Please make sure the path is correct.")
    # Stop with a non-zero status. `exit()` is an interactive helper from the
    # `site` module, not intended for programs, and would exit with status 0.
    raise SystemExit(1)
print(f"  Input CSV: {input_csv_path}")

# Directory that receives this specific run's logs and outputs.
experiments_base_dir = project_root / "experiments"
experiment_dir = create_experiment_directory(
    base_dir=experiments_base_dir,
    additional_naming="PipeDebug",
)
print(f"  Experiment Directory: {experiment_dir}")

# Global parameters (optional, but good practice to set explicitly).
global_parameters.verbose = 2  # 0 = silent, 1 = info, 2 = debug
global_parameters.error_raise = True  # True = stop on errors


# =============================================================================
# 2. DEFINE PIPELINE PARAMETERS
# =============================================================================
print("\n2. Defining pipeline parameters...")

# Substrings used to decide which columns get dropped.
drop_term_list = ['chrom', 'hfe', 'phlebo']
print(f"  Drop terms: {drop_term_list}")

# Toggle table for the model classes made available to this run.
# NOTE(review): the original note says the loader appends the '_class' suffix
# itself, yet 'XGB_class' already carries it — confirm the expected key format.
model_class_dict = dict(
    LogisticRegression=True,
    RandomForestClassifier=True,
    XGB_class=False,  # example of a disabled model
    # further models can be added here, e.g. CatBoost, LightGBMClassifierWrapper
)
print(f"  Enabled models: {[name for name, enabled in model_class_dict.items() if enabled]}")

# Explicit outcome column. Setting this to None falls back to the
# 'outcome_var_n' entry of local_param_dict instead.
outcome_var = 'outcome_var_1'
print(f"  Outcome variable override: '{outcome_var}'")

# Core parameter dictionary controlling the data-processing steps.
local_param_dict = {
    # default outcome (outcome_var_1) when no override is given
    'outcome_var_n': 1,
    # parameter space size, set explicitly
    'param_space_size': 'xsmall',
    'scale': True,
    # NOTE(review): the previous comment said "top 80% of features" while the
    # value is 90 — confirm what unit `feature_n` is expressed in.
    'feature_n': 90,
    'use_embedding': False,
    'embedding_method': 'pca',
    'embedding_dim': 10,
    'scale_features_before_embedding': True,
    'percent_missing': 98,
    'correlation_threshold': 0.95,
    'test_size': 0.2,
    'random_state': 42,
    # Feature-group switches; the keys correspond to the logic in
    # `get_pertubation_columns`. Enabled groups first, disabled ones last,
    # which keeps the same key order as a hand-written literal.
    'data': {
        **dict.fromkeys(
            (
                'age',
                'sex',
                'bmi',
                'ethnicity',
                'bloods',
                'diagnostic_order',
                'drug_order',
                'annotation_n',
                'meta_sp_annotation_n',
                'annotation_mrc_n',
                'meta_sp_annotation_mrc_n',
                'core_02',
                'bed',
                'vte_status',
                'hosp_site',
                'core_resus',
                'news',
            ),
            True,
        ),
        **dict.fromkeys(('date_time_stamp', 'appointments'), False),
    },
}
print("  Local parameter dictionary configured.")

# Index identifying this parameter combination (useful when iterating).
param_space_index = 0


# =============================================================================
# 3. INSTANTIATE THE PIPE CLASS
# =============================================================================
print("\n3. Instantiating the 'pipe' class...")

try:
    # Constructing `pipe` is what creates the ml_grid_object; the entire data
    # pipeline runs during this initialization.
    ml_grid_object = pipe(
        file_name=str(input_csv_path),
        drop_term_list=drop_term_list,
        local_param_dict=local_param_dict,
        base_project_dir=base_project_dir,
        experiment_dir=experiment_dir,
        test_sample_n=0,  # 0 = process the full dataset
        param_space_index=param_space_index,
        model_class_dict=model_class_dict,
        outcome_var_override=outcome_var,
    )

    print("\n" + "=" * 50, "SUCCESS: 'ml_grid_object' created successfully.", "=" * 50, sep="\n")

    # -------------------------------------------------------------------------
    # 4. INSPECT THE RESULTS
    # -------------------------------------------------------------------------
    print("\n4. Inspecting the final object attributes:")
    print(f"  - Outcome Variable Used: {ml_grid_object.outcome_variable}")
    print(f"  - X_train shape: {ml_grid_object.X_train.shape}")
    print(f"  - y_train shape: {ml_grid_object.y_train.shape}")
    print(f"  - X_test shape: {ml_grid_object.X_test.shape}")
    print(f"  - y_test shape: {ml_grid_object.y_test.shape}")
    print(f"  - Number of final features: {len(ml_grid_object.final_column_list)}")
    print(f"  - Number of available models: {len(ml_grid_object.model_class_list)}")
    print("\nFinal X_train columns sample:")
    print(ml_grid_object.X_train.columns.to_list()[:10])

except Exception as exc:
    # Report the failure instead of propagating it. Note that in this case
    # `ml_grid_object` stays undefined for the cells that follow.
    import traceback

    print("\n" + "=" * 50, "ERROR: Failed to instantiate the 'pipe' class.", "=" * 50, sep="\n")
    print(f"\nAn error of type '{type(exc).__name__}' occurred: {exc}")
    print("\nFull Traceback:")
    traceback.print_exc()
    



In [None]:
# Show `X_train` of the `ml_grid_object` built by the 'pipe' cell; that cell
# must have completed successfully, otherwise the name is undefined.
ml_grid_object.X_train

In [None]:
# Show `X_test` of the `ml_grid_object` built by the 'pipe' cell; that cell
# must have completed successfully, otherwise the name is undefined.
ml_grid_object.X_test

In [None]:
import traceback
from typing import Any, Dict, List, Tuple

# =============================================================================
# 4. INITIALIZE AND EXECUTE THE MODEL TRAINING RUN
# =============================================================================
print("\n4. Initializing and executing the model training run...")

# --- Essential imports ---
# The project has to be installed, or its path configured, for these to resolve.
try:
    from ml_grid.pipeline.main import run
    from ml_grid.util.global_params import global_parameters
except ImportError as import_error:
    # Only report the problem; the cell deliberately keeps going so the error
    # stays visible in a notebook (a script would call exit() at this point).
    print(
        "Could not import ml_grid components for the 'run' step.",
        "Please ensure you are running this from the project root directory,",
        "and that the project has been installed (e.g., using 'pip install -e .').",
        f"Error: {import_error}",
        sep="\n",
    )

# The run needs both the data object and the parameter dictionary created by
# the 'pipe' instantiation cell; bail out with a hint when either is missing.
if 'ml_grid_object' not in locals() or 'local_param_dict' not in locals():
    print("\nERROR: 'ml_grid_object' or 'local_param_dict' not found.")
    print("Please ensure you have successfully run the previous cell (the 'pipe' instantiation script) first.")
else:
    try:
        print("  Instantiating the 'run' class with the data object...")
        # Wrap the object produced by the data pipeline in a `run` instance.
        run_instance = run(
            ml_grid_object=ml_grid_object,
            local_param_dict=local_param_dict
        )

        print("  Executing the hyperparameter search and cross-validation...")
        # Hyperparameter search and cross-validation for all models.
        model_errors, highest_score = run_instance.execute()

        print("\n" + "="*50)
        print("SUCCESS: Model training and evaluation complete.")
        print("="*50)

        # =====================================================================
        # 5. INSPECT THE TRAINING RESULTS
        # =====================================================================
        print("\n5. Inspecting the training results:")
        print(f"  - Highest score achieved across all models: {highest_score:.4f}")

        if model_errors:
            print(f"\n  - {len(model_errors)} model(s) encountered errors during training:")
            # Each entry is indexed as (model, exception) below.
            for position, error_info in enumerate(model_errors, start=1):
                try:
                    # Try to get a meaningful name for the model
                    model_name = error_info[0].__class__.__name__
                except Exception:
                    # Narrowed from a bare `except:` so that KeyboardInterrupt
                    # and SystemExit are no longer swallowed here.
                    model_name = "Unknown Model"
                error_exception = error_info[1]
                print(f"    {position}. Model: {model_name}, Error: {error_exception}")
        else:
            print("\n  - All configured models ran without critical errors.")

    except Exception as e:
        # Top-level boundary of the cell: report the failure with a traceback
        # rather than letting it propagate.
        print("\n" + "="*50)
        print("ERROR: An unexpected error occurred during the model training run.")
        print("="*50)
        print(f"\nAn error of type '{type(e).__name__}' occurred: {e}")
        print("\nFull Traceback:")
        traceback.print_exc()



In [None]:
import os
import shutil
import time
import traceback
import warnings
from typing import Any, Dict, List, Union
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output, display
from numpy import absolute, mean, std
from pandas.testing import assert_index_equal
from sklearn import metrics
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    ParameterGrid,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_validate,
)
from sklearn.preprocessing import MinMaxScaler
from xgboost.core import XGBoostError

# --- Essential imports from your ml_grid project ---
# Ensure your project is installed or the path is configured.
try:
    from ml_grid.pipeline.data import pipe
    from ml_grid.pipeline.hyperparameter_search import HyperparameterSearch
    from ml_grid.util.bayes_utils import calculate_combinations
    from ml_grid.util.create_experiment_directory import create_experiment_directory
    from ml_grid.util.global_params import global_parameters
    from ml_grid.util.project_score_save import project_score_save_class
    from ml_grid.util.validate_parameters import validate_parameters_helper
except ImportError as import_error:
    # Report only; the cell keeps running so the error remains visible in a
    # notebook (a script would call exit() at this point).
    print(
        "Could not import ml_grid components.",
        "Please ensure you are running this from the project root directory,",
        "and that the project has been installed (e.g., using 'pip install -e .').",
        f"Error: {import_error}",
        sep="\n",
    )

# =============================================================================
# STANDALONE SCRIPT TO DEBUG `grid_search_crossvalidate` INTERNALS
# =============================================================================

# --- 1. Setup Environment & Paths ---
print("1. Setting up environment and paths...")
# The working directory is taken to be the project's root directory.
project_root = Path().resolve()
print(f"   Project Root: {project_root}")
base_project_dir = str(project_root)

# --- Input data ---
# IMPORTANT: machine-specific absolute path; update it to where your file lives.
input_csv_path = Path("/home/samorah/_data/ml_binary_classification_gridsearch_hyperOpt/notebooks/test_data_hfe_1yr_m_small_multiclass.csv")

if input_csv_path.exists():
    print(f"   Input CSV: {input_csv_path}")
else:
    # A missing file is only reported; execution continues (exit() is disabled).
    print(f"   ERROR: Data file not found at '{input_csv_path}'")
    print("   Please make sure the path is correct.")

# Directory for this specific experiment run's logs and outputs.
experiments_base_dir = project_root.joinpath("experiments")
experiment_dir = create_experiment_directory(
    additional_naming="GSCV_Internals_Debug",
    base_dir=experiments_base_dir,
)
print(f"   Experiment Directory: {experiment_dir}")


# --- 2. Configure Parameters ---
print("\n2. Configuring parameters...")

# Global switches for this debug run.
global_parameters.verbose = 1
global_parameters.error_raise = False
global_parameters.bayessearch = False
global_parameters.random_grid_search = True
global_parameters.sub_sample_param_space_pct = 0.2

# Local parameters for the data pipeline, configured for this dataset.
local_param_dict = {
    'outcome_var_n': 1,
    'param_space_size': 'xsmall',
    'scale': True,
    'feature_n': 90,
    'use_embedding': False,
    'percent_missing': 98,
    'correlation_threshold': 0.95,
    'test_size': 0.2,
    'random_state': 42,
    # Feature-group switches: enabled groups first, then the disabled ones,
    # preserving the key order of the equivalent hand-written literal.
    'data': {
        **dict.fromkeys(
            (
                'age',
                'sex',
                'bmi',
                'ethnicity',
                'bloods',
                'diagnostic_order',
                'drug_order',
                'annotation_n',
                'meta_sp_annotation_n',
                'annotation_mrc_n',
                'meta_sp_annotation_mrc_n',
                'core_02',
                'bed',
                'vte_status',
                'hosp_site',
                'core_resus',
                'news',
            ),
            True,
        ),
        **dict.fromkeys(('date_time_stamp', 'appointments'), False),
    },
}
print("   Parameters configured.")

# --- Main Execution Block ---
# Builds the data object with `pipe`, then replays a hyperparameter search,
# cross-validation, test-set prediction and score logging for the first
# available model, step by step. Any exception is printed with its traceback,
# and the `finally` clause always attempts to delete the generated artefacts.
ml_grid_object = None
try:
    # --- 3. Run Data Pipeline to Get `ml_grid_object` ---
    print("\n3. Initializing data pipeline (`pipe`) to prepare data...")
    ml_grid_object = pipe(
        file_name=str(input_csv_path),
        drop_term_list=['chrom', 'hfe', 'phlebo'],
        experiment_dir=str(experiment_dir),
        base_project_dir=base_project_dir,
        local_param_dict=local_param_dict,
        param_space_index=0,
        model_class_dict={'RandomForestClassifier': True},
        outcome_var_override='outcome_var_1'
    )
    print("   Data pipeline finished.")

    # =========================================================================
    # --- 4. EXPLICIT `grid_search_crossvalidate` INTERNAL LOGIC ---
    # =========================================================================
    print("\n4. Executing `grid_search_crossvalidate` internal logic...")
    # Reference point for the total execution time reported at the end; it is
    # also handed to the score logger as `start`.
    start_time_main = time.time()

    # --- 4a. Select a model and extract its properties ---
    # Only the first entry of the model list is exercised in this debug run.
    model_to_test = ml_grid_object.model_class_list[0]
    algorithm_implementation = model_to_test.algorithm_implementation
    parameter_space = model_to_test.parameter_space
    method_name = model_to_test.method_name
    print(f"   - Model for debugging: {method_name}")

    # --- 4b. Initialize variables from `grid_search_crossvalidate.__init__` ---
    # CORRECTED: Set each warning filter individually.
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    
    grid_n_jobs = global_parameters.grid_n_jobs
    # Methods whose name mentions keras, xgb or catboost are run with a single
    # job; for those, TensorFlow GPU memory growth is enabled as well.
    if "keras" in method_name.lower() or "xgb" in method_name.lower() or "catboost" in method_name.lower():
        grid_n_jobs = 1
        try:
            gpu_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in gpu_devices:
                tf.config.experimental.set_memory_growth(device, True)
        except Exception as e:
            # GPU configuration is best-effort; a failure is only reported.
            print(f"   - Could not configure GPU for TensorFlow: {e}")

    # Extract data from the ml_grid_object
    X_train, y_train = ml_grid_object.X_train, ml_grid_object.y_train
    X_test, y_test = ml_grid_object.X_test, ml_grid_object.y_test

    # --- 4c. Prepare for Hyperparameter Search ---
    max_param_space_iter_value = global_parameters.max_param_space_iter_value
    param_grid_size = len(ParameterGrid(parameter_space))
    # Number of search iterations: the configured fraction of the full grid,
    # but never fewer than 2 and never more than max_param_space_iter_value.
    sub_sample_parameter_val = int(global_parameters.sub_sample_param_space_pct * param_grid_size)
    n_iter_v = max(2, sub_sample_parameter_val)
    n_iter_v = min(n_iter_v, max_param_space_iter_value)
    print(f"   - Hyperparameter search iterations (n_iter): {n_iter_v}")

    # Instantiate the HyperparameterSearch class
    search = HyperparameterSearch(
        algorithm=algorithm_implementation,
        parameter_space=parameter_space,
        method_name=method_name,
        global_params=global_parameters,
        max_iter=n_iter_v,
        ml_grid_object=ml_grid_object
    )

    # --- 4d. Run the Hyperparameter Search ---
    print("   - Running HyperparameterSearch.run_search()...")
    # This is the core search step (e.g., RandomizedSearchCV.fit())
    best_estimator = search.run_search(X_train, y_train)
    print(f"   - Best estimator found: {best_estimator.get_params()}")

    # --- 4e. Fit the Final Model and Evaluate ---
    print("   - Fitting the best estimator on the full training data...")
    # Use numpy arrays for final fitting
    best_estimator.fit(X_train.values, y_train.values)

    # --- 4f. Run Cross-Validation on the Best Model ---
    print("   - Running cross_validate on the best estimator...")
    # 3 folds repeated twice, with a fixed seed so the splits are reproducible.
    cv_splitter = RepeatedKFold(n_splits=3, n_repeats=2, random_state=1)
    
    try:
        scores = cross_validate(
            best_estimator,
            X_train.values,
            y_train.values,
            scoring=global_parameters.metric_list,
            cv=cv_splitter,
            n_jobs=grid_n_jobs,
            error_score='raise'
        )
        failed = False
        print("   - Cross-validation successful.")
        for metric, values in scores.items():
            print(f"     - {metric}: {np.mean(values):.4f}")
            
    except Exception as e:
        # A failed cross-validation does not abort the run: placeholder scores
        # of 0.5 are substituted and the run is flagged as failed.
        # NOTE(review): the placeholder dict is keyed by the raw entries of
        # metric_list, whereas cross_validate's own result uses different keys
        # (e.g. `test_<name>`, `fit_time`) — confirm the score logger copes.
        print(f"   - Cross-validation failed: {e}")
        scores = {metric: [0.5] for metric in global_parameters.metric_list} # Default scores
        failed = True

    # --- 4g. Predict on the Test Set and Calculate Final Score ---
    print("   - Predicting on the test set...")
    # Use .values to ensure numpy array for prediction
    best_pred_orig = best_estimator.predict(X_test.values)
    
    # The final score to be optimized/reported
    # NOTE(review): the AUC is computed from the output of `predict`, not from
    # probabilities or decision scores — confirm this is intended.
    final_auc_score = roc_auc_score(y_test, best_pred_orig)
    
    # --- 4h. Log the results (emulating project_score_save_class) ---
    project_score_save_class.update_score_log(
        ml_grid_object=ml_grid_object,
        scores=scores,
        best_pred_orig=best_pred_orig,
        current_algorithm=best_estimator,
        method_name=method_name,
        pg=param_grid_size,
        start=start_time_main,
        n_iter_v=n_iter_v,
        failed=failed
    )
    print("   - Results logged.")

    # =========================================================================
    # --- 5. Display the Final Results ---
    # =========================================================================
    print("\n" + "="*60)
    print("SUCCESS: Standalone internal logic run complete.")
    print("="*60)
    print(f"\n   - Model Tested: {method_name}")
    print(f"   - Final Reported AUC Score on Test Set: {final_auc_score:.4f}")
    print(f"   - Total execution time: {time.time() - start_time_main:.2f} seconds")

except Exception as e:
    # Top-level boundary of the cell: report instead of propagating.
    print("\n" + "="*60)
    print("A CRITICAL ERROR OCCURRED DURING EXECUTION")
    print("="*60)
    print(f"Error Type: {type(e).__name__}")
    print(f"Error Message: {e}")
    print("\nFull Traceback:")
    traceback.print_exc()

finally:
    # --- 6. Cleanup ---
    # Best-effort removal of generated artefacts, using paths relative to the
    # current working directory. Each step is independent; a failure (for
    # example a missing path) is printed and the next step still runs.
    try:
        os.remove('final_grid_score_log.csv')
    
    except Exception as e:
        print(e)

    try:
        # NOTE(review): this deletes the whole 'experiments' tree, not only the
        # directory created for this run — confirm that is intended.
        shutil.rmtree('experiments')
    except Exception as e:
        print(e)
        
        
    try: 
        shutil.rmtree('run_0')
    except Exception as e:
        print(e)
    print("\n6.  clean up.")
    
    


In [2]:
!rm -r HFE_ML_experiments

In [None]:
import os
import subprocess
import warnings
import sys
import pandas as pd
from hyperopt import fmin, tpe, hp
from hyperopt.pyll import scope

# Process-wide environment configuration, applied in one go.
#
# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native logging:
#   0 = all messages are logged (default)
#   1 = INFO messages are not printed
#   2 = INFO and WARNING messages are not printed
#   3 = INFO, WARNING, and ERROR messages are not printed
#
# PYTHONWARNINGS carries the UserWarning filter through the environment, and
# CUDA_VISIBLE_DEVICES is pinned to the device with index "1".
# (An optional CUDA_DEVICE_ORDER="PCI_BUS_ID" setting is left disabled,
# see issue #152.)
os.environ.update(
    {
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "PYTHONWARNINGS": "ignore::UserWarning",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

# Silence every warning category inside the current interpreter as well.
warnings.filterwarnings("ignore")

In [None]:
# Switch for pinning the notebook process to a fixed set of CPU cores.
limit_cpu_cores = False

if not limit_cpu_cores:
    print("CPU core binding is disabled.")
else:
    # The affinity is applied to this very process, identified by its PID.
    pid = os.getpid()
    print(f"Notebook PID: {pid}")

    # Cores to bind to, in taskset's list syntax (here: cores 0 through 3).
    core_range = "0-3"

    try:
        # `taskset -cp <cores> <pid>` changes the affinity of a running process.
        subprocess.run(["taskset", "-cp", core_range, str(pid)], check=True)
    except FileNotFoundError:
        print("Error: 'taskset' command not found. Please ensure it is installed.")
    except subprocess.CalledProcessError as e:
        print(f"Error while setting CPU affinity: {e}")
    else:
        print(f"Successfully bound Notebook PID {pid} to CPU cores {core_range}.")


In [None]:
from ml_grid.util import grid_param_space

In [None]:
import ipywidgets as ipw
# Create an ipywidgets Output widget. It is not referenced in the part of the
# notebook visible here — presumably used by later cells; verify before removing.
output = ipw.Output()

In [None]:
# Plain-Python description of the settings grid: every key maps to the list of
# candidate values for that setting. The 'data' entry is a one-element list
# holding the per-feature-group candidate values.
grid = {
    'resample': ['undersample', 'oversample', None],
    'scale': [True, False],
    'feature_n': [100, 95, 75, 50, 25, 5],
    'param_space_size': ['medium', 'xsmall'],
    'n_unique_out': [10],
    'outcome_var_n': ['1'],
    'percent_missing': [99, 95, 80],  # n/100, e.g. 95 for 95%
    'corr': [0.98, 0.85, 0.5, 0.25],
    'data': [
        {
            # list(...) gives every feature group its own, independent list.
            feature_group: list(candidates)
            for feature_group, candidates in (
                ('age', (True, False)),
                ('sex', (True, False)),
                ('bmi', (True,)),
                ('ethnicity', (True, False)),
                ('bloods', (True, False)),
                ('diagnostic_order', (True, False)),
                ('drug_order', (True, False)),
                ('annotation_n', (True, False)),
                ('meta_sp_annotation_n', (True, False)),
                ('annotation_mrc_n', (True, False)),
                ('meta_sp_annotation_mrc_n', (True, False)),
                ('core_02', (False,)),
                ('bed', (False,)),
                ('vte_status', (True,)),
                ('hosp_site', (True,)),
                ('core_resus', (False,)),
                ('news', (False,)),
                ('date_time_stamp', (False,)),
            )
        }
    ],
}

In [None]:


# Hyperopt search space. Each entry is an `hp.choice` whose label equals its
# dictionary key, so the (key, candidates) tables below are the single source
# for both.
space = {
    **{
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('resample', ['undersample', 'oversample', None]),
            ('scale', [True, False]),
            ('feature_n', [100, 95, 75, 50, 25, 5]),
            ('param_space_size', ['medium', 'xsmall']),
            ('n_unique_out', [10]),
            ('outcome_var_n', ['1']),
            ('percent_missing', [99, 95, 80]),
            ('corr', [0.98, 0.85, 0.5, 0.25]),
            ('feature_selection_method', ['anova', 'markov_blanket']),
            # Embedding hyperparameters
            ('use_embedding', [True, False]),
            ('embedding_method', ['pca', 'svd']),
            ('embedding_dim', [32, 64, 128]),
            ('scale_features_before_embedding', [True, False]),
            ('cache_embeddings', [False]),
        )
    },
    # Per-feature-group switches.
    'data': {
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('age', [True, False]),
            ('sex', [True, False]),
            ('bmi', [True]),
            ('ethnicity', [True, False]),
            ('bloods', [True, False]),
            ('diagnostic_order', [True, False]),
            ('drug_order', [True, False]),
            ('annotation_n', [True, False]),
            ('meta_sp_annotation_n', [True, False]),
            ('annotation_mrc_n', [True, False]),
            ('meta_sp_annotation_mrc_n', [True, False]),
            ('core_02', [False]),
            ('bed', [False]),
            ('vte_status', [True]),
            ('hosp_site', [True]),
            ('core_resus', [False]),
            ('news', [False]),
            ('date_time_stamp', [False]),
            ('appointments', [False]),
        )
    },
}


In [None]:
# Debug variant of the hyperopt search space. Running this cell rebinds
# `space`, replacing whatever an earlier cell assigned to that name. Compared
# with the full space, several choices are restricted to a single candidate.
space = {
    **{
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('resample', ['undersample', 'oversample', None]),
            ('scale', [True, False]),
            ('feature_n', [100, 95, 75, 50, 25, 5]),
            ('param_space_size', ['medium', 'xsmall']),
            ('n_unique_out', [10]),
            ('outcome_var_n', ['1']),
            ('percent_missing', [99, 95, 80]),
            ('corr', [0.98, 0.85, 0.5, 0.25]),
            ('feature_selection_method', ['anova']),
            # Embedding hyperparameters
            ('use_embedding', [False]),
            ('embedding_method', ['pca', 'svd']),
            ('embedding_dim', [32, 64, 128]),
            ('scale_features_before_embedding', [False]),
            ('cache_embeddings', [False]),
        )
    },
    # Per-feature-group switches.
    'data': {
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('age', [True, False]),
            ('sex', [True, False]),
            ('bmi', [True]),
            ('ethnicity', [True]),
            ('bloods', [True]),
            ('diagnostic_order', [True]),
            ('drug_order', [True]),
            ('annotation_n', [True]),
            ('meta_sp_annotation_n', [True]),
            ('annotation_mrc_n', [True]),
            ('meta_sp_annotation_mrc_n', [True]),
            ('core_02', [False]),
            ('bed', [False]),
            ('vte_status', [True]),
            ('hosp_site', [True]),
            ('core_resus', [False]),
            ('news', [False]),
            ('date_time_stamp', [False]),
            ('appointments', [False]),
        )
    },
}


In [None]:
# Hyperopt search space for the breast cancer sample dataset. Only the
# 'bloods' feature group is switched on; every other group is fixed to False.
space_breast_cancer = {
    **{
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('resample', ['undersample', 'oversample', None]),
            ('scale', [True, False]),
            ('feature_n', [25, 5]),
            ('param_space_size', ['medium', 'xsmall']),
            ('n_unique_out', [10]),
            # Optimise for alternate representations of outcome variable.
            ('outcome_var_n', ['1']),
            ('percent_missing', [99, 95, 80]),
            ('corr', [0.98, 0.85, 0.5, 0.25]),
        )
    },
    'data': {
        label: hp.choice(label, candidates)
        for label, candidates in (
            ('age', [False]),
            ('sex', [False]),
            ('bmi', [False]),
            ('ethnicity', [False]),
            ('bloods', [True]),
            ('diagnostic_order', [False]),
            ('drug_order', [False]),
            ('annotation_n', [False]),
            ('meta_sp_annotation_n', [False]),
            ('annotation_mrc_n', [False]),
            ('meta_sp_annotation_mrc_n', [False]),
            ('core_02', [False]),
            ('bed', [False]),
            ('vte_status', [False]),
            ('hosp_site', [False]),
            ('core_resus', [False]),
            ('news', [False]),
            ('date_time_stamp', [False]),
        )
    },
}

In [None]:
# Model-class switchboard: a class is excluded from the run by mapping its
# key to False. Everything listed in the tuple is enabled.
# NOTE(review): "XGB_class_class" and "knn__gpu_wrapper_class" look like they
# may contain typos — confirm against the names the loader expects.
model_class_dict = {
    **dict.fromkeys(
        (
            "LogisticRegression_class",
            "knn_classifiers_class",
            "quadratic_discriminant_analysis_class",
            "SVC_class",
            "XGB_class_class",
            "mlp_classifier_class",
            "RandomForestClassifier_class",
            "GradientBoostingClassifier_class",
            "CatBoost_class",
            "GaussianNB_class",
            "LightGBMClassifierWrapper",
            "adaboost_class",
            "kerasClassifier_class",
            "knn__gpu_wrapper_class",
            "NeuralNetworkClassifier_class",
        ),
        True,
    ),
    "TabTransformer_class": False,
}

In [None]:
import ml_grid
from pathlib import Path
import datetime
from tqdm import tqdm
import random
from IPython.display import clear_output
import pandas as pd
from hyperopt import STATUS_FAIL
from ml_grid.model_classes.h2o_classifier_class import h2o_classifier_class
#from ml_grid.util import create_experiment_directory

from ml_grid.util.create_experiment_directory import create_experiment_directory
from ml_grid.util.project_score_save import project_score_save_class

from ml_grid.pipeline.data import NoFeaturesError, pipe

from ml_grid.util.param_space import ParamSpace

# Fixed seed so values drawn from `random` in this notebook are reproducible.
random.seed(1234)

# True  -> use the multi-outcome HFE test dataset
# False -> use the breast cancer sample dataset
multiple_outcomes_example = True

# Base directory and a descriptive name for this experiment batch.
base_project_dir = 'HFE_ML_experiments'
experiment_name = "HFE_ML_Grid"
n_iter = 5  # Number of parameter combinations to test

# --- Setup ---
experiment_dir = create_experiment_directory(
    base_dir=base_project_dir,
    additional_naming=experiment_name,
)

# Select the input file by plain truthiness instead of comparing to False
# with `==` (PEP 8: don't compare boolean values to True or False using ==).
input_csv_path = (
    'test_data_hfe_1yr_m_small_multiclass.csv'
    if multiple_outcomes_example
    else 'breast_cancer_dataset.csv'
)

# Alternative (large) dataset location, kept for reference:
#input_csv_path = os.path.join('..', 'gloabl_files', 'ml_binary_classification_gridsearch_hyperOpt', 'notebooks' ,'test_data_hfe_1yr_m_small.csv') #large

# Initialise the CSV that stores each local project's results.
project_score_save_class(base_project_dir)

# Iterator over `n_iter` sampled settings from the grid parameter space.
grid_iter_obj = grid_param_space.Grid(sample_n=n_iter).settings_list_iterator


def objective(local_param_dict, outcome_var=None):
    """Run one ml_grid pipeline trial and report it in hyperopt's result format.

    Args:
        local_param_dict: Settings for this trial; forwarded to ``pipe`` and
            to ``main.run``.
        outcome_var: Optional outcome column name, forwarded to ``pipe`` as
            ``outcome_var_override``.

    Returns:
        dict: ``{"loss": 1 - highest_score, "status": "ok"}`` when the run
        completes, or ``{'status': STATUS_FAIL, 'loss': inf}`` when the
        pipeline raises ``NoFeaturesError``.
    """
    clear_output()
    print(local_param_dict)

    # Random identifier passed to pipe() as param_space_index for this trial
    idx = random.randint(0, 999999999999999999999)

    # Bound before the try block: if pipe() itself raises NoFeaturesError the
    # handler below must not touch an unassigned name.
    ml_grid_object = None

    try:
        # Create the pipeline object from the trial settings
        ml_grid_object = pipe(
            input_csv_path,
            drop_term_list=['chrom', 'hfe', 'phlebo'],
            local_param_dict=local_param_dict,
            base_project_dir=base_project_dir,
            experiment_dir=experiment_dir,
            test_sample_n=0,
            param_space_index=idx,
            model_class_dict=model_class_dict,
            outcome_var_override=outcome_var,
        )

        from ml_grid.pipeline import main

        # Example: overwrite (or .extend) the model class list before running
        # temp_param_space_size = ParamSpace(ml_grid_object.local_param_dict.get("param_space_size"))
        # ml_grid_object.model_class_list = [h2o_classifier_class(
        #             X=ml_grid_object.X_train,
        #             y=ml_grid_object.y_train,
        #             parameter_space_size=temp_param_space_size,
        #         )]

        # Evaluate the object; the run reports its errors and its best score
        _errors, highest_score = main.run(
            ml_grid_object, local_param_dict=local_param_dict
        ).execute()

        # The score returned by the run is used directly instead of re-reading
        # the shared score log from disk on every trial.
        result = {
            "loss": 1 - float(highest_score),
            "status": "ok",  # Indicate that the evaluation was successful
        }
    except NoFeaturesError as e:
        print(f"Skipping trial due to NoFeaturesError: {e}")

        # The train indices only exist when pipe() finished constructing
        if ml_grid_object is not None:
            print("indicies..:")
            print(ml_grid_object.X_train.index)
            print(ml_grid_object.y_train.index)
        # Return a failure status to hyperopt
        return {'status': STATUS_FAIL, 'loss': float('inf')}

    return result
    
     

In [None]:
from ml_grid.util.global_params import global_parameters

# Print every attribute currently stored on the shared global_parameters object
print(vars(global_parameters))

# Only announce debug mode when the configured debug level is above 1
if global_parameters.debug_level > 1:
        print("Debug Mode: Additional logging enabled.")

# Example (disabled): override global parameters for this session
#global_parameters.update_parameters(debug_level=0, grid_n_jobs = -1, error_raise = True, max_param_space_iter_value=1 )

In [None]:
from ml_grid.util.global_params import global_parameters

# Print all attributes of global_parameters and their current values

print(vars(global_parameters))

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Empty hyperopt Trials store. NOTE(review): the binary-outcome cell rebinds
# `trials` to a new Trials() and the multi-outcome cell passes Trials() inline,
# so this instance does not appear to be used by the cells shown here.
trials = Trials()

In [None]:
from functools import partial

In [None]:
#%%prun
if not multiple_outcomes_example:
    # Bind the outcome column up front so hyperopt can call the objective
    # with just the sampled parameters
    outcome_var = 'outcome_var_1'
    objective_with_outcome = partial(objective, outcome_var=outcome_var)

    # Fresh Trials object to collect the results of this search
    trials = Trials()

    # Run the optimization (a single evaluation)
    best = fmin(
        fn=objective_with_outcome,
        space=space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=1,
        verbose=1,
    )

    print("Best hyperparameters:", best)

In [None]:
if not multiple_outcomes_example:
    # A bare expression nested inside `if` is not echoed by the notebook
    # (only a top-level last expression is), so display it explicitly.
    from IPython.display import display

    display(best)

In [None]:
if not multiple_outcomes_example:
    # Score log written under the project's base directory
    score_log_path = os.path.join(base_project_dir, 'final_grid_score_log.csv')
    results_df = pd.read_csv(score_log_path)

In [None]:
if not multiple_outcomes_example:
    # A bare expression nested inside `if` is not echoed by the notebook,
    # so display the highest-AUC row explicitly.
    from IPython.display import display

    display(results_df.sort_values('auc', ascending=False).iloc[0])

In [None]:
if not multiple_outcomes_example:
    # A bare expression nested inside `if` is not echoed by the notebook,
    # so display the AUC-sorted results explicitly.
    from IPython.display import display

    display(results_df.sort_values('auc', ascending=False))

In [None]:
if multiple_outcomes_example:
    from IPython.display import display

    # Read a single row: only the column names are needed to find the outcomes
    dft = pd.read_csv('test_data_hfe_1yr_m_small_multiclass.csv', nrows=1)
    # A bare expression nested inside `if` is not echoed by the notebook
    display(dft)

In [None]:
# Collect the outcome variables: every column whose name contains "outcome_var_"

if multiple_outcomes_example:
    from IPython.display import display

    outcome_var_list = [col for col in dft.columns if "outcome_var_" in col]

    # A bare expression nested inside `if` is not echoed by the notebook
    display(outcome_var_list)

In [None]:
# Multiple outcomes one vs rest

In [None]:
# Echo the detected outcome columns.
# NOTE(review): outcome_var_list is only assigned when
# multiple_outcomes_example is True; otherwise this cell raises NameError.
outcome_var_list

In [None]:
#%%prun
if multiple_outcomes_example == True:
    
    import multiprocessing
    from datetime import datetime
    from hyperopt import fmin, tpe, Trials
    from joblib import Parallel, delayed
    from joblib import parallel_backend
    import traceback
    import sys

    # CPU count reported by the system; used as n_jobs for joblib.Parallel in this cell
    num_cores = multiprocessing.cpu_count()

    def process_single_outcome(outcome_index, outcome_var_list):
        """Run a one-evaluation hyperopt search for a single outcome column.

        Args:
            outcome_index: Position of the outcome in ``outcome_var_list``.
            outcome_var_list: Outcome column names.

        Returns:
            tuple: ``(outcome_index, best, None, None)`` on success, or
            ``(outcome_index, None, str(exception), traceback_text)`` when
            anything inside the search raises.
        """
        target_outcome = outcome_var_list[outcome_index]
        started_at = datetime.now()
        print(f"[{started_at}] Starting outcome {outcome_index}: {target_outcome}")

        # Adapter with the single-argument signature hyperopt calls
        def objective_with_outcome(params):
            print(f"Evaluating params: {params} for outcome {target_outcome}")
            return objective(params, target_outcome)

        try:
            # Run the search under joblib's multiprocessing backend (n_jobs=1)
            with parallel_backend('multiprocessing', n_jobs=1):
                best = fmin(
                    fn=objective_with_outcome,
                    space=space,
                    algo=tpe.suggest,
                    trials=Trials(),
                    max_evals=1,
                    verbose=0,
                )
            finished_at = datetime.now()
            print(f"[{finished_at}] Finished outcome {outcome_index} (Duration: {finished_at - started_at})")
            return outcome_index, best, None, None

        except Exception as e:
            # Keep the full traceback text so the caller can report it
            tb_str = traceback.format_exc()
            print(f"Error in outcome {target_outcome}:\n{tb_str}", file=sys.stderr)
            return outcome_index, None, str(e), tb_str

    # Main execution
    if __name__ == "__main__":
        start_total = datetime.now()
        print(f"Starting all optimizations at {start_total}")

        # One joblib task per outcome column; verbose=10 makes joblib report
        # more detail about what it is doing.
        results = Parallel(n_jobs=num_cores, verbose=10)(
            delayed(process_single_outcome)(i, outcome_var_list)
            for i in range(len(outcome_var_list))
        )

        # Report every result and remember the first failure
        first_error = None
        for outcome_index, best, error, traceback_str in results:
            outcome_name = outcome_var_list[outcome_index]
            # Compare against None rather than truthiness: the worker returns
            # str(e), which is "" for an exception raised without a message,
            # and such a failure must still be reported and re-raised.
            if error is not None:
                print(f"\n{'='*70}")
                print(f"Exception on fmin for {outcome_name}:")
                print(f"{'='*70}")
                print(traceback_str)
                if first_error is None:
                    first_error = (outcome_name, error, traceback_str)
            elif best is not None:
                print(f"Best parameters for {outcome_name}: {best}")
            else:
                print(f"No result for {outcome_name}")

        end_total = datetime.now()
        print(f"\nCompleted all optimizations at {end_total}")
        print(f"Total duration: {end_total - start_total}")

        # Re-raise the first failure, carrying the worker's traceback text
        if first_error is not None:
            outcome_name, error_msg, tb_str = first_error
            raise RuntimeError(
                f"Error occurred in outcome '{outcome_name}': {error_msg}\n\n"
                f"Original traceback:\n{tb_str}"
            )

In [None]:
import pandas as pd 
import os
from datetime import datetime

# Root folder that holds the experiment outputs
parent_dir = 'HFE_ML_experiments'

# First candidate: the score log sitting directly under the root folder
csv_path = os.path.join(parent_dir, 'final_grid_score_log.csv')

if os.path.exists(csv_path):
    print("CSV found directly in parent directory")
else:
    # Otherwise fall back to the newest timestamped sub-folder

    def parse_date(folder_name: str):
        """
        Read the timestamp at the start of a folder name.
        Expected format: 'YYYY-MM-DD_HH-MM-SS_...'. Returns None on mismatch.
        """
        try:
            # The timestamp occupies the first 19 characters
            return datetime.strptime(folder_name[:19], '%Y-%m-%d_%H-%M-%S')
        except (ValueError, IndexError):
            return None

    # Keep only the sub-folders whose name starts with a valid timestamp
    dated_folders = []
    for entry in os.listdir(parent_dir):
        if not os.path.isdir(os.path.join(parent_dir, entry)):
            continue
        stamp = parse_date(entry)
        if stamp is not None:
            dated_folders.append((entry, stamp))

    if not dated_folders:
        raise FileNotFoundError("No timestamped folders found and CSV not in parent directory")

    latest_folder = max(dated_folders, key=lambda pair: pair[1])[0]
    print("latest_folder", latest_folder)

    # Path of the score log inside the newest folder
    csv_path = os.path.join(parent_dir, latest_folder, 'final_grid_score_log.csv')

# Load the CSV file
df = pd.read_csv(csv_path)

# Sort the DataFrame by 'auc' column in descending order
df = df.sort_values(by='auc', ascending=False)

print(f"Total rows: {len(df)}")

# Keep the highest-AUC row for each outcome_variable. Because df is already
# sorted, the first row seen per outcome is its best one. Selecting whole rows
# avoids groupby.apply on the grouping column (deprecated in newer pandas) and
# keeps the column dtypes. The result is indexed by outcome_variable and sorted
# by it, with the column itself kept, matching the previous layout.
df_best = (
    df.dropna(subset=['outcome_variable'])
    .drop_duplicates(subset='outcome_variable', keep='first')
    .set_index('outcome_variable', drop=False)
    .sort_index()
)

# Display the result
df_best.head()

In [None]:
import pandas as pd
from datetime import datetime
from pathlib import Path

# --- Configuration ---
# Folder searched for experiment results. The path is relative, so it is
# resolved against the current working directory.
experiments_base_dir = Path('HFE_ML_experiments')

# --- Find the CSV file (try multiple locations) ---

def find_csv_file():
    """
    Locate final_grid_score_log.csv, trying these places in order:
    1. The parent of experiments_base_dir (the project root)
    2. experiments_base_dir itself
    3. The latest timestamped subfolder of experiments_base_dir

    Returns the first path that exists, or None when none of them does.
    """
    # Location 1: Project root
    root_candidate = experiments_base_dir.parent / 'final_grid_score_log.csv'
    if root_candidate.exists():
        print(f"✓ CSV found in project root: {root_candidate.resolve()}")
        return root_candidate

    # Location 2: Directly in experiments directory
    direct_candidate = experiments_base_dir / 'final_grid_score_log.csv'
    if direct_candidate.exists():
        print(f"✓ CSV found in experiments directory: {direct_candidate.resolve()}")
        return direct_candidate

    # Location 3: In latest timestamped subfolder
    latest_folder = find_latest_experiment_folder()
    if not latest_folder:
        return None

    nested_candidate = latest_folder / 'final_grid_score_log.csv'
    if nested_candidate.exists():
        print(f"✓ CSV found in latest experiment folder: {nested_candidate.resolve()}")
        return nested_candidate

    return None


def parse_date(folder_name: str):
    """
    Read the timestamp that prefixes an experiment folder name.

    Only the first 19 characters are examined and they must match
    'YYYY-MM-DD_HH-MM-SS'. Returns the parsed datetime, or None when the
    prefix does not match that format.
    """
    stamp_text = folder_name[:19]
    try:
        parsed = datetime.strptime(stamp_text, '%Y-%m-%d_%H-%M-%S')
    except (ValueError, IndexError):
        parsed = None
    return parsed


def find_latest_experiment_folder():
    """Return the subfolder of experiments_base_dir with the newest timestamp prefix.

    Returns None (after printing a warning) when experiments_base_dir is not
    an existing directory or no subfolder name parses as a timestamp.
    """
    if not (experiments_base_dir.exists() and experiments_base_dir.is_dir()):
        print(f"⚠ Experiments directory not found: {experiments_base_dir.resolve()}")
        return None

    # Pair each timestamped subfolder with its parsed date
    dated = []
    for child in experiments_base_dir.iterdir():
        if not child.is_dir():
            continue
        stamp = parse_date(child.name)
        if stamp is not None:
            dated.append((stamp, child))

    if not dated:
        print("⚠ No valid timestamped experiment folders found.")
        return None

    newest = max(dated, key=lambda pair: pair[0])[1]
    print(f"Latest experiment folder: {newest.name}")
    return newest


# --- Main Execution ---

# Find the CSV file (None when it is in none of the searched locations)
log_file_path = find_csv_file()

if log_file_path is not None:
    # Load the CSV file
    df = pd.read_csv(log_file_path)

    # Sort by AUC in descending order
    df_sorted = df.sort_values(by='auc', ascending=False)

    print(f"\n✓ Successfully loaded {len(df_sorted)} records from the log file.")

    # Best run per outcome_variable. GroupBy.first() is deliberately avoided:
    # it takes the first non-null value of each column separately, so a row
    # with missing cells would be filled in with values from other runs.
    # Selecting whole rows keeps every reported value from the same run.
    top_results_by_outcome = (
        df_sorted.dropna(subset=['outcome_variable'])
        .drop_duplicates(subset='outcome_variable', keep='first')
        .sort_values(by='outcome_variable')
        .reset_index(drop=True)
    )
    # Keep outcome_variable as the leading column, as in the previous layout
    leading_first = ['outcome_variable'] + [
        col for col in top_results_by_outcome.columns if col != 'outcome_variable'
    ]
    top_results_by_outcome = top_results_by_outcome[leading_first]

    print(f"\nTop results by outcome variable ({len(top_results_by_outcome)} outcomes):\n")

    # Display the result
    display(top_results_by_outcome)

else:
    print("\n✗ Error: Could not find 'final_grid_score_log.csv' in any expected location:")
    print(f"  - {(experiments_base_dir.parent / 'final_grid_score_log.csv').resolve()}")
    print(f"  - {(experiments_base_dir / 'final_grid_score_log.csv').resolve()}")
    print(f"  - In any timestamped subfolder within {experiments_base_dir.resolve()}")

In [None]:
import pandas as pd
import numpy as np

# Read the multiclass test dataset
data_path = 'test_data_hfe_1yr_m_small_multiclass.csv'
data = pd.read_csv(data_path)

# Overview: shape, column names and a preview
print("=== Dataset Information ===")
print(f"Shape of the dataset: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Per-column count of missing cells; only columns with at least one are shown
print("\n=== Missing Values ===")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# Columns that hold exactly one distinct (non-null) value
print("\n=== Constant Features ===")
unique_counts = data.nunique()
constant_features = unique_counts[unique_counts == 1].index.tolist()
print(f"Constant features: {constant_features}")

# float64/int64 columns whose standard deviation is below 0.01
print("\n=== Low Variance Features ===")
low_variance_features = [
    col
    for col in data.columns
    if data[col].dtype in [np.float64, np.int64] and data[col].std() < 0.01
]
print(f"Low variance features: {low_variance_features}")

# Fully duplicated rows
print("\n=== Duplicate Rows ===")
duplicate_rows = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# Class balance, shown only if a 'target' column exists
if 'target' in data.columns:  # Replace 'target' with your actual target column name
    print("\n=== Class Distribution ===")
    print(data['target'].value_counts())

# object/category columns with more than 100 distinct values
print("\n=== High Cardinality Categorical Features ===")
categorical_features = data.select_dtypes(include=['object', 'category']).columns
high_cardinality_features = [col for col in categorical_features if unique_counts[col] > 100]
print(f"High cardinality categorical features: {high_cardinality_features}")

# Rows lying more than 1.5 * IQR outside the quartiles, per numeric column
print("\n=== Outliers in Numeric Features ===")
numeric_features = data.select_dtypes(include=[np.float64, np.int64]).columns
for col in numeric_features:
    series = data[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    is_outlier = (series < q1 - 1.5 * spread) | (series > q3 + 1.5 * spread)
    n_outliers = int(is_outlier.sum())
    if n_outliers:
        print(f"Outliers in {col}: {n_outliers} rows")

# Recap of everything flagged above
print("\n=== Summary of Issues ===")
issue_lines = []
if missing_values.any():
    issue_lines.append(f"- Missing values found in {missing_values[missing_values > 0].index.tolist()}")
if constant_features:
    issue_lines.append(f"- Constant features found: {constant_features}")
if low_variance_features:
    issue_lines.append(f"- Low variance features found: {low_variance_features}")
if duplicate_rows > 0:
    issue_lines.append(f"- Duplicate rows found: {duplicate_rows}")
if high_cardinality_features:
    issue_lines.append(f"- High cardinality categorical features found: {high_cardinality_features}")
if not issue_lines:
    issue_lines.append("- No major issues found in the dataset.")
for line in issue_lines:
    print(line)

In [None]:
# Remove pandas' limit on the number of columns shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Relies on the results DataFrame `df` loaded by an earlier cell

# Row label of the highest-AUC run for each outcome_variable
best_row_labels = df.groupby('outcome_variable')['auc'].idxmax()

# Pull those rows and order them by AUC, best first, for the chart
top_auc_per_outcome = df.loc[best_row_labels].sort_values(by='auc', ascending=False)

# Plot style and canvas
sns.set_style("whitegrid")
plt.figure(figsize=(12, 8))

# Horizontal bars: one per outcome_variable, coloured by nb_size
sns.barplot(
    data=top_auc_per_outcome,
    x='auc',
    y='outcome_variable',
    hue='nb_size',
    palette='viridis',
    dodge=False,
)

# Titles, axis labels and legend (the legend describes the nb_size hue)
plt.title('Top AUC for Each Outcome Variable')
plt.xlabel('AUC')
plt.ylabel('Outcome Variable')
plt.legend(title='num features')

# Render the figure
plt.show()


In [None]:
# Presumably a progress marker showing that execution reached this cell
print("done")

In [None]:
# Import the necessary classes
from ml_grid.results_processing.core import ResultsAggregator
from ml_grid.results_processing.plot_master import MasterPlotter
import pandas as pd

# 1. Load your data using the ResultsAggregator
#    Replace with the actual path to your results and feature names file.
#    The feature_names_csv is optional but required for feature-related plots.
#    Everything stays inside one try block so that a ValueError or
#    FileNotFoundError from any step is reported by the handler below.
try:
    aggregator = ResultsAggregator(
        root_folder='HFE_ML_experiments',
        feature_names_csv='test_data_hfe_1yr_m_small_multiclass.csv')
    # NOTE: rebinds results_df, which earlier cells may also have assigned
    results_df = aggregator.aggregate_all_runs()

    # 2. Instantiate the MasterPlotter with the aggregated results
    master_plotter = MasterPlotter(results_df)

    # 3. Call the plot_all() method to generate all visualizations
    #    You can customize the primary metric and other options.
    master_plotter.plot_all(metric='auc', stratify_by_outcome=True)

except (ValueError, FileNotFoundError) as e:
    # Report the problem instead of raising; other exception types still propagate
    print(f"An error occurred: {e}")
    print("Please ensure your results folder path is correct and contains valid run data.")

