In [None]:
import os
import subprocess
import warnings
import sys
import pandas as pd
from hyperopt import fmin, tpe, hp
from hyperopt.pyll import scope
# NOTE: PYTHONWARNINGS is consulted when an interpreter starts, so this assignment does
# not alter the filters of the already-running kernel; it is inherited by child
# processes launched afterwards.
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"
# Suppress every warning category inside the current kernel process.
warnings.filterwarnings('ignore') 
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict CUDA device visibility to index "1" for anything that reads this variable
# (presumably has to be set before a CUDA-backed library initialises -- TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# Switch: set to True to pin the notebook kernel to a fixed range of CPU cores.
limit_cpu_cores = False

if not limit_cpu_cores:
    print("CPU core binding is disabled.")
else:
    # The kernel's own process id is the target of the affinity change.
    pid = os.getpid()
    print(f"Notebook PID: {pid}")

    # Cores handed to `taskset` (e.g., cores 0-3)
    core_range = "0-3"

    try:
        # `check=True` turns a non-zero exit status into CalledProcessError.
        subprocess.run(["taskset", "-cp", core_range, str(pid)], check=True)
    except FileNotFoundError:
        # The `taskset` executable could not be located.
        print("Error: 'taskset' command not found. Please ensure it is installed.")
    except subprocess.CalledProcessError as e:
        print(f"Error while setting CPU affinity: {e}")
    else:
        print(f"Successfully bound Notebook PID {pid} to CPU cores {core_range}.")


In [None]:
from ml_grid.util import grid_param_space

In [None]:
import ipywidgets as ipw
# ipywidgets Output widget instance; none of the cells shown in this notebook reference `output` again.
output = ipw.Output()

In [None]:
# List-based settings grid: each key maps to the list of candidate values for that
# setting. 'data' holds a single dict of per-data-source on/off candidates.
grid = dict(
    resample=['undersample', 'oversample', None],
    scale=[True, False],
    feature_n=[100, 95, 75, 50, 25, 5],
    param_space_size=['medium', 'xsmall'],
    n_unique_out=[10],
    outcome_var_n=['1'],
    percent_missing=[99, 95, 80],  #n/100 ex 95 for 95% # 99.99, 99.5, 9
    corr=[0.98, 0.85, 0.5, 0.25],
    data=[
        dict(
            age=[True, False],
            sex=[True, False],
            bmi=[True],
            ethnicity=[True, False],
            bloods=[True, False],
            diagnostic_order=[True, False],
            drug_order=[True, False],
            annotation_n=[True, False],
            meta_sp_annotation_n=[True, False],
            annotation_mrc_n=[True, False],
            meta_sp_annotation_mrc_n=[True, False],
            core_02=[False],
            bed=[False],
            vte_status=[True],
            hosp_site=[True],
            core_resus=[False],
            news=[False],
            date_time_stamp=[False],
        )
    ],
)

In [None]:


# Hyperopt search space. Every entry is an `hp.choice` whose label equals its key;
# the nested 'data' dict holds one choice per data source.
space = {
    **{
        label: hp.choice(label, options)
        for label, options in (
            ('resample', ['undersample', 'oversample', None]),
            ('scale', [True, False]),
            ('feature_n', [100, 95, 75, 50, 25, 5]),
            ('param_space_size', ['medium', 'xsmall']),
            ('n_unique_out', [10]),
            ('outcome_var_n', ['1']),
            ('percent_missing', [99, 95, 80]),
            ('corr', [0.98, 0.85, 0.5, 0.25]),
            ('feature_selection_method', ['anova', 'markov_blanket']),
        )
    },
    'data': {
        label: hp.choice(label, options)
        for label, options in (
            ('age', [True, False]),
            ('sex', [True, False]),
            ('bmi', [True]),
            ('ethnicity', [True, False]),
            ('bloods', [True, False]),
            ('diagnostic_order', [True, False]),
            ('drug_order', [True, False]),
            ('annotation_n', [True, False]),
            ('meta_sp_annotation_n', [True, False]),
            ('annotation_mrc_n', [True, False]),
            ('meta_sp_annotation_mrc_n', [True, False]),
            ('core_02', [False]),
            ('bed', [False]),
            ('vte_status', [True]),
            ('hosp_site', [True]),
            ('core_resus', [False]),
            ('news', [False]),
            ('date_time_stamp', [False]),
            ('appointments', [False]),
        )
    },
}


In [None]:
# Breast cancer sample space:
# Same layout as `space`, with a reduced `feature_n` list and every data source
# switched off except 'bloods'.
# NOTE(review): unlike `space`, this dict has no 'feature_selection_method' or
# 'appointments' entry, and the fmin calls visible in this notebook pass `space`
# rather than this dict -- confirm whether that is intended.

space_breast_cancer = {
    **{
        label: hp.choice(label, options)
        for label, options in (
            ('resample', ['undersample', 'oversample', None]),
            ('scale', [True, False]),
            ('feature_n', [25, 5]),
            ('param_space_size', ['medium', 'xsmall']),
            ('n_unique_out', [10]),
            # Optimise for alternate representations of outcome variable.
            ('outcome_var_n', ['1']),
            ('percent_missing', [99, 95, 80]),
            ('corr', [0.98, 0.85, 0.5, 0.25]),
        )
    },
    'data': {
        label: hp.choice(label, [enabled])
        for label, enabled in (
            ('age', False),
            ('sex', False),
            ('bmi', False),
            ('ethnicity', False),
            ('bloods', True),
            ('diagnostic_order', False),
            ('drug_order', False),
            ('annotation_n', False),
            ('meta_sp_annotation_n', False),
            ('annotation_mrc_n', False),
            ('meta_sp_annotation_mrc_n', False),
            ('core_02', False),
            ('bed', False),
            ('vte_status', False),
            ('hosp_site', False),
            ('core_resus', False),
            ('news', False),
            ('date_time_stamp', False),
        )
    },
}

#### Setup the logger

In [None]:
from ml_grid.util.logger_setup import setup_logger
import logging

# Master switch for the optional logging configuration used in this notebook.
enable_logging = False

if enable_logging:
    # Raise matplotlib's font-manager logger to INFO so its debug messages are dropped.
    logger = logging.getLogger('matplotlib.font_manager')
    logger.setLevel(logging.INFO)

In [None]:
# Per-model on/off switches; the dict is handed to the pipeline as `model_class_dict`.
# Flip a value to False to exclude that model class.

model_class_dict = dict(
    LogisticRegression_class=True,
    knn_classifiers_class=True,
    quadratic_discriminant_analysis_class=True,
    SVC_class=True,
    XGB_class_class=True,
    mlp_classifier_class=True,
    RandomForestClassifier_class=True,
    GradientBoostingClassifier_class=True,
    CatBoost_class=True,
    GaussianNB_class=True,
    LightGBMClassifierWrapper=True,
    adaboost_class=True,
    kerasClassifier_class=True,
    knn__gpu_wrapper_class=True,
    NeuralNetworkClassifier_class=True,
    TabTransformer_class=False,
)

In [None]:
import ml_grid
import pathlib
import datetime
from tqdm import tqdm
import random
from IPython.display import clear_output
import pandas as pd


from ml_grid.model_classes.h2o_classifier_class import h2o_classifier_class
from ml_grid.util.project_score_save import project_score_save_class

from ml_grid.pipeline.data import pipe

from ml_grid.util.param_space import ParamSpace

# Seed Python's `random` module (it supplies the run identifiers drawn in `objective`).
random.seed(1234)

# Parent folder for all runs, and the switch between the single-outcome and the
# multi-outcome example datasets.
base_project_dir_global = 'HFE_ML_experiments/'

multiple_outcomes_example = True

if enable_logging:
    logger = setup_logger(log_folder_path=base_project_dir_global)

    # The name is immediately rebound to this module-level logger.
    logger = logging.getLogger(__name__)

    class ByteflowFilter(logging.Filter):
        """Let through only records whose logger name starts with 'numba.core.byteflow'."""

        def filter(self, record):
            return record.name.startswith('numba.core.byteflow')

    # Attach the filter to the module-level logger.
    logger.addFilter(ByteflowFilter())

pathlib.Path(base_project_dir_global).mkdir(parents=True, exist_ok=True)

# Timestamp (12-hour clock plus AM/PM) naming this run's own sub-folder.
st_time = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M-%S_%p")

base_project_dir = 'HFE_ML_experiments/' + st_time + "/"
additional_naming = "HFE_ML_Grid_"

print(base_project_dir)

pathlib.Path(base_project_dir).mkdir(parents=True, exist_ok=True)

# Pick the input dataset that matches the example mode chosen above.
if multiple_outcomes_example:
    input_csv_path = 'test_data_hfe_1yr_m_small_multiclass.csv'
else:
    input_csv_path = 'breast_cancer_dataset.csv'

#input_csv_path = os.path.join('..', 'gloabl_files', 'ml_binary_classification_gridsearch_hyperOpt', 'notebooks' ,'test_data_hfe_1yr_m_small.csv') #large

# Initialise the csv that stores each local project's results.
project_score_save_class(base_project_dir)

n_iter = 1000

# Iterator over sampled settings from the list-based grid (only referenced by
# commented-out code in the visible cells).
grid_iter_obj = grid_param_space.Grid(sample_n=n_iter).settings_list_iterator


def objective(local_param_dict, outcome_var=None):
    """Evaluate one sampled settings dictionary with the ml_grid pipeline.

    Intended as the ``fn`` argument of ``hyperopt.fmin``.

    Args:
        local_param_dict: Settings for this run (one sample drawn from the search
            space); forwarded to ``pipe`` and to ``main.run``.
        outcome_var: Optional outcome column name passed to the pipeline as
            ``outcome_var_override``. ``None`` passes no override.

    Returns:
        dict: ``{"loss": 1 - highest_score, "status": "ok"}``, where
        ``highest_score`` is the second value returned by
        ``main.run(...).execute()`` (presumably an AUC -- TODO confirm).
    """
    clear_output()
    print(local_param_dict)

    # Random identifier handed to the pipeline as `param_space_index`.
    idx = random.randint(0,999999999999999999999)

    # Build the pipeline object from the sampled settings.
    ml_grid_object = pipe(input_csv_path,
                          drop_term_list=['chrom', 'hfe', 'phlebo'],
                          local_param_dict=local_param_dict,
                          base_project_dir=base_project_dir,
                          additional_naming=additional_naming,
                          test_sample_n=0,
                          param_space_index=idx,
                          model_class_dict=model_class_dict,
                          outcome_var_override=outcome_var
                          #outcome_var_override = None #override outcome var, example = 'outcome_var_myeloma'
                          )

    from ml_grid.pipeline import main

    # Example: overwrite the model class list
    # temp_param_space_size = ParamSpace(ml_grid_object.local_param_dict.get("param_space_size"))
    # ml_grid_object.model_class_list = [h2o_classifier_class(
    #             X=ml_grid_object.X_train,
    #             y=ml_grid_object.y_train,
    #             parameter_space_size=temp_param_space_size,
    #         )]
    #
    # Example: append to the model class list
    # if(ml_grid_object.time_series_mode == False):
    #     temp_param_space_size = ParamSpace(ml_grid_object.local_param_dict.get("param_space_size"))
    #     ml_grid_object.model_class_list.extend([h2o_classifier_class(
    #                 X=ml_grid_object.X_train,
    #                 y=ml_grid_object.y_train,
    #                 parameter_space_size=temp_param_space_size,
    #             )])

    # Evaluate the object; the run returns its errors and its highest score.
    _errors, highest_score = main.run(ml_grid_object, local_param_dict=local_param_dict).execute()

    # The score comes straight from the run. The shared 'final_grid_score_log.csv'
    # is deliberately not re-read here: its contents were never used, and reading a
    # file that parallel workers may be writing to only adds a failure point.
    return {
        "loss": 1 - float(highest_score),
        "status": "ok",  # Indicate that the evaluation was successful
    }
    
     

In [None]:
#objective(next(grid_iter_obj))

In [None]:
from ml_grid.util.global_params import global_parameters

# Show the current global settings: every attribute of `global_parameters` and its value.
print(vars(global_parameters))

# Announce verbose mode when the configured debug level is above 1.
if global_parameters.debug_level > 1:
    print("Debug Mode: Additional logging enabled.")

# To change the global settings, uncomment and adapt:
#global_parameters.update_parameters(debug_level=0, grid_n_jobs = -1, error_raise = True, max_param_space_iter_value=1 )

In [None]:
from ml_grid.util.global_params import global_parameters

#print all attributes and their values
# NOTE: this repeats the dump done in the previous cell.

print(vars(global_parameters))

In [None]:
# results_df = pd.read_csv(base_project_dir + 'final_grid_score_log.csv')
    
# highest_metric_from_run = results_df[results_df['i'] == str(900424809465212743016)].sort_values(by='auc')['auc'].iloc[-1]

# highest_metric_from_run

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Fresh hyperopt trial store. The fmin calls in the later cells build their own
# Trials objects, so this particular instance is not the one they receive.
trials = Trials()

In [None]:
from functools import partial

In [None]:
#%%prun
# Single-outcome example: one hyperopt search against a fixed outcome column.
if not multiple_outcomes_example:

    # NOTE(review): `space_breast_cancer` is defined in this notebook, yet the search
    # below uses `space` -- confirm which space is intended for this dataset.
    outcome_var = 'outcome_var_1'  # Define your outcome_var

    # Trials object that collects the results of this search.
    trials = Trials()

    # Bind the extra argument so hyperopt can call the objective with params only.
    objective_with_outcome = partial(objective, outcome_var=outcome_var)

    best = fmin(
        fn=objective_with_outcome,
        space=space,
        algo=tpe.suggest,
        max_evals=1,
        trials=trials,
        verbose=1,
    )

    print("Best hyperparameters:", best)

In [None]:
# Show the best hyperparameters of the single-outcome run. IPython renders only a
# top-level final expression, so a bare `best` inside an `if` body shows nothing;
# the conditional expression keeps the cell silent in multi-outcome mode.
best if not multiple_outcomes_example else None

In [None]:
# Single-outcome mode only: load the score log stored in the current run folder.
if not multiple_outcomes_example:
    results_df = pd.read_csv(base_project_dir + 'final_grid_score_log.csv')

In [None]:
# Highest-AUC row of the single-outcome results. Written as a top-level expression
# because IPython does not render an expression nested inside an `if` body.
results_df.sort_values('auc', ascending=False).iloc[0] if not multiple_outcomes_example else None

In [None]:
# Single-outcome results ordered by AUC, best first. Written as a top-level
# expression because IPython does not render an expression nested inside an `if` body.
results_df.sort_values('auc', ascending=False) if not multiple_outcomes_example else None

In [None]:
if multiple_outcomes_example:
    # A single row is enough to expose the column names of the multi-outcome dataset.
    dft = pd.read_csv('test_data_hfe_1yr_m_small_multiclass.csv', nrows=1)

# Top-level expression so the frame is actually rendered (a bare name inside the
# `if` body would not be); nothing is shown in single-outcome mode.
dft if multiple_outcomes_example else None

In [None]:
# Collect the outcome columns: every column whose name contains "outcome_var_".

if multiple_outcomes_example:
    outcome_var_list = [col for col in dft.columns if "outcome_var_" in col]

# Top-level expression so the list is rendered; nothing is shown in single-outcome mode.
outcome_var_list if multiple_outcomes_example else None

In [None]:
# Multiple outcomes one vs rest

In [None]:
# `outcome_var_list` only exists in multi-outcome mode; the guard avoids a NameError
# when the notebook is run top to bottom in single-outcome mode.
outcome_var_list if multiple_outcomes_example else None

In [None]:
#%%prun
if multiple_outcomes_example:

    import multiprocessing
    # NOTE: rebinds the notebook-level name `datetime` from the module to the class.
    from datetime import datetime
    from hyperopt import fmin, tpe, Trials
    from joblib import Parallel, delayed
    from joblib import parallel_backend  # context manager selecting the joblib backend

    # Worker count: the CPU count reported by `multiprocessing.cpu_count()`.
    num_cores = multiprocessing.cpu_count()

    def process_single_outcome(outcome_index, outcome_var_list):
        """Run one hyperopt search for a single outcome variable.

        Args:
            outcome_index: Position of the outcome in ``outcome_var_list``.
            outcome_var_list: List of outcome column names.

        Returns:
            tuple: ``(outcome_index, best, error)``. On success ``best`` is the value
            returned by ``fmin`` and ``error`` is ``None``; on any exception ``best``
            is ``None`` and ``error`` is ``str(exception)``.
        """
        outcome_var = outcome_var_list[outcome_index]
        start_time = datetime.now()
        print(f"[{start_time}] Starting outcome {outcome_index}: {outcome_var}")

        # Adapt `objective` to the one-argument form that fmin expects.
        def objective_with_outcome(params):
            print(f"Evaluating params: {params} for outcome {outcome_var}")
            return objective(params, outcome_var)

        try:
            # Use joblib's multiprocessing backend for scikit-learn operations
            with parallel_backend('multiprocessing', n_jobs=1):
                best = fmin(
                    fn=objective_with_outcome,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=1,
                    trials=Trials(),
                    verbose=0
                )
            end_time = datetime.now()
            print(f"[{end_time}] Finished outcome {outcome_index} (Duration: {end_time - start_time})")
            return outcome_index, best, None

        except Exception as e:
            # Best effort: report the failure and carry on with the other outcomes.
            print(f"Error in outcome {outcome_var}: {str(e)}")
            #raise e
            return outcome_index, None, str(e)

    # Main execution
    if __name__ == "__main__":
        start_total = datetime.now()
        print(f"Starting all optimizations at {start_total}")

        # One task per outcome variable, spread over the worker processes.
        results = Parallel(n_jobs=num_cores)(
            delayed(process_single_outcome)(i, outcome_var_list)
            for i, _ in enumerate(outcome_var_list)
        )

        # Process results
        for outcome_index, best, error in results:
            # Compare against None: an exception with an empty message yields '',
            # which must still be reported as a failure rather than "No result".
            if error is not None:
                print(f"Exception on fmin for {outcome_var_list[outcome_index]}: {error}")
            elif best is not None:
                print(f"Best parameters for {outcome_var_list[outcome_index]}: {best}")
            else:
                print(f"No result for {outcome_var_list[outcome_index]}")

        end_total = datetime.now()
        print(f"\nCompleted all optimizations at {end_total}")
        print(f"Total duration: {end_total - start_total}")

In [None]:
import pandas as pd 

import os
import pandas as pd
from datetime import datetime

# Root directory under which the timestamped run folders are created.
parent_dir = 'HFE_ML_experiments'

# Names of the immediate sub-directories of `parent_dir` (plain files are skipped).
folders = list(
    filter(
        lambda name: os.path.isdir(os.path.join(parent_dir, name)),
        os.listdir(parent_dir),
    )
)

# Helper used to pick the most recent run folder by its timestamped name.
def parse_date(folder_name):
    """Convert a run-folder name into a ``datetime``.

    Args:
        folder_name: Folder name expected to follow '%Y-%m-%d_%I-%M-%S_%p'.

    Returns:
        The parsed ``datetime``, or ``None`` when the name does not match the format.
    """
    parsed = None
    try:
        parsed = datetime.strptime(folder_name, '%Y-%m-%d_%I-%M-%S_%p')
    except ValueError:
        # Not a timestamped run folder; the caller filters these out.
        pass
    return parsed

# Pair every folder with its parsed timestamp and keep only the ones that parsed.
folders_with_dates = [
    (name, stamp)
    for name, stamp in ((name, parse_date(name)) for name in folders)
    if stamp is not None
]

# Fail with a clear message instead of the opaque ValueError that `max()` raises
# on an empty sequence when no timestamped run folder exists yet.
if not folders_with_dates:
    raise FileNotFoundError(
        f"No timestamped run folders found in '{parent_dir}'."
    )

latest_folder = max(folders_with_dates, key=lambda pair: pair[1])[0]  # most recent run

print("latest_folder",latest_folder)

# Construct the path to the CSV file in the latest folder
csv_path = os.path.join(parent_dir, latest_folder, 'final_grid_score_log.csv')

# Load the CSV file
df = pd.read_csv(csv_path)

# Sort the DataFrame by 'auc' column in descending order
df = df.sort_values(by='auc', ascending=False)

print(len(df))

# Keep the first (highest-AUC) row per outcome_variable. `groupby().head(1)` selects
# the same rows as `apply(lambda x: x.iloc[0])` without relying on the deprecated
# behaviour of `apply` operating on the grouping column; rows keep their original
# index and stay ordered by AUC.
df = df.groupby('outcome_variable').head(1)
# Display the result
df.head()




In [None]:
import pandas as pd
import numpy as np

# Load the data
data_path = 'test_data_hfe_1yr_m_small_multiclass.csv'
data = pd.read_csv(data_path)

# Display basic information about the dataset
print("=== Dataset Information ===")
print(f"Shape of the dataset: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Check for missing values
print("\n=== Missing Values ===")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# Check for constant features
print("\n=== Constant Features ===")
constant_features = [col for col in data.columns if data[col].nunique() == 1]
print(f"Constant features: {constant_features}")

# Numeric columns of any width (not only float64/int64); booleans are not included.
numeric_features = data.select_dtypes(include=[np.number]).columns

# Check for features with very low variance (almost constant)
print("\n=== Low Variance Features ===")
# Threshold for low variance: standard deviation below 0.01.
low_variance_features = [col for col in numeric_features if data[col].std() < 0.01]
print(f"Low variance features: {low_variance_features}")

# Check for duplicate rows
print("\n=== Duplicate Rows ===")
duplicate_rows = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# Check for class distribution (if it's a classification problem).
# Outcome columns are recognised the same way as elsewhere in this notebook
# (name contains "outcome_var_"); a column literally named 'target' is still honoured.
target_columns = [col for col in data.columns if col == 'target' or 'outcome_var_' in col]
if target_columns:
    print("\n=== Class Distribution ===")
    for col in target_columns:
        print(data[col].value_counts())

# Check for categorical features with high cardinality
print("\n=== High Cardinality Categorical Features ===")
categorical_features = data.select_dtypes(include=['object', 'category']).columns
high_cardinality_features = [col for col in categorical_features if data[col].nunique() > 100]
print(f"High cardinality categorical features: {high_cardinality_features}")

# Check for outliers in numeric features (using IQR)
print("\n=== Outliers in Numeric Features ===")
for col in numeric_features:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Count flagged rows from the boolean mask instead of building a filtered frame.
    outlier_count = int(((data[col] < lower_bound) | (data[col] > upper_bound)).sum())
    if outlier_count:
        print(f"Outliers in {col}: {outlier_count} rows")

# Summary of issues
print("\n=== Summary of Issues ===")
has_missing_values = bool(missing_values.any())
if has_missing_values:
    print(f"- Missing values found in {missing_values[missing_values > 0].index.tolist()}")
if constant_features:
    print(f"- Constant features found: {constant_features}")
if low_variance_features:
    print(f"- Low variance features found: {low_variance_features}")
if duplicate_rows > 0:
    print(f"- Duplicate rows found: {duplicate_rows}")
if high_cardinality_features:
    print(f"- High cardinality categorical features found: {high_cardinality_features}")
if not (
    has_missing_values
    or constant_features
    or low_variance_features
    or duplicate_rows
    or high_cardinality_features
):
    print("- No major issues found in the dataset.")

In [None]:
import pandas as pd 

import os
import pandas as pd
from datetime import datetime

# Root directory under which the timestamped run folders are created.
parent_dir = 'HFE_ML_experiments'

# Names of the immediate sub-directories of `parent_dir` (plain files are skipped).
folders = list(
    filter(
        lambda name: os.path.isdir(os.path.join(parent_dir, name)),
        os.listdir(parent_dir),
    )
)

# Helper used to pick the most recent run folder by its timestamped name.
def parse_date(folder_name):
    """Convert a run-folder name into a ``datetime``.

    Args:
        folder_name: Folder name expected to follow '%Y-%m-%d_%I-%M-%S_%p'.

    Returns:
        The parsed ``datetime``, or ``None`` when the name does not match the format.
    """
    parsed = None
    try:
        parsed = datetime.strptime(folder_name, '%Y-%m-%d_%I-%M-%S_%p')
    except ValueError:
        # Not a timestamped run folder; the caller filters these out.
        pass
    return parsed

# Pair every folder with its parsed timestamp and keep only the ones that parsed.
folders_with_dates = [
    (name, stamp)
    for name, stamp in ((name, parse_date(name)) for name in folders)
    if stamp is not None
]

# Fail with a clear message instead of the opaque ValueError that `max()` raises
# on an empty sequence when no timestamped run folder exists yet.
if not folders_with_dates:
    raise FileNotFoundError(
        f"No timestamped run folders found in '{parent_dir}'."
    )

latest_folder = max(folders_with_dates, key=lambda pair: pair[1])[0]  # most recent run

print("latest_folder",latest_folder)

# Construct the path to the CSV file in the latest folder
csv_path = os.path.join(parent_dir, latest_folder, 'final_grid_score_log.csv')

# Load the CSV file
df = pd.read_csv(csv_path)

# Sort the DataFrame by 'auc' column in descending order
df = df.sort_values(by='auc', ascending=False)

print(len(df))
# Display the result
df.head()




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects `df` to hold the score log loaded in the previous cell.

# Best row per outcome_variable (by AUC), ordered from highest to lowest AUC.
top_auc_per_outcome = (
    df.loc[df.groupby('outcome_variable')['auc'].idxmax()]
    .sort_values(by='auc', ascending=False)
)

# Figure styling and size.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 8))

# Horizontal bars: one per outcome variable, coloured by the 'nb_size' column.
ax = sns.barplot(
    data=top_auc_per_outcome,
    x='auc',
    y='outcome_variable',
    hue='nb_size',
    dodge=False,
    palette='viridis',
)

# Titles, axis labels and legend caption.
ax.set_title('Top AUC for Each Outcome Variable')
ax.set_xlabel('AUC')
ax.set_ylabel('Outcome Variable')
ax.legend(title='num features')

# Display the plot
plt.show()


In [None]:
# Progress marker: printed once execution reaches this cell.
print("done")

In [None]:
# Import the necessary classes
from ml_grid.results_processing.core import ResultsAggregator
from ml_grid.results_processing.plot_master import MasterPlotter
import pandas as pd

# 1. Load your data using the ResultsAggregator
#    Replace with the actual path to your results and feature names file.
#    The feature_names_csv is optional but required for feature-related plots.
# Only ValueError and FileNotFoundError are handled below; any other exception
# raised by the aggregation or plotting calls propagates out of the cell.
try:
    aggregator = ResultsAggregator(
        root_folder='HFE_ML_experiments',
        feature_names_csv='test_data_hfe_1yr_m_small_multiclass.csv')
    # NOTE: rebinds `results_df`, a name that earlier cells also assign.
    results_df = aggregator.aggregate_all_runs()

    # 2. Instantiate the MasterPlotter with your data
    master_plotter = MasterPlotter(results_df)

    # 3. Call the plot_all() method to generate all visualizations
    #    You can customize the primary metric and other options.
    master_plotter.plot_all(metric='auc', stratify_by_outcome=True)

except (ValueError, FileNotFoundError) as e:
    # Report the problem and continue instead of stopping the notebook.
    print(f"An error occurred: {e}")
    print("Please ensure your results folder path is correct and contains valid run data.")

