In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon

import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load Random Search results from best_models_summary.csv
rs_best_models = pd.read_csv('RandomForestData/best_models_summary.csv')

# Extract baseline brier scores (these are from RandomForest.ipynb baseline runs)
baseline_brier_scores = [0.1892, 0.1819, 0.1067, 0.1520]  # Bank, Gym, Heart Disease, Titanic

# Extract Random Search best parameters and test scores for each dataset
import ast

random_brier_adj = []
random_best_params = []

for dataset_name in ['Bank', 'Gym', 'Heart Disease', 'Titanic']:
    row = rs_best_models[rs_best_models['dataset'] == dataset_name].iloc[0]
    random_brier_adj.append(row['brier_score'])
    random_best_params.append(ast.literal_eval(row['params']))

print("Random Search - Loaded from files:")
for ds, brier in zip(['Bank', 'Gym', 'Heart Disease', 'Titanic'], random_brier_adj):
    print(f"  {ds}: {brier}")

Random Search - Loaded from files:
  Bank: 0.1858421833675464
  Gym: 0.1534136800030513
  Heart Disease: 0.1277791441181989
  Titanic: 0.1104537560930364


In [25]:
# Load Bayesian Optimization results from all_bayesian_results.csv
bayes_results = pd.read_csv('RandomForestData/all_bayesian_results.csv')

# Extract Bayesian best parameters and test scores for each dataset
bayes_brier_adj = []
bayes_best_params = []

for dataset_name in ['Bank', 'Gym', 'Heart Disease', 'Titanic']:
    # Filter for this dataset and completed trials
    ds_trials = bayes_results[bayes_results['dataset'] == dataset_name]
    ds_completed = ds_trials[ds_trials['state'] == 1]  # State 1 = COMPLETE
    
    # Get best trial (lowest brier_score)
    best_trial = ds_completed.loc[ds_completed['brier_score'].idxmin()]
    
    # Extract test brier score (unique per dataset)
    bayes_brier_adj.append(best_trial['test_brier_score'])
    
    # Extract parameters
    params = {
        'n_estimators': int(best_trial['n_estimators']),
        'criterion': best_trial['criterion'],
        'max_depth': None if pd.isna(best_trial['max_depth']) else int(best_trial['max_depth']),
        'min_samples_split': int(best_trial['min_samples_split']),
        'min_samples_leaf': int(best_trial['min_samples_leaf']),
        'max_features': best_trial['max_features'] if best_trial['max_features'] == 'sqrt' else float(best_trial['max_features']),
        'max_samples': None if pd.isna(best_trial['max_samples']) else float(best_trial['max_samples'])
    }
    bayes_best_params.append(params)

print("\nBayesian Optimization - Loaded from files:")
for ds, brier in zip(['Bank', 'Gym', 'Heart Disease', 'Titanic'], bayes_brier_adj):
    print(f"  {ds}: {brier}")


Bayesian Optimization - Loaded from files:
  Bank: 0.1843225378562384
  Gym: 0.1786213871377209
  Heart Disease: 0.1044106469027575
  Titanic: 0.1520592793866828


In [26]:
# Create random dict from loaded data
random = {
    'dataset': ['bank', 'gym', 'heart', 'titanic'],
    'brier_adj': random_brier_adj,
    'baseline_brier': baseline_brier_scores,
    'best_params': random_best_params
}

print("\nRandom Search dict created from files:")
print(f"  Datasets: {random['dataset']}")
print(f"  Brier scores: {random['brier_adj']}")


Random Search dict created from files:
  Datasets: ['bank', 'gym', 'heart', 'titanic']
  Brier scores: [np.float64(0.1858421833675464), np.float64(0.1534136800030513), np.float64(0.1277791441181989), np.float64(0.1104537560930364)]


In [27]:
# Create bayes dict from loaded data
bayes = {
    'dataset': ['bank', 'gym', 'heart', 'titanic'],
    'brier_adj': bayes_brier_adj,
    'baseline_brier': baseline_brier_scores,
    'best_params': bayes_best_params
}

print("\nBayesian Optimization dict created from files:")
print(f"  Datasets: {bayes['dataset']}")
print(f"  Brier scores: {bayes['brier_adj']}")


Bayesian Optimization dict created from files:
  Datasets: ['bank', 'gym', 'heart', 'titanic']
  Brier scores: [np.float64(0.1843225378562384), np.float64(0.1786213871377209), np.float64(0.1044106469027575), np.float64(0.1520592793866828)]


In [28]:
bayes_df = pd.DataFrame(bayes)
random_df = pd.DataFrame(random)

In [29]:
X1 = pd.read_csv("preprocessed_datasets/bank_data.csv")
y1 = pd.read_csv("preprocessed_datasets/bank_target.csv").squeeze()
X2 = pd.read_csv("preprocessed_datasets/gym_data.csv")
y2 = pd.read_csv("preprocessed_datasets/gym_target.csv").squeeze()
X3 = pd.read_csv("preprocessed_datasets/heartDisease_data.csv")
y3 = pd.read_csv("preprocessed_datasets/heartDisease_target.csv").squeeze()
X4 = pd.read_csv("preprocessed_datasets/titanic_data.csv")
y4 = pd.read_csv("preprocessed_datasets/titanic_target.csv").squeeze()

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42, stratify=y2)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42, stratify=y3)
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=42, stratify=y4)


datasets = [(X1_train, X1_test, y1_train, y1_test),
            (X2_train, X2_test, y2_train, y2_test),
            (X3_train, X3_test, y3_train, y3_test),
            (X4_train, X4_test, y4_train, y4_test)]

In [41]:
df = pd.concat([random_df, bayes_df])
df

Unnamed: 0,dataset,brier_adj,baseline_brier,best_params
0,bank,0.185842,0.1892,"{'criterion': 'entropy', 'max_depth': None, 'm..."
1,gym,0.153414,0.1819,"{'criterion': 'log_loss', 'max_depth': None, '..."
2,heart,0.127779,0.1067,"{'criterion': 'entropy', 'max_depth': 10, 'max..."
3,titanic,0.110454,0.152,"{'criterion': 'entropy', 'max_depth': None, 'm..."
0,bank,0.184323,0.1892,"{'n_estimators': 1375, 'criterion': 'entropy',..."
1,gym,0.178621,0.1819,"{'n_estimators': 412, 'criterion': 'log_loss',..."
2,heart,0.104411,0.1067,"{'n_estimators': 1449, 'criterion': 'entropy',..."
3,titanic,0.152059,0.152,"{'n_estimators': 1216, 'criterion': 'log_loss'..."


In [35]:
for i in range(4):
    bayes_params = bayes_best_params[i]
    random_params = random_best_params[i]
    
    model = RandomForestClassifier(**bayes_params, random_state=42, n_jobs=-1)
    model.fit(datasets[i][0], datasets[i][2])
    print(f"bayes:    {brier_score_loss(datasets[i][3], model.predict_proba(datasets[i][1])[:, 1]) } ")


    model = RandomForestClassifier(**random_params, random_state=42, n_jobs=-1)
    model.fit(datasets[i][0], datasets[i][2])
    print(f"random:    {brier_score_loss(datasets[i][3], model.predict_proba(datasets[i][1])[:, 1]) } ")



bayes:    0.1843225378562384 
random:    0.18471559600566623 
bayes:    0.17862138713772094 
random:    0.17750083281720883 
bayes:    0.1044106469027575 
random:    0.10497195672091833 
bayes:    0.1520592793866828 
random:    0.15155394282929383 
