In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from boruta import BorutaPy
import warnings
import os
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from joblib import dump, load


In [2]:
def correct_imbalance(X, y, method='SMOTE'):
    if method == 'SMOTE':
        resampler = SMOTE(random_state=42)
    elif method == 'undersampling':
        resampler = RandomUnderSampler(random_state=42)
    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res

def apply_boruta(X, y, max_depth=5, n_estimators='auto', random_state=42):
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=max_depth, random_state=random_state)
    boruta_selector = BorutaPy(rf, n_estimators=n_estimators, random_state=random_state)
    boruta_selector.fit(X.values, y.values)
    selected_features = X.columns[boruta_selector.support_].tolist()
    X_reduced = X.loc[:, selected_features]
    return X_reduced


def apply_pca(X, n_components=0.95):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)
    return X_reduced

In [3]:
df = pd.read_csv('../ich_data_w_scores_modified.csv')

# Drop specified score columns and reset index
score_cols = ["oICH_score", "mICH_score", "ICH_GS_score", "LSICH_score", "ICH_FOS_score", "Max_ICH_score"]
df = df.drop(columns=score_cols).reset_index(drop=True)

# Features and targets
X = df.drop(columns=['MORT90', 'MRS90'])
y_mort90 = df['MORT90']
y_mrs90 = df['MRS90'].apply(lambda x: 0 if x <= 3 else 1)  # Binarizing MRS90

# Splitting the dataset into training and temporary test sets
X_train, X_test, y_train_mort90, y_test_mort90 = train_test_split(X, y_mort90, test_size=0.2, random_state=42)
X_train, X_test, y_train_mrs90, y_test_mrs90 = train_test_split(X, y_mrs90, test_size=0.2, random_state=42)


In [4]:
# Apply SMOTE and undersampling after splitting, only to training data
X_train_mort90_smote, y_train_mort90_smote = correct_imbalance(X_train, y_train_mort90, method='SMOTE')
X_train_mrs90_smote, y_train_mrs90_smote = correct_imbalance(X_train, y_train_mrs90, method='SMOTE')
X_train_mort90_undersample, y_train_mort90_undersample = correct_imbalance(X_train, y_train_mort90, method='undersampling')
X_train_mrs90_undersample, y_train_mrs90_undersample = correct_imbalance(X_train, y_train_mrs90, method='undersampling')


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Example of applying Boruta
rf = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=42)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X_train_scaled, y_train_mort90)
X_train_boruta = boruta.transform(X_train_scaled)
X_test_boruta = boruta.transform(X_test_scaled)

In [5]:
# model_lr = LogisticRegression(max_iter=1000)
# param_grid_lr = {
#     'C': [0.01, 0.1, 1],  # Regularization strength
#     'solver': ['liblinear', 'lbfgs']  # Optimization algorithm
# }

# # Grid search using the validation set for scoring
# grid_search_lr = GridSearchCV(model_lr, param_grid_lr, cv=5, scoring='roc_auc')
# grid_search_lr.fit(X_train, y_train_mort90)

# # Print the best hyperparameters
# print("Best hyperparameters for Logistic Regression:")
# print(grid_search_lr.best_params_)

# # Convert the cv_results_ to a DataFrame
# results_df = pd.DataFrame(grid_search_lr.cv_results_)

# # Display the first few rows of the results
# print("Grid Search Results:")
# print(results_df[['param_C', 'param_solver', 'mean_test_score', 'std_test_score', 'rank_test_score']])

# # Evaluate on the validation set
# y_val_pred_lr = grid_search_lr.predict_proba(X_val)[:, 1]
# auc_val_lr = roc_auc_score(y_val_mort90, y_val_pred_lr)
# print(f'Logistic Regression Validation AUC: {auc_val_lr}')

# y_test_pred_lr = grid_search_lr.predict_proba(X_test)[:, 1]
# auc_test_lr = roc_auc_score(y_test_mort90, y_test_pred_lr)
# print(f'Logistic Regression Test AUC: {auc_test_lr}')

In [6]:
datasets = {
    "Original mort90": (X_train, y_train_mort90, X_test, y_test_mort90),
    "SMOTE mort90": (X_train_mort90_smote, y_train_mort90_smote,  X_test, y_test_mort90),
    "Undersampled mort90": (X_train_mort90_undersample, y_train_mort90_undersample,  X_test, y_test_mort90),
    "Boruta mort90": (X_train_boruta, y_train_mort90,  X_test_boruta, y_test_mort90),
    "PCA mort90": (X_train_pca, y_train_mort90,  X_test_pca, y_test_mort90),
    "Original mrs90": (X_train, y_train_mrs90, X_test, y_test_mrs90),
    "SMOTE mrs90": (X_train_mrs90_smote, y_train_mrs90_smote, X_test, y_test_mrs90),
    "Undersampled mrs90": (X_train_mrs90_undersample, y_train_mrs90_undersample,X_test, y_test_mrs90),
    "Boruta mrs90": (X_train_boruta, y_train_mrs90, X_test_boruta, y_test_mrs90),
    "PCA mrs90": (X_train_pca, y_train_mrs90, X_test_pca, y_test_mrs90)
}


In [11]:
# Grid search setup
param_grid_lr = {
    'C': [0.01, 0.1, 1],  # Regularization strength
    'solver': ['liblinear', 'lbfgs']  # Optimization algorithm
}

# Initialize a DataFrame to store results
results_columns = ['Target', 'Dataset', 'Best Params', 'Test AUC']
results_df = pd.DataFrame(columns=results_columns)

# Track the best model
best_auc = 0
best_model_info = {}

for name, (X_train, y_train, X_test, y_test) in datasets.items():
    model_lr = LogisticRegression(max_iter=1000)
    grid_search_lr = GridSearchCV(model_lr, param_grid_lr, cv=5, scoring='roc_auc')
    grid_search_lr.fit(X_train, y_train)

    # Best parameters
    best_params = grid_search_lr.best_params_

    # Evaluation on the test set
    y_test_pred_lr = grid_search_lr.predict_proba(X_test)[:, 1]
    y_pred = (y_test_pred_lr>= 0.5).astype(int)
    auc_test_lr = roc_auc_score(y_test, y_test_pred_lr)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Save results
    new_row = pd.DataFrame({
        'Target': ['mrs90' if 'mrs90' in name else 'mort90'],
        'Dataset': [name],
        'Best Params': [best_params],
        'Test AUC': [auc_test_lr],
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    })
    results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Update best model if this one is better
    if auc_test_lr > best_auc:
        best_auc = auc_test_lr
        best_model_info = {
            'model': grid_search_lr.best_estimator_,
            'dataset': name,
            'target': 'mrs90' if 'mrs90' in name else 'mort90',
            'AUC': auc_test_lr,
            'Precision': precision,
            'Recall': recall,
            'F1': f1
        }

# Print all results
print(results_df)

# # Save the best model
# if best_model_info:
#     dump(best_model_info['model'], f"best_model_{best_model_info['target']}_{best_model_info['dataset']}.joblib")
#     print(f"Best model saved from dataset: {best_model_info['dataset']} for target {best_model_info['target']} with Test AUC: {best_auc}")

   Target              Dataset                         Best Params  Test AUC  \
0  mort90      Original mort90       {'C': 0.1, 'solver': 'lbfgs'}  0.889171   
1  mort90         SMOTE mort90     {'C': 1, 'solver': 'liblinear'}  0.832977   
2  mort90  Undersampled mort90         {'C': 1, 'solver': 'lbfgs'}  0.869178   
3  mort90        Boruta mort90      {'C': 0.01, 'solver': 'lbfgs'}  0.865215   
4  mort90           PCA mort90      {'C': 0.01, 'solver': 'lbfgs'}  0.886143   
5   mrs90       Original mrs90   {'C': 0.1, 'solver': 'liblinear'}  0.888946   
6   mrs90          SMOTE mrs90   {'C': 0.1, 'solver': 'liblinear'}  0.879703   
7   mrs90   Undersampled mrs90  {'C': 0.01, 'solver': 'liblinear'}  0.880895   
8   mrs90         Boruta mrs90  {'C': 0.01, 'solver': 'liblinear'}  0.884366   
9   mrs90            PCA mrs90  {'C': 0.01, 'solver': 'liblinear'}  0.871365   

   Precision    Recall        F1  
0   0.622222  0.491228  0.549020  
1   0.460526  0.614035  0.526316  
2   0.325926  

In [12]:
results_df.to_csv('lr_results.csv')

In [13]:
# Parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.3]  # Adding learning rate to the grid
}

# Initialize a DataFrame to store results
if 'results_df' not in locals():
    results_columns = ['Target', 'Dataset', 'Model', 'Best Params', 'Test AUC', 'Precision', 'Recall', 'F1']
    results_df = pd.DataFrame(columns=results_columns)

# Track the best model for XGBoost
best_auc_xgb = 0
best_model_info_xgb = {}

for name, (X_train, y_train, X_test, y_test) in datasets.items():
    grid_search_xgb = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        xgb_param_grid,
        cv=5,
        scoring='roc_auc'
    )
    grid_search_xgb.fit(X_train, y_train)

    # Best parameters
    best_params_xgb = grid_search_xgb.best_params_

    # Evaluation on the test set
    y_test_pred_xgb = grid_search_xgb.predict_proba(X_test)[:, 1]
    y_pred_xgb = (y_test_pred_xgb >= 0.5).astype(int)
    auc_test_xgb = roc_auc_score(y_test, y_test_pred_xgb)
    precision_xgb = precision_score(y_test, y_pred_xgb)
    recall_xgb = recall_score(y_test, y_pred_xgb)
    f1_xgb = f1_score(y_test, y_pred_xgb)

    # Save results
    new_row_xgb = pd.DataFrame({
        'Target': ['mrs90' if 'mrs90' in name else 'mort90'],
        'Dataset': [name],
        'Model': ['XGBoost'],
        'Best Params': [best_params_xgb],
        'Test AUC': [auc_test_xgb],
        'Precision': [precision_xgb],
        'Recall': [recall_xgb],
        'F1': [f1_xgb]
    })
    results_df = pd.concat([results_df, new_row_xgb], ignore_index=True)

    # Update best model if this one is better
    if auc_test_xgb > best_auc_xgb:
        best_auc_xgb = auc_test_xgb
        best_model_info_xgb = {
            'model': grid_search_xgb.best_estimator_,
            'dataset': name,
            'target': 'mrs90' if 'mrs90' in name else 'mort90',
            'AUC': auc_test_xgb,
            'Precision': precision_xgb,
            'Recall': recall_xgb,
            'F1': f1_xgb
        }

# Print all results
print(results_df)

# # Optionally save the best XGBoost model
# if best_model_info_xgb:
#     dump(best_model_info_xgb['model'], f"best_xgboost_model_{best_model_info_xgb['target']}_{best_model_info_xgb['dataset']}.joblib")
#     print(f"Best XGBoost model saved from dataset: {best_model_info_xgb['dataset']} for target {best_model_info_xgb['target']} with Test AUC: {best_auc_xgb}")

results_df.to_csv('xgb_results.csv')

    Target              Dataset  \
0   mort90      Original mort90   
1   mort90         SMOTE mort90   
2   mort90  Undersampled mort90   
3   mort90        Boruta mort90   
4   mort90           PCA mort90   
5    mrs90       Original mrs90   
6    mrs90          SMOTE mrs90   
7    mrs90   Undersampled mrs90   
8    mrs90         Boruta mrs90   
9    mrs90            PCA mrs90   
10  mort90      Original mort90   
11  mort90         SMOTE mort90   
12  mort90  Undersampled mort90   
13  mort90        Boruta mort90   
14  mort90           PCA mort90   
15   mrs90       Original mrs90   
16   mrs90          SMOTE mrs90   
17   mrs90   Undersampled mrs90   
18   mrs90         Boruta mrs90   
19   mrs90            PCA mrs90   

                                          Best Params  Test AUC  Precision  \
0                       {'C': 0.1, 'solver': 'lbfgs'}  0.889171   0.622222   
1                     {'C': 1, 'solver': 'liblinear'}  0.832977   0.460526   
2                         {'C'

In [14]:
# Parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10]
}

# Initialize a DataFrame to store results
if 'results_df_rf' not in locals():
    results_columns_rf = ['Target', 'Dataset', 'Model', 'Best Params', 'Test AUC', 'Precision', 'Recall', 'F1']
    results_df_rf = pd.DataFrame(columns=results_columns_rf)

# Track the best model for Random Forest
best_auc_rf = 0
best_model_info_rf = {}

for name, (X_train, y_train, X_test, y_test) in datasets.items():
    grid_search_rf = GridSearchCV(
        RandomForestClassifier(),
        rf_param_grid,
        cv=5,
        scoring='roc_auc'
    )
    grid_search_rf.fit(X_train, y_train)

    # Best parameters
    best_params_rf = grid_search_rf.best_params_

    # Evaluation on the test set
    y_test_pred_rf = grid_search_rf.predict_proba(X_test)[:, 1]
    y_pred_rf = (y_test_pred_rf >= 0.5).astype(int)
    auc_test_rf = roc_auc_score(y_test, y_test_pred_rf)
    precision_rf = precision_score(y_test, y_pred_rf)
    recall_rf = recall_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf)

    # Save results
    new_row_rf = pd.DataFrame({
        'Target': ['mrs90' if 'mrs90' in name else 'mort90'],
        'Dataset': [name],
        'Model': ['Random Forest'],
        'Best Params': [best_params_rf],
        'Test AUC': [auc_test_rf],
        'Precision': [precision_rf],
        'Recall': [recall_rf],
        'F1': [f1_rf]
    })
    results_df_rf = pd.concat([results_df_rf, new_row_rf], ignore_index=True)

    # Update best model if this one is better
    if auc_test_rf > best_auc_rf:
        best_auc_rf = auc_test_rf
        best_model_info_rf = {
            'model': grid_search_rf.best_estimator_,
            'dataset': name,
            'target': 'mrs90' if 'mrs90' in name else 'mort90',
            'AUC': auc_test_rf,
            'Precision': precision_rf,
            'Recall': recall_rf,
            'F1': f1_rf
        }

# Print all results
print(results_df_rf)

# Optionally save the best Random Forest model
# if best_model_info_rf:
#     dump(best_model_info_rf['model'], f"best_random_forest_model_{best_model_info_rf['target']}_{best_model_info_rf['dataset']}.joblib")
#     print(f"Best Random Forest model saved from dataset: {best_model_info_rf['dataset']} for target {best_model_info_rf['target']} with Test AUC: {best_auc_rf}")

results_df_rf.to_csv('rf_results.csv')


   Target              Dataset          Model  \
0  mort90      Original mort90  Random Forest   
1  mort90         SMOTE mort90  Random Forest   
2  mort90  Undersampled mort90  Random Forest   
3  mort90        Boruta mort90  Random Forest   
4  mort90           PCA mort90  Random Forest   
5   mrs90       Original mrs90  Random Forest   
6   mrs90          SMOTE mrs90  Random Forest   
7   mrs90   Undersampled mrs90  Random Forest   
8   mrs90         Boruta mrs90  Random Forest   
9   mrs90            PCA mrs90  Random Forest   

                              Best Params  Test AUC  Precision    Recall  \
0  {'max_depth': 10, 'n_estimators': 200}  0.906537   0.733333  0.385965   
1  {'max_depth': 10, 'n_estimators': 200}  0.888637   0.514286  0.631579   
2  {'max_depth': 10, 'n_estimators': 200}  0.868688   0.350000  0.859649   
3   {'max_depth': 3, 'n_estimators': 200}  0.881957   0.840000  0.368421   
4  {'max_depth': 10, 'n_estimators': 300}  0.875946   0.608696  0.245614   
5  {

In [15]:
# Parameter grid for MLP
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Initialize a DataFrame to store results
if 'results_df_mlp' not in locals():
    results_columns_mlp = ['Target', 'Dataset', 'Model', 'Best Params', 'Test AUC', 'Precision', 'Recall', 'F1']
    results_df_mlp = pd.DataFrame(columns=results_columns_mlp)

# Track the best model for MLP
best_auc_mlp = 0
best_model_info_mlp = {}

for name, (X_train, y_train, X_test, y_test) in datasets.items():
    grid_search_mlp = GridSearchCV(
        MLPClassifier(max_iter=1000),
        mlp_param_grid,
        cv=5,
        scoring='roc_auc'
    )
    grid_search_mlp.fit(X_train, y_train)

    # Best parameters
    best_params_mlp = grid_search_mlp.best_params_

    # Evaluation on the test set
    y_test_pred_mlp = grid_search_mlp.predict_proba(X_test)[:, 1]
    y_pred_mlp = (y_test_pred_mlp >= 0.5).astype(int)
    auc_test_mlp = roc_auc_score(y_test, y_test_pred_mlp)
    precision_mlp = precision_score(y_test, y_pred_mlp)
    recall_mlp = recall_score(y_test, y_pred_mlp)
    f1_mlp = f1_score(y_test, y_pred_mlp)

    # Save results
    new_row_mlp = pd.DataFrame({
        'Target': ['mrs90' if 'mrs90' in name else 'mort90'],
        'Dataset': [name],
        'Model': ['MLP'],
        'Best Params': [best_params_mlp],
        'Test AUC': [auc_test_mlp],
        'Precision': [precision_mlp],
        'Recall': [recall_mlp],
        'F1': [f1_mlp]
    })
    results_df_mlp = pd.concat([results_df_mlp, new_row_mlp], ignore_index=True)

    # Update best model if this one is better
    if auc_test_mlp > best_auc_mlp:
        best_auc_mlp = auc_test_mlp
        best_model_info_mlp = {
            'model': grid_search_mlp.best_estimator_,
            'dataset': name,
            'target': 'mrs90' if 'mrs90' in name else 'mort90',
            'AUC': auc_test_mlp,
            'Precision': precision_mlp,
            'Recall': recall_mlp,
            'F1': f1_mlp
        }

# Print all results
print(results_df_mlp)

# Optionally save the best MLP model
# if best_model_info_mlp:
#     dump(best_model_info_mlp['model'], f"best_mlp_model_{best_model_info_mlp['target']}_{best_model_info_mlp['dataset']}.joblib")
#     print(f"Best MLP model saved from dataset: {best_model_info_mlp['dataset']} for target {best_model_info_mlp['target']} with Test AUC: {best_auc_mlp}")

results_df_mlp.to_csv('mlp_results.csv')


   Target              Dataset Model  \
0  mort90      Original mort90   MLP   
1  mort90         SMOTE mort90   MLP   
2  mort90  Undersampled mort90   MLP   
3  mort90        Boruta mort90   MLP   
4  mort90           PCA mort90   MLP   
5   mrs90       Original mrs90   MLP   
6   mrs90          SMOTE mrs90   MLP   
7   mrs90   Undersampled mrs90   MLP   
8   mrs90         Boruta mrs90   MLP   
9   mrs90            PCA mrs90   MLP   

                                         Best Params  Test AUC  Precision  \
0  {'activation': 'tanh', 'alpha': 0.0001, 'hidde...  0.841794   0.500000   
1  {'activation': 'tanh', 'alpha': 0.001, 'hidden...  0.839478   0.438356   
2  {'activation': 'tanh', 'alpha': 0.001, 'hidden...  0.833200   0.298137   
3  {'activation': 'relu', 'alpha': 0.001, 'hidden...  0.858714   0.674419   
4  {'activation': 'tanh', 'alpha': 0.01, 'hidden_...  0.868287   0.600000   
5  {'activation': 'relu', 'alpha': 0.01, 'hidden_...  0.869003   0.781818   
6  {'activation': 't

In [18]:

# Grid search for MRS90
grid_search_lr_mrs90 = GridSearchCV(model_lr, param_grid_lr, cv=5, scoring='roc_auc')
grid_search_lr_mrs90.fit(X_train, y_train_mrs90)

# Print the best hyperparameters for MRS90
print("Best hyperparameters for Logistic Regression on MRS90:")
print(grid_search_lr_mrs90.best_params_)

# Convert the MRS90 cv_results_ to a DataFrame
results_df_mrs90 = pd.DataFrame(grid_search_lr_mrs90.cv_results_)
print("Grid Search Results for MRS90:")
print(results_df_mrs90[['param_C', 'param_solver', 'mean_test_score', 'std_test_score', 'rank_test_score']])

# Evaluate MRS90 on the validation set
y_val_pred_lr_mrs90 = grid_search_lr_mrs90.predict_proba(X_val)[:, 1]
auc_val_lr_mrs90 = roc_auc_score(y_val_mrs90, y_val_pred_lr_mrs90)
print(f'Logistic Regression Validation AUC for MRS90: {auc_val_lr_mrs90}')

# Evaluate MRS90 on the test set
y_test_pred_lr_mrs90 = grid_search_lr_mrs90.predict_proba(X_test)[:, 1]
auc_test_lr_mrs90 = roc_auc_score(y_test_mrs90, y_test_pred_lr_mrs90)
print(f'Logistic Regression Test AUC for MRS90: {auc_test_lr_mrs90}')

Best hyperparameters for Logistic Regression on MRS90:
{'C': 0.1, 'solver': 'liblinear'}
Grid Search Results for MRS90:
  param_C param_solver  mean_test_score  std_test_score  rank_test_score
0    0.01    liblinear         0.859068        0.015550                3
1    0.01        lbfgs         0.857581        0.016641                4
2     0.1    liblinear         0.860442        0.015051                1
3     0.1        lbfgs         0.860047        0.015224                2
4       1    liblinear         0.856395        0.019148                6
5       1        lbfgs         0.857054        0.018999                5
Logistic Regression Validation AUC for MRS90: 0.9177838113015442
Logistic Regression Test AUC for MRS90: 0.8642628205128206


In [24]:
# Parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
}

# Parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
}


In [25]:
# XGBoost Grid Search - MORT90
grid_search_xgb_mort90 = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    xgb_param_grid,
    cv=5,
    scoring='roc_auc'
)

grid_search_xgb_mort90.fit(X_train, y_train_mort90)
print("Best hyperparameters for XGBoost on MORT90:", grid_search_xgb_mort90.best_params_)

y_val_pred_xgb_mort90 = grid_search_xgb_mort90.predict_proba(X_val)[:, 1]
auc_val_xgb_mort90 = roc_auc_score(y_val_mort90, y_val_pred_xgb_mort90)
print(f'XGBoost Validation AUC for MORT90: {auc_val_xgb_mort90}')

y_test_pred_xgb_mort90 = grid_search_xgb_mort90.predict_proba(X_test)[:, 1]
auc_test_xgb_mort90 = roc_auc_score(y_test_mort90, y_test_pred_xgb_mort90)
print(f'XGBoost Test AUC for MORT90: {auc_test_xgb_mort90}')


Best hyperparameters for XGBoost on MORT90: {'max_depth': 3, 'n_estimators': 100}
XGBoost Validation AUC for MORT90: 0.8858198451794511
XGBoost Test AUC for MORT90: 0.8930375180375181


In [26]:
# XGBoost Grid Search - MRS90
grid_search_xgb_mrs90 = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    xgb_param_grid,
    cv=5,
    scoring='roc_auc'
)

grid_search_xgb_mrs90.fit(X_train, y_train_mrs90)
print("Best hyperparameters for XGBoost on MRS90:", grid_search_xgb_mrs90.best_params_)

y_val_pred_xgb_mrs90 = grid_search_xgb_mrs90.predict_proba(X_val)[:, 1]
auc_val_xgb_mrs90 = roc_auc_score(y_val_mrs90, y_val_pred_xgb_mrs90)
print(f'XGBoost Validation AUC for MRS90: {auc_val_xgb_mrs90}')

y_test_pred_xgb_mrs90 = grid_search_xgb_mrs90.predict_proba(X_test)[:, 1]
auc_test_xgb_mrs90 = roc_auc_score(y_test_mrs90, y_test_pred_xgb_mrs90)
print(f'XGBoost Test AUC for MRS90: {auc_test_xgb_mrs90}')

Best hyperparameters for XGBoost on MRS90: {'max_depth': 10, 'n_estimators': 100}
XGBoost Validation AUC for MRS90: 0.9046326149669099
XGBoost Test AUC for MRS90: 0.8352564102564103


In [27]:
# Random Forest Grid Search - MORT90
grid_search_rf_mort90 = GridSearchCV(
    RandomForestClassifier(),
    rf_param_grid,
    cv=5,
    scoring='roc_auc'
)
grid_search_rf_mort90.fit(X_train, y_train_mort90)
print("Best hyperparameters for Random Forest on MORT90:", grid_search_rf_mort90.best_params_)
y_val_pred_rf_mort90 = grid_search_rf_mort90.predict_proba(X_val)[:, 1]
auc_val_rf_mort90 = roc_auc_score(y_val_mort90, y_val_pred_rf_mort90)
print(f'Random Forest Validation AUC for MORT90: {auc_val_rf_mort90}')
y_test_pred_rf_mort90 = grid_search_rf_mort90.predict_proba(X_test)[:, 1]
auc_test_rf_mort90 = roc_auc_score(y_test_mort90, y_test_pred_rf_mort90)
print(f'Random Forest Test AUC for MORT90: {auc_test_rf_mort90}')


Best hyperparameters for Random Forest on MORT90: {'max_depth': 10, 'n_estimators': 200}
Random Forest Validation AUC for MORT90: 0.9210063335679098
Random Forest Test AUC for MORT90: 0.8959235209235209


In [28]:
# Random Forest Grid Search - MRS90
grid_search_rf_mrs90 = GridSearchCV(
    RandomForestClassifier(),
    rf_param_grid,
    cv=5,
    scoring='roc_auc'
)
grid_search_rf_mrs90.fit(X_train, y_train_mrs90)
print("Best hyperparameters for Random Forest on MRS90:", grid_search_rf_mrs90.best_params_)
y_val_pred_rf_mrs90 = grid_search_rf_mrs90.predict_proba(X_val)[:, 1]
auc_val_rf_mrs90 = roc_auc_score(y_val_mrs90, y_val_pred_rf_mrs90)
print(f'Random Forest Validation AUC for MRS90: {auc_val_rf_mrs90}')
y_test_pred_rf_mrs90 = grid_search_rf_mrs90.predict_proba(X_test)[:, 1]
auc_test_rf_mrs90 = roc_auc_score(y_test_mrs90, y_test_pred_rf_mrs90)
print(f'Random Forest Test AUC for MRS90: {auc_test_rf_mrs90}')

Best hyperparameters for Random Forest on MRS90: {'max_depth': 10, 'n_estimators': 300}
Random Forest Validation AUC for MRS90: 0.9176989648735788
Random Forest Test AUC for MRS90: 0.8519230769230769


In [29]:
# Define the MLP classifier
mlp = MLPClassifier(max_iter=1000)

# Parameter grid for MLP
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Grid search for MLP - MORT90
grid_search_mlp_mort90 = GridSearchCV(mlp, mlp_param_grid, cv=5, scoring='roc_auc')
grid_search_mlp_mort90.fit(X_train, y_train_mort90)
print("Best hyperparameters for MLP on MORT90:", grid_search_mlp_mort90.best_params_)
y_val_pred_mlp_mort90 = grid_search_mlp_mort90.predict_proba(X_val)[:, 1]
auc_val_mlp_mort90 = roc_auc_score(y_val_mort90, y_val_pred_mlp_mort90)
print(f'MLP Validation AUC for MORT90: {auc_val_mlp_mort90}')
y_test_pred_mlp_mort90 = grid_search_mlp_mort90.predict_proba(X_test)[:, 1]
auc_test_mlp_mort90 = roc_auc_score(y_test_mort90, y_test_pred_mlp_mort90)
print(f'MLP Test AUC for MORT90: {auc_test_mlp_mort90}')

# Grid search for MLP - MRS90
grid_search_mlp_mrs90 = GridSearchCV(mlp, mlp_param_grid, cv=5, scoring='roc_auc')
grid_search_mlp_mrs90.fit(X_train, y_train_mrs90)
print("Best hyperparameters for MLP on MRS90:", grid_search_mlp_mrs90.best_params_)
y_val_pred_mlp_mrs90 = grid_search_mlp_mrs90.predict_proba(X_val)[:, 1]
auc_val_mlp_mrs90 = roc_auc_score(y_val_mrs90, y_val_pred_mlp_mrs90)
print(f'MLP Validation AUC for MRS90: {auc_val_mlp_mrs90}')
y_test_pred_mlp_mrs90 = grid_search_mlp_mrs90.predict_proba(X_test)[:, 1]
auc_test_mlp_mrs90 = roc_auc_score(y_test_mrs90, y_test_pred_mlp_mrs90)
print(f'MLP Test AUC for MRS90: {auc_test_mlp_mrs90}')

Best hyperparameters for MLP on MORT90: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'adam'}
MLP Validation AUC for MORT90: 0.8830049261083744
MLP Test AUC for MORT90: 0.8336940836940838
Best hyperparameters for MLP on MRS90: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive', 'solver': 'sgd'}
MLP Validation AUC for MRS90: 0.8910571864924487
MLP Test AUC for MRS90: 0.8479967948717948


In [30]:
best_lr = grid_search_lr.best_estimator_
best_xgb = grid_search_xgb_mort90.best_estimator_
best_rf = grid_search_rf_mort90.best_estimator_
best_mlp = grid_search_mlp_mort90.best_estimator_


# Predict probabilities for the test set
lr_probs = best_lr.predict_proba(X_test)[:, 1]
xgb_probs = best_xgb.predict_proba(X_test)[:, 1]
rf_probs = best_rf.predict_proba(X_test)[:, 1]
mlp_probs = best_mlp.predict_proba(X_test)[:, 1]

# Average the probabilities
ensemble_probs = (lr_probs + xgb_probs + rf_probs + mlp_probs) / 4

# Evaluate the ensemble
ensemble_auc = roc_auc_score(y_test_mort90, ensemble_probs)
print(f'Ensemble Test AUC: {ensemble_auc}')



Ensemble Test AUC: 0.8834776334776334


### stacking classifier
Uses a final estimator to blend the outputs of the base estimators. Here’s an example using a logistic regression as the final estimator:

In [31]:
from sklearn.ensemble import StackingClassifier

# Define the stacking ensemble
stack = StackingClassifier(
    estimators=[
        ('lr', best_lr),
        ('xgb', best_xgb),
        ('rf', best_rf),
        ('mlp', best_mlp)
    ],
    final_estimator=LogisticRegression()
)

# Fit the stack model
stack.fit(X_train, y_train_mort90)

# Evaluate the stack model on the test set
stack_probs = stack.predict_proba(X_test)[:, 1]
stack_auc = roc_auc_score(y_test_mort90, stack_probs)
print(f'Stacking Ensemble Test AUC: {stack_auc}')


Stacking Ensemble Test AUC: 0.8865440115440116


In [33]:
# Save Logistic Regression Model
dump(grid_search_lr.best_estimator_, 'best_lr_mort90.joblib')

# Save XGBoost Model
dump(grid_search_xgb_mort90.best_estimator_, 'best_xgb_mort90.joblib')

# Save Random Forest Model
dump(grid_search_rf_mort90.best_estimator_, 'best_rf_mort90.joblib')

# Save MLP Model
dump(grid_search_mlp_mort90.best_estimator_, 'best_mlp_mort90.joblib')

# If using a stacking classifier
dump(stack, 'stacking_model_mort90.joblib')


['stacking_model_mort90.joblib']

In [34]:
# Load Logistic Regression Model
best_lr = load('best_lr_mort90.joblib')

# Load XGBoost Model
best_xgb = load('best_xgb_mort90.joblib')

# Load Random Forest Model
best_rf = load('best_rf_mort90.joblib')

# Load MLP Model
best_mlp = load('best_mlp_mort90.joblib')

# Load Stacking Classifier
stack = load('stacking_model_mort90.joblib')


In [37]:
# Example of using a loaded model to predict
predictions = best_lr.predict(X_test)  # Replace X_new with your new data


In [39]:
best_lr_mrs90 = grid_search_lr_mrs90.best_estimator_
best_xgb_mrs90 = grid_search_xgb_mrs90.best_estimator_
best_rf_mrs90 = grid_search_rf_mrs90.best_estimator_
best_mlp_mrs90 = grid_search_mlp_mrs90.best_estimator_


In [40]:
# Predict probabilities for the validation set
lr_probs_val = best_lr_mrs90.predict_proba(X_val)[:, 1]
xgb_probs_val = best_xgb_mrs90.predict_proba(X_val)[:, 1]
rf_probs_val = best_rf_mrs90.predict_proba(X_val)[:, 1]
mlp_probs_val = best_mlp_mrs90.predict_proba(X_val)[:, 1]

# Average the probabilities
ensemble_probs_val = (lr_probs_val + xgb_probs_val + rf_probs_val + mlp_probs_val) / 4

# Evaluate the ensemble on the validation set
ensemble_auc_val = roc_auc_score(y_val_mrs90, ensemble_probs_val)
print(f'Ensemble Validation AUC for MRS90: {ensemble_auc_val}')

# Predict probabilities for the test set
lr_probs_test = best_lr_mrs90.predict_proba(X_test)[:, 1]
xgb_probs_test = best_xgb_mrs90.predict_proba(X_test)[:, 1]
rf_probs_test = best_rf_mrs90.predict_proba(X_test)[:, 1]
mlp_probs_test = best_mlp_mrs90.predict_proba(X_test)[:, 1]

# Average the probabilities
ensemble_probs_test = (lr_probs_test + xgb_probs_test + rf_probs_test + mlp_probs_test) / 4

# Evaluate the ensemble on the test set
ensemble_auc_test = roc_auc_score(y_test_mrs90, ensemble_probs_test)
print(f'Ensemble Test AUC for MRS90: {ensemble_auc_test}')


Ensemble Validation AUC for MRS90: 0.920159511284575
Ensemble Test AUC for MRS90: 0.8610576923076924


In [41]:

# Define the stacking ensemble
stack_mrs90 = StackingClassifier(
    estimators=[
        ('lr', best_lr_mrs90),
        ('xgb', best_xgb_mrs90),
        ('rf', best_rf_mrs90),
        ('mlp', best_mlp_mrs90)
    ],
    final_estimator=LogisticRegression()
)

# Fit the stack model on the training data
stack_mrs90.fit(X_train, y_train_mrs90)

# Evaluate the stack model on the validation set
stack_probs_val = stack_mrs90.predict_proba(X_val)[:, 1]
stack_auc_val = roc_auc_score(y_val_mrs90, stack_probs_val)
print(f'Stacking Ensemble Validation AUC for MRS90: {stack_auc_val}')

# Evaluate the stack model on the test set
stack_probs_test = stack_mrs90.predict_proba(X_test)[:, 1]
stack_auc_test = roc_auc_score(y_test_mrs90, stack_probs_test)
print(f'Stacking Ensemble Test AUC for MRS90: {stack_auc_test}')


Stacking Ensemble Validation AUC for MRS90: 0.9221109791277788
Stacking Ensemble Test AUC for MRS90: 0.8592147435897436


In [42]:
# Save Logistic Regression Model for MRS90
dump(best_lr_mrs90, 'best_lr_mrs90.joblib')

# Save XGBoost Model for MRS90
dump(best_xgb_mrs90, 'best_xgb_mrs90.joblib')

# Save Random Forest Model for MRS90
dump(best_rf_mrs90, 'best_rf_mrs90.joblib')

# Save MLP Model for MRS90
dump(best_mlp_mrs90, 'best_mlp_mrs90.joblib')

# Save the stacking ensemble for MRS90, if you've created one
dump(stack_mrs90, 'stacking_model_mrs90.joblib')

['stacking_model_mrs90.joblib']

In [43]:
# Load Logistic Regression Model for MRS90
best_lr_mrs90 = load('best_lr_mrs90.joblib')

# Load XGBoost Model for MRS90
best_xgb_mrs90 = load('best_xgb_mrs90.joblib')

# Load Random Forest Model for MRS90
best_rf_mrs90 = load('best_rf_mrs90.joblib')

# Load MLP Model for MRS90
best_mlp_mrs90 = load('best_mlp_mrs90.joblib')

# Load Stacking Model for MRS90
stack_mrs90 = load('stacking_model_mrs90.joblib')


In [48]:
# Example: Use the loaded Random Forest model to predict new data
new_data_predictions = best_rf_mrs90.predict(X_test)  # Replace 'new_data' with actual data
new_data_predictions2 = stack_mrs90.predict(X_test)  
