#### AMI with the new dataset
The pipeline is run on all the columns of the dataset; only the 10 features selected with the SelectKBest method are kept, so that the PCA is performed on a smaller number of features.

##### First study : supervised prediction
First, using the existing pipeline created during the pancreatic cancer study, we try supervised learning to predict the relapse of AMI.

In [None]:
## Importing the libraries and the previously written functions that we will use later on
%run "2 - General\List functions.ipynb"   ## List of general functions that we will need such as visualization, training...

In [None]:
## Loading the raw myocardial dataset from the company OneDrive
raw_data_path = r'C:/Users/33753/OneDrive - Numa Health/Documents - 8 - Medical & Data Science/6 - Théo/01 - Data/6 - Myocarde/df_myocarde_230420_reduced.xlsx'
raw_data = pd.read_excel(raw_data_path)
# The 'Unnamed: 0' column of the sheet becomes the row index
raw_data = raw_data.set_index('Unnamed: 0')

# Keeping every column except the last 8 (the outcome columns), then adding back
# the single outcome studied here
n_kept_columns = raw_data.shape[1] - 8
kept_columns = raw_data.columns[:n_kept_columns]
data = raw_data.loc[:, kept_columns]
outcome = 'MACE 2 yr (death, stroke, MI, hositalizacija(stento restenoz, new stenoz, SN))'
data[outcome] = raw_data[outcome]

In [None]:
## Mapping the spreadsheet headers to the short biomarker names used in the rest of the study
bio = {
    'RBC (I) e6': 'RBC',
    'WBC e3 (I)skyriuje': 'WBC',
    'PMN (%) I': 'PMN',
    'LYMPH (%)I': 'LYMPH',
    'Mono (%) I': 'MONO',
    'EOS (%) I': 'EOS',
    'BASO (%) I': 'BASO',
    'PLT.(I)': 'PLT',
    'Hb mg/dL (I) ': 'HG',
    'HCT (I)': 'HCT',
    'MCH (I)': 'MHC',
    'MCHC (I)': 'MCHC',
    'K (I)': 'K',
    'Ca (I)': 'CA',
}
data = data.rename(columns=bio)

# Removing the endobiogenic indexes shipped with the file that are not used as-is
# ('Mono/Lym' is recomputed further down from the renamed columns)
unused_columns = ['Unnamed: 0.1', 'Cortisol INDEX', 'Mono/Lym', 'G/T', 'Adaptation']
data = data.drop(columns=unused_columns)

In [None]:
## Checking that each biomarker column is in the unit expected by the endobiological index calculation
## (columns that are not are converted)
# NOTE(review): unit_check is not defined in this section (presumably loaded by the %run cell above);
# the frame it returns replaces `data`.
data = unit_check(data)

In [None]:
## Computing the endobiological indexes in their raw (numerical) form
# Each keyword tells the helper which column of `data` holds the corresponding biomarker.
# NOTE(review): the return value is not captured, so the indexes are presumably written
# into `data` in place - confirm in calculation_raw_index.
biomarker_columns = {
    'Basophile': 'BASO',
    'Eosinophile': 'EOS',
    'Hemoglobine': 'HG',
    'Leucocyte': 'WBC',
    'Lymphocyte': 'LYMPH',
    'Monocyte': 'MONO',
    'Neutrophile': 'PMN',
    'Platelet': 'PLT',
    'TCMHemog': 'MHC',
    'calcium': 'CA',
    'potassium': 'K',
    'hematies': 'RBC',
}
calculation_raw_index(data, list_index=[], **biomarker_columns)

In [None]:
## Columns excluded from the study, as chosen by Kamyar (Numa's endobiogeny specialist and physician) on 27/04/2023
## The names are the spreadsheet headers and must be kept exactly as spelled there
cols_to_remove = [
    'STEMI/ non STEMI.1',
    'BMI (kg/m2)',
    'Dyslipidemia',
    'Hypertesion',
    'Smoking',
    'DM',
    'CABG',
    'History of IHD',
    'History of heart failue',
    'previous PCI',
    'Previous MI',
    'Risk AGG',
    'EF post PCI CAT; <=45 = 1',
    'Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD',
    'Number of diseased vessel',
    'percentage of culpret vessel; >=50 significant if other physiologic variables are present',
    'Coronary dominancy (the artery that feeds the posterior descending artery): 40% RCA, 60% left Circumflex)',
    'Cardiac rythm during incharge',
    'in-hospital II - III* AVB',
    'SV during hosp. ',
    'Time until death (1yr)/ Laikas iki mirties (1m)',
    'Died in 1 yr / Mire per 1m.',
    'Time until death (2yr)/ Laikas iki mirties (2m)',
    'Na (I)',
    'eGFR 1 (GFG)',
    'Cr 1 (mmol/l)',
    'Cortisol INDEX',
    'Death in hospital period; 0=No, 1=Yes',
    '1yr MACE( death. MI. stroke, hospitalisation)',
    'MACE 03.28',
]


In [None]:
## Derived columns suggested by Kamyar
# Monocyte / lymphocyte ratio, recomputed from the renamed biomarker columns
mono_lym_ratio = data['MONO'] / data['LYMPH']
data['Mono/Lym'] = mono_lym_ratio
# The genito-thyroid index weighted by that ratio
data['Thyroid relaunching corrected'] = mono_lym_ratio * data['genit_thyro']

In [None]:
## The three feature families compared in the study
# Raw biomarkers
feature_biomarkers_4 = [
    'WBC', 'HG', 'RBC', 'MHC', 'PMN', 'EOS', 'BASO', 'LYMPH', 'MONO', 'PLT',
    'eGFR 1 CAT 0 = >90', '?? Troponin [Trop MAX]', 'All SERUM Cortisol Kortizolis-Serum', 'CRP',
]
# Endobiological indexes
feature_index_4 = [
    'IML', 'IMP', 'cortisol_func', 'cortisol_struc', 'consum_aggr_index', 'pro_inflam',
    'adapt_ratio', 'Mono/Lym', 'genit_thyro', 'Thyroid relaunching corrected',
]
# Every remaining column of `data`, in column order (this still contains the outcome column)
already_assigned = feature_biomarkers_4 + feature_index_4
feature_other_4 = [column for column in data.columns if column not in already_assigned]

In [None]:
## Classification pipeline on the MACE 2yr outcome, run once per feature family
feature_sets = {
    'feature_biomarkers': feature_biomarkers_4,
    'feature_index': feature_index_4,
    'feature_other': feature_other_4,
}

# Timestamp of this run, used to name the output folder
today = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')

culprit_vessel_col = 'Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD'
results_root = r'C:/Users/33753/OneDrive - Numa Health/Documents - 8 - Medical & Data Science/6 - Théo/03 - Research/5 - Pathologies/6-Myocarde/'

for name, feature_set in feature_sets.items():
    # Only the rows whose outcome is known are kept
    df = data.loc[data[outcome].notna()].copy()
    # Dropping the excluded columns that are actually present
    df.drop(columns=[col for col in cols_to_remove if col in df.columns], inplace=True)
    if culprit_vessel_col in df.columns:
        df.drop(columns=[culprit_vessel_col], inplace=True)
    # The outcome must not be used as a predictor (removed from the list in place)
    if outcome in feature_set:
        feature_set.remove(outcome)

    # Folder where the results, visualizations etc. of this feature family are stored
    path = results_root + today + '/' + 'MACE 2yr' + '/' + name + '/'

    pipeline_classification(
        df, feature_set, col_to_pred=outcome, cutoff=0, thresh=None, save=True, outdir=path, show=False,
        scoring='accuracy', loss_function=None, dict_models=None, dict_param_models=None, normalization_method=None,
        imputation_method='iterative', feature_selection_method='Voting', n_features=4, calcul='No',
    )

In [None]:
## Re-running the classification pipeline on the same outcome, with the features retained from the previous results
# Each name must be followed by a comma: two adjacent string literals are silently
# concatenated by Python into a single (non-existent) column name.
final_features = ['PMN','EOS','LYMPH','immed_adapt_score','cortisol_struc','Thyroid relaunching corrected','genit_thyro','IML','IMP','adapt_ratio',
                '6 month Stroke','Killip klasė (CHF)','heart rate and conduction disarrangement during hospitalization']

# Only the rows whose outcome is known are kept
df = data.loc[~data[outcome].isna()].copy()
if 'Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD' in df.columns:
    df.drop(columns = ['Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD'], inplace=True)

# Creating the path where all the results, visualization etc will be stored
path = r'C:/Users/33753/OneDrive - Numa Health/Documents - 8 - Medical & Data Science/6 - Théo/03 - Research/5 - Pathologies/6-Myocarde/'
path += today+'/'+'MACE 2yr'+'/'+'final_features'+'/'

pipeline_classification(df, final_features, col_to_pred=outcome, cutoff=0, thresh=None, save=True, outdir=path, show=False,
                            scoring='accuracy', loss_function=None, dict_models=None, dict_param_models=None, normalization_method=None,
                            imputation_method='iterative', feature_selection_method='Voting', n_features=6, calcul='No')

##### Second study : AMI with double model and PCA
As the first study was not conclusive, we tried a new method based on unsupervised learning.

In [None]:
import warnings
# Silences every warning for the rest of the session (all categories, all modules)
warnings.filterwarnings("ignore")

In [None]:
## Importing the libraries and the previously written functions that we will use later on
%run "C:\Users\33753\Documents\Machine-Learning\1-Functions\List_Functions.ipynb" ## List of general functions that we will need such as visualization, training...

In [None]:
# Output folder for the plots, datasets etc. of the PCA study
# (`today` is the run timestamp computed in the first study)
pca_results_root = r'C:/Users/33753/OneDrive - Numa Health/Documents - 8 - Medical & Data Science/6 - Théo/03 - Research/05 - Pathologies/06-Myocarde/'
path = pca_results_root + f"{today}/Test PCA/"

In [None]:
# Reloading the Myocarde dataset from scratch for the second study
myocarde_path = r'C:/Users/33753/OneDrive - Numa Health/Documents - 8 - Medical & Data Science/6 - Théo/01 - Data/6 - Myocarde/df_myocarde_230420_reduced.xlsx'
data = pd.read_excel(myocarde_path)
# The 'Unnamed: 0' column of the sheet becomes the row index
data = data.set_index('Unnamed: 0')

In [None]:
# Names of the candidate features (spreadsheet headers, spelled exactly as in the file) and of the outcome
AMI_features = [
    'RBC (I) e6',
    'WBC e3 (I)skyriuje',
    'PMN (%) I',
    'LYMPH (%)I',
    'Mono (%) I',
    'EOS (%) I',
    'Cortisol INDEX',
    'Mono/Lym',
    'G/T',
    'Adaptation',
    'BASO (%) I',
    'PLT.(I)',
    'Hb mg/dL (I) ',
    'MCH (I)',
    'Na (I)',
    '?? Troponin [Trop MAX]',
    'All SERUM Cortisol Kortizolis-Serum',
    'FT3 (I)',
    'First TnI',
    'Total Cholesterol',
    'eGFR 1 (GFG)',
    'eGFR 1 CAT 0 = >90',
    'Cr 1 (mmol/l)',
    'CRB',
    'Hospitalisation Period (days)',
    'Age',
    'Age CAT <65 = 0;',
    'Sex: 0=MALE, 1=FEMALE',
    'STEMI/ non STEMI',
    'STEMI/ non STEMI.1',
    'Killip klasė (CHF)',
    'BMI (kg/m2)',
    'BMI CAT >30=1',
    'Dyslipidemia',
    'Hypertesion',
    'Smoking',
    'DM',
    'Familial IHD',
    'CABG',
    'History of IHD',
    'History of heart failue',
    'previous PCI',
    'Previous MI',
    'Risk AGG',
    'EF after 24 hr from PCI >=50 NL, <=45 is increased risk of CHF, Mortality',
    'EF post PCI CAT; <=45 = 1',
    'Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD',
    'Number of diseased vessel',
    'percentage of culpret vessel; >=50 significant if other physiologic variables are present',
    'Coronary dominancy (the artery that feeds the posterior descending artery): 40% RCA, 60% left Circumflex)',
    'heart rate and conduction disarrangement during hospitalization',
    'Cardiac rythm during incharge',
    'in hospital AFib',
    'SV during hosp. ',
    'CRP',
    '6 month Stroke',
]
outcome = 'MACE 2 yr (death, stroke, MI, hositalizacija(stento restenoz, new stenoz, SN))'

In [None]:
# Mapping the spreadsheet headers to the short, standard biomarker names
# NOTE(review): hemoglobin is renamed to 'HB' here whereas the first study uses 'HG' - confirm
# which name the downstream helpers expect.
bio = {
    'RBC (I) e6': 'RBC',
    'WBC e3 (I)skyriuje': 'WBC',
    'PMN (%) I': 'PMN',
    'LYMPH (%)I': 'LYMPH',
    'Mono (%) I': 'MONO',
    'EOS (%) I': 'EOS',
    'BASO (%) I': 'BASO',
    'PLT.(I)': 'PLT',
    'Hb mg/dL (I) ': 'HB',
    'HCT (I)': 'HCT',
    'MCH (I)': 'MHC',
    'MCHC (I)': 'MCHC',
    'K (I)': 'K',
    'Ca (I)': 'CA',
}
data = data.rename(columns=bio)

In [None]:
# Features excluded following the insight of Kamyar Hedayat
# (spreadsheet headers, spelled exactly as in the file)
toBeRemovedKamyar = [
    'STEMI/ non STEMI.1',
    'BMI (kg/m2)',
    'Dyslipidemia',
    'Hypertesion',
    'Smoking',
    'DM',
    'CABG',
    'History of IHD',
    'History of heart failue',
    'previous PCI',
    'Previous MI',
    'Risk AGG',
    'EF post PCI CAT; <=45 = 1',
    'Culprit vessel: 0=RCA, 1=Rt Circumflex, 2=LAD',
    'Number of diseased vessel',
    'percentage of culpret vessel; >=50 significant if other physiologic variables are present',
    'Coronary dominancy (the artery that feeds the posterior descending artery): 40% RCA, 60% left Circumflex)',
    'Cardiac rythm during incharge',
    'SV during hosp. ',
    'Time until death (1yr)/ Laikas iki mirties (1m)',
    'Died in 1 yr / Mire per 1m.',
    'Time until death (2yr)/ Laikas iki mirties (2m)',
    'Na (I)',
    'eGFR 1 (GFG)',
    'Cr 1 (mmol/l)',
    'Cortisol INDEX',
    'Death in hospital period; 0=No, 1=Yes',
    '1yr MACE( death. MI. stroke, hospitalisation)',
    'MACE 03.28',
]
data = data.drop(columns=toBeRemovedKamyar)

# Columns removed to avoid duplicates
duplicate_columns = ['Unnamed: 0.1', 'Mono/Lym', 'G/T', 'Adaptation']
data = data.drop(columns=duplicate_columns)

In [None]:
## Preprocessing: iterative imputation, no normalization and no feature selection
# NOTE(review): AMI_features still holds the original spreadsheet headers, some of which were
# renamed or dropped from `data` above - confirm how engineering handles names that are missing.
df, new_feature_set = engineering(
    data,
    AMI_features,
    col_to_pred=outcome,
    cutoff=0,
    normalization_method=None,
    imputation_method='iterative',
    feature_selection_method=None,
    n_features=5,
    calcul='Yes',
)

In [None]:
# X: the preprocessed dataset without the outcome column
X = df.drop(columns=[outcome])
# The row labels are saved because the PCA output below is a plain array
X_index = X.index

# PCA
# No n_components is given, so the number of components is left to the PCA default;
# the number of axes actually kept is chosen further down.
pca = PCA()
pca.fit(X)
X = pca.transform(X)

In [None]:
# Printing the main attributes of the fitted PCA, each followed by a blank line
pca_report = [
    ("pca.explained_variance_ : \n", pca.explained_variance_),
    ("pca.explained_variance_ratio_ :\n", pca.explained_variance_ratio_),
    ("pca.explained_variance_ratio_.cumsum() :\n", pca.explained_variance_ratio_.cumsum()),
    ("pca.noise_variance_ :\n", pca.noise_variance_),
    ("pca.singular_values_ :\n", pca.singular_values_),
]
for label, values in pca_report:
    print(label, values, '\n')

In [None]:
# Horizontal bar plot of the cumulated explained variance ratio, one bar per component
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
component_rank = np.arange(1, len(list(cumulative_ratio)) + 1)
plt.barh(y=component_rank, width=list(cumulative_ratio))
# Red guide lines: horizontal at y = 1, vertical at the 90% threshold
plt.axhline(1, color='red')
plt.axvline(0.90, color='red')

In [None]:
# Number of factorial axes to keep: one more than the number of components whose
# cumulated explained variance ratio is still below 90%, with a floor of 5 axes
cumulative_ratio = list(pca.explained_variance_ratio_.cumsum())
nbFactAxe = sum(1 for ratio in cumulative_ratio if ratio < 0.90) + 1
nbFactAxe = max(nbFactAxe, 5)

In [None]:
# Getting the most important features of each retained PCA axis and saving them
# The PCA was fitted on df without the outcome column, so a loading at position k refers to
# the k-th column of that outcome-free frame. Names are therefore looked up in the same
# column set; df.columns is only equivalent when the outcome is the last column.
feature_names = df.columns.drop(outcome)
mostImportantFeatures = pd.DataFrame()
for axis_rank, loadings in enumerate(pca.components_[:nbFactAxe], start=1):
    # Stable sort on the negated magnitudes: largest |loading| first, ties kept in column order
    strongest = np.argsort(-np.abs(loadings), kind='stable')[:nbFactAxe]
    # One column per axis (numbered from 1), holding the names of its strongest features
    mostImportantFeatures[axis_rank] = list(feature_names[strongest])
save_dataframe(outdir = path, dataframe=mostImportantFeatures, file_type='xlsx', name = 'Most important features')

In [None]:
# Counting how many times each feature appears among the most important ones, over all axes
all_feature_names = mostImportantFeatures.to_numpy().ravel().tolist()
occurrences = pd.Series(all_feature_names).value_counts()
mostReccurentFeatures = pd.DataFrame(occurrences)
save_dataframe(outdir=path, dataframe=mostReccurentFeatures, file_type='xlsx', name='Occurences')

The first new variable (the first principal component) explains more than 90% of the variance of the data, which means that it carries more than 90% of the information.

In [None]:
# Keeping only the first nbFactAxe principal components
fact_ax = list(np.arange(0, nbFactAxe))
# The PCA output is a plain array: the original row labels are restored as an index named 'index'
dfReduced = (
    pd.DataFrame(X)
    .loc[:, fact_ax]
    .assign(index=X_index)
    .set_index('index')
)
# The outcome is added back as the last column (aligned on the row labels)
dfReduced[outcome] = df[outcome]

In [None]:
## Visualization of the data after preprocessing and PCA reduction (retained axes against the outcome)
## The output is saved under the 'After_Preprocessing/' sub-folder of `path` and also shown
vizualisation(dfReduced, fact_ax, outcome, save=True, outdir=path+'After_Preprocessing/', show=True)

In [None]:
# Training on the whole reduced dataset
# Predictors: every column except the last one (the outcome); target: the outcome column
predictor_columns = list(dfReduced.columns[:-1])
X = dfReduced[predictor_columns]
y = dfReduced[outcome]

# Decision Tree
dt_model = DecisionTreeClassifier().fit(X, y)

# Random Forest
rf_model = RandomForestClassifier().fit(X, y)

# XGBoost
xgb_model = XGBClassifier(eval_metric='mlogloss').fit(X, y)

In [None]:
# Per-model container for the cross-validation results (one empty dict per model, filled fold by fold)
dict_result = {model_name: {} for model_name in ('DecisionTree', 'RandomForest', 'XGBoost')}

In [None]:
# Unfitted models that will be trained on the survival population inside the cross-validation loop
dt_model_survival, rf_model_survival, xgb_model_survival = (
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(eval_metric='mlogloss'),
)

In [None]:
# Stratified 10-fold cross-validation of the three classifiers on the PCA-reduced data
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold residual errors (MSE between true and predicted labels), one list per model
residual_errors_dt = []
residual_errors_rf = []
residual_errors_xgb = []

# Row labels of the wrong predictions, accumulated over all the folds
dict_error_analysis = {'DecisionTree':[],
               'RandomForest':[],
               'XGBoost':[]}

# `fold` is the 1-based fold number: it names the per-fold output folder and the result keys
for fold, (train_index, test_index) in enumerate(skfold.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Every output of this fold goes to the same folder
    fold_dir = path+'iter_'+str(fold)+'/'

    # Showing the distribution of the outcome in the train and test parts of this fold
    # NOTE(review): both pies are written to the same folder - check that the second call
    # does not overwrite the file produced by the first.
    display_distribution_pie(df.iloc[train_index], outcome, save = True,
                             outdir=fold_dir, show=False)
    display_distribution_pie(df.iloc[test_index], outcome, save = True,
                             outdir=fold_dir, show=False)

    # Fresh models for this fold
    dt_model_CV = DecisionTreeClassifier()
    rf_model_CV = RandomForestClassifier()
    xgb_model_CV = XGBClassifier(eval_metric='mlogloss')

    dt_model_CV = dt_model_CV.fit(X_train, y_train)
    rf_model_CV = rf_model_CV.fit(X_train, y_train)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        xgb_model_CV = xgb_model_CV.fit(X_train, y_train)

    # Predictions on the test part of the fold
    y_pred_dt = dt_model_CV.predict(X_test)
    y_pred_rf = rf_model_CV.predict(X_test)
    y_pred_xgb = xgb_model_CV.predict(X_test)


    # Second model, trained on the survival population to look for a hidden pattern:
    # the test rows are gathered with their true outcome and the three predictions...
    X_train_survival = X_test.copy()
    X_train_survival['Outcome'] = y_test

    X_train_survival['PredictedOutcomeDt'] = y_pred_dt
    X_train_survival['PredictedOutcomeRf'] = y_pred_rf
    X_train_survival['PredictedOutcomeXgb'] = y_pred_xgb

    # ...then only the rows whose true outcome is 0 are kept, and each survival model is
    # fitted to reproduce the prediction of the corresponding first model on those rows
    X_train_survival_red = X_train_survival.loc[X_train_survival['Outcome']==0]
    y_test_survival_dt = X_train_survival_red['PredictedOutcomeDt']
    y_test_survival_rf = X_train_survival_red['PredictedOutcomeRf']
    y_test_survival_xgb = X_train_survival_red['PredictedOutcomeXgb']

    # NOTE(review): the same three objects are refitted on every fold, so presumably only
    # the fit of the last fold is retained - confirm that this is intended.
    dt_model_survival = dt_model_survival.fit(X_train_survival_red.loc[:,fact_ax], y_test_survival_dt)
    rf_model_survival = rf_model_survival.fit(X_train_survival_red.loc[:,fact_ax], y_test_survival_rf)
    xgb_model_survival = xgb_model_survival.fit(X_train_survival_red.loc[:,fact_ax], y_test_survival_xgb)


    # Adding the row labels of the wrong predictions to the corresponding list
    dict_error_analysis['DecisionTree'] += list(X_train_survival[X_train_survival['Outcome'] != X_train_survival['PredictedOutcomeDt']].index)
    dict_error_analysis['RandomForest'] += list(X_train_survival[X_train_survival['Outcome'] != X_train_survival['PredictedOutcomeRf']].index)
    dict_error_analysis['XGBoost'] += list(X_train_survival[X_train_survival['Outcome'] != X_train_survival['PredictedOutcomeXgb']].index)

    # Residual error of each model on this fold (mean squared error)
    residual_error_dt = mean_squared_error(y_test, y_pred_dt)
    residual_errors_dt.append(round(residual_error_dt, 4))

    residual_error_rf = mean_squared_error(y_test, y_pred_rf)
    residual_errors_rf.append(round(residual_error_rf,4))

    residual_error_xgb = mean_squared_error(y_test, y_pred_xgb)
    residual_errors_xgb.append(round(residual_error_xgb,4))

    # Confusion matrix of each model; the third argument is the fitted model
    # (the XGBoost call receives xgb_model_CV, like the two other calls, not its predictions)
    metrics_dt = display_confusion_matrix(X_test, y_test, dt_model_CV, name=None, y_pred=y_pred_dt, return_metrics=True, save = True,
                             outdir=fold_dir, show=False)
    metrics_rf = display_confusion_matrix(X_test, y_test, rf_model_CV, name=None, y_pred=y_pred_rf, return_metrics=True, save = True,
                             outdir=fold_dir, show=False)
    metrics_xgb = display_confusion_matrix(X_test, y_test, xgb_model_CV, name=None, y_pred=y_pred_xgb, return_metrics=True, save = True,
                             outdir=fold_dir, show=False)

    # Saving the results of the fold: MSE plus elements 1 and 2 of the returned metrics
    # (stored as 'SB' and 'ST')
    dict_result['DecisionTree'][str(fold)] = {'MSE' : round(residual_error_dt,4),
                                    'SB' : round(metrics_dt[1],4),
                                    'ST' : round(metrics_dt[2],4)}
    dict_result['RandomForest'][str(fold)] = {'MSE' : round(residual_error_rf,4),
                                    'SB' : round(metrics_rf[1],4),
                                    'ST' : round(metrics_rf[2],4)}
    dict_result['XGBoost'][str(fold)] = {'MSE' : round(residual_error_xgb,4),
                                    'SB' : round(metrics_xgb[1],4),
                                    'ST' : round(metrics_xgb[2],4)}

# Decision Tree: residual error of every fold and average over the folds
print("Residual error (fold):", residual_errors_dt)
print("Average residual error:", round(sum(residual_errors_dt) / len(residual_errors_dt),4))

# Random Forest: residual error of every fold and average over the folds
print("Residual error (fold):", residual_errors_rf)
print("Average residual error:", round(sum(residual_errors_rf) / len(residual_errors_rf),4))

# XGBoost: residual error of every fold and average over the folds
print("Residual error (fold):", residual_errors_xgb)
print("Average residual error:", round(sum(residual_errors_xgb) / len(residual_errors_xgb),4))

In [None]:
# Saving, for each model, the rows of `data` that were wrongly predicted during the cross-validation
cv_error_reports = {
    'DecisionTree': 'Error_Analysis_CV_dt',
    'RandomForest': 'Error_Analysis_CV_rf',
    'XGBoost': 'Error_Analysis_CV_xgb',
}
for model_name, report_name in cv_error_reports.items():
    misclassified_rows = data.loc[dict_error_analysis[model_name], :]
    save_dataframe(outdir=path+'Error analysis/', dataframe=misclassified_rows, file_type='xlsx', name=report_name)

In [None]:
# Mean and standard deviation of every metric over the folds, one column per model
# Each cell holds the string form of a {'MSE': ..., 'SB': ..., 'ST': ...} dictionary
df_AVG = pd.DataFrame()
metric_names = ('MSE', 'SB', 'ST')
for typeModel in ['DecisionTree','RandomForest','XGBoost']:
    fold_results = list(dict_result[typeModel].values())
    # Values of each metric, in fold order
    scores_by_metric = {
        metric: [fold_result[metric] for fold_result in fold_results]
        for metric in metric_names
    }
    averages = {metric: round(np.mean(scores), 4) for metric, scores in scores_by_metric.items()}
    deviations = {metric: round(np.std(scores), 4) for metric, scores in scores_by_metric.items()}
    df_AVG.loc['AVG', typeModel] = str(averages)
    df_AVG.loc['STD', typeModel] = str(deviations)

In [None]:
# Results table: one row per fold and one column per model, followed by the AVG and STD rows
per_fold_results = pd.DataFrame(dict_result)
df_result = pd.concat([per_fold_results, df_AVG])

In [None]:
# Leave-one-out cross-validation of the three classifiers on the PCA-reduced data
# (the name `skfold` is reused from the stratified k-fold cell)
skfold = LeaveOneOut()

# Per-split residual errors (mean squared error on the held-out rows), one list per model
residual_errors_dt = []
residual_errors_rf = []
residual_errors_xgb = []

# Predictions of each model, one array appended per split
dict_PredictedOutcome = {'DecisionTree':[],
                        'RandomForest':[],
                        'XGBoost':[]}

# True outcomes of the held-out rows, accumulated split after split; the code below reads
# them back through the column label 0
pred_test = pd.DataFrame()
# Row labels of the wrong predictions (this overwrites the dictionary filled by the k-fold loop)
dict_error_analysis = {'DecisionTree':[],
               'RandomForest':[],
               'XGBoost':[]}

# Iterate over each split
for train_index, test_index in skfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fresh models for this split
    dt_model_CV = DecisionTreeClassifier()
    rf_model_CV = RandomForestClassifier()
    xgb_model_CV = XGBClassifier(eval_metric='mlogloss')
    
    dt_model_CV = dt_model_CV.fit(X_train, y_train)
    rf_model_CV = rf_model_CV.fit(X_train, y_train)
    # Warnings raised while fitting XGBoost are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        xgb_model_CV = xgb_model_CV.fit(X_train, y_train)

    # Predictions on the held-out rows, stored per model, and their true outcome appended to pred_test
    y_pred_dt = dt_model_CV.predict(X_test)
    dict_PredictedOutcome['DecisionTree'].append(y_pred_dt)
    y_pred_rf = rf_model_CV.predict(X_test)
    dict_PredictedOutcome['RandomForest'].append(y_pred_rf)
    y_pred_xgb = xgb_model_CV.predict(X_test)
    dict_PredictedOutcome['XGBoost'].append(y_pred_xgb)
    pred_test = pd.concat([pred_test,y_test])

    # Residual error of each model on this split (mean squared error)
    residual_error_dt = mean_squared_error(y_test, y_pred_dt)
    residual_errors_dt.append(residual_error_dt)

    residual_error_rf = mean_squared_error(y_test, y_pred_rf)
    residual_errors_rf.append(residual_error_rf)

    residual_error_xgb = mean_squared_error(y_test, y_pred_xgb)
    residual_errors_xgb.append(residual_error_xgb)

# The first element of every stored prediction array becomes a column next to the true outcomes
# (rows are in split order)
pred_test.loc[:,'PredictedOutcomeDt'] = [int(arr[0]) for arr in dict_PredictedOutcome['DecisionTree']]
pred_test.loc[:,'PredictedOutcomeRf'] = [int(arr[0]) for arr in dict_PredictedOutcome['RandomForest']]
pred_test.loc[:,'PredictedOutcomeXgb'] = [int(arr[0]) for arr in dict_PredictedOutcome['XGBoost']]

# Adding the row labels of the wrong predictions (true outcome in column 0 differs from the
# prediction) to the corresponding list
dict_error_analysis['DecisionTree'] += list(pred_test[pred_test[0] != pred_test['PredictedOutcomeDt']].index)
dict_error_analysis['RandomForest'] += list(pred_test[pred_test[0] != pred_test['PredictedOutcomeRf']].index)
dict_error_analysis['XGBoost'] += list(pred_test[pred_test[0] != pred_test['PredictedOutcomeXgb']].index)

In [None]:
# Saving, for each model, the rows of `data` that were wrongly predicted during the leave-one-out run
loo_error_reports = {
    'DecisionTree': 'Error_Analysis_LOO_dt',
    'RandomForest': 'Error_Analysis_LOO_rf',
    'XGBoost': 'Error_Analysis_LOO_xgb',
}
for model_name, report_name in loo_error_reports.items():
    misclassified_rows = data.loc[dict_error_analysis[model_name], :]
    save_dataframe(outdir=path+'Error analysis/', dataframe=misclassified_rows, file_type='xlsx', name=report_name)

In [None]:
## Metrics of the leave-one-out run, computed once per model over all the held-out predictions
df_LOO = pd.DataFrame()
for typeModel in ['DecisionTree','RandomForest','XGBoost']:
    true_outcomes = pred_test[0]
    predicted_outcomes = dict_PredictedOutcome[typeModel]

    # Confusion matrix over all the splits (rows: true class, columns: predicted class)
    cm = confusion_matrix(true_outcomes, predicted_outcomes)
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    false_neg, true_pos = cm[1, 0], cm[1, 1]

    # Sensitivity (recall of class 1) and specificity (recall of class 0)
    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (true_neg + false_pos)

    # Mean squared error between the true and the predicted outcomes
    mse = mean_squared_error(true_outcomes, predicted_outcomes)

    loo_summary = {'MSE': round(mse, 4),
                   'SB': round(sensitivity, 4),
                   'ST': round(specificity, 4)}
    df_LOO.loc['LOO', typeModel] = str(loo_summary)

    print("Sensitivity:", round(sensitivity,4))
    print("Specificity:", round(specificity,4))
    print("Mean Squared Error:", round(mse,4))

In [None]:
# Appending the leave-one-out row to the results and saving the whole table
def _strip_braces(cell):
    # Every cell is turned into text and its curly braces are removed
    return str(cell).replace('{', '').replace('}', '')

df_result = pd.concat([df_result, df_LOO])
df_result = df_result.applymap(_strip_braces)
save_dataframe(outdir=path, dataframe=df_result, file_type='xlsx', name='results')

END of PCA study