The purpose of this notebook is to:
* extract performance metrics, ROC curves, precision-recall curves, f1-proba, prec-proba, spec-proba, recall-proba curves
* extract calibration curves
* extract calibration-corrections

Given: a `.parquet` file with the predicted probabilities for all classes, the fold numbers, the IDs and the actual class values.

```
@inproceedings{guo2017calibration,
  title={On calibration of modern neural networks},
  author={Guo, Chuan and Pleiss, Geoff and Sun, Yu and Weinberger, Kilian Q},
  booktitle={International conference on machine learning},
  pages={1321--1330},
  year={2017},
  organization={PMLR}
}
```

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm
from collections import defaultdict
from typing import Literal

from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils


In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures created from here on.
plt.style.use('ggplot')

In [None]:
# Root directory: used as the working directory and as the prefix for every
# output location that is written to.
start_path = r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data'

In [None]:
# Make start_path the working directory so that the relative parquet paths
# passed to pd.read_parquet resolve against it.
os.chdir(start_path)

In [31]:
# Switches that decide which analysis cells in this notebook do any work.
RUN_CALIBRATION_PLOTS = False  # gates tree_utils.create_calibration_plots
RUN_NBA = False  # gates tree_utils.net_benefit_curve_plot
RUN_ROC = False  # gates tree_utils.make_roc_plots
RUN_PRC = True  # gates tree_utils.make_precisionRecall_plots
RUN_F1C = True  # gates tree_utils.make_f1_plots
RUN_RC = True  # gates tree_utils.make_recall_plots
RUN_PC = True  # gates tree_utils.make_precision_plots
RUN_NPVC = True  # gates tree_utils.make_npv_plots
ONLY_DATA = True  # forwarded as only_data= to the F1 / recall / precision / NPV helpers

In [None]:
# Registries filled by the calibration cells and read by the plotting cells.
RESULTS = {}  # key (e.g. 'CONDUCTION', 'CONDUCTION_CALIBRATED') -> results DataFrame
PATHS = {}  # key (e.g. 'CONDUCTION') -> output directory, relative to start_path

## Calibration 

In [None]:
# CONDUCTION (multiclass): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\conduction"
df = pd.read_parquet(
    os.path.join(output_path, "CONDUCTION_MultiClass_wMorphoMap_RESULTS.parquet")
)
RESULTS['CONDUCTION'], PATHS['CONDUCTION'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['CONDUCTION'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['CONDUCTION'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['CONDUCTION_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )


In [None]:
# MUSCLE (multiclass): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\muscle"
df = pd.read_parquet(
    os.path.join(output_path, "MUSCLE_MultiClass_wMorphoMap_RESULTS.parquet")
)
RESULTS['MUSCLE'], PATHS['MUSCLE'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['MUSCLE'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['MUSCLE'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['MUSCLE_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )

In [None]:
# AXIS (multiclass): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\axis"
df = pd.read_parquet(
    os.path.join(output_path, "AXIS_MultiClass_wMorphoMap_RESULTS.parquet")
)
RESULTS['AXIS'], PATHS['AXIS'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['AXIS'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['AXIS'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['AXIS_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )

In [None]:
# CONDUCTION (binary): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\conduction_BinaryClass"
df = pd.read_parquet(
    os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet")
)
RESULTS['CONDUCTION_BINARY'], PATHS['CONDUCTION_BINARY'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['CONDUCTION_BINARY'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['CONDUCTION_BINARY'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['CONDUCTION_BINARY_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )

In [None]:
# MUSCLE (binary): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\muscle_BinaryClass"
df = pd.read_parquet(
    os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet")
)
RESULTS['MUSCLE_BINARY'], PATHS['MUSCLE_BINARY'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['MUSCLE_BINARY'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['MUSCLE_BINARY'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['MUSCLE_BINARY_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )

In [None]:
# AXIS (binary): load the stored results and register them.
output_path = r"G_Output\2_Data\CustomTree\axis_BinaryClass"
df = pd.read_parquet(
    os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet")
)
RESULTS['AXIS_BINARY'], PATHS['AXIS_BINARY'] = df.copy(), output_path

# Derive the calibrated frame from a fresh copy (how='sigmoid'), persist it
# next to the raw results and register it; the second return value is unused.
cal_df, _ = tree_utils.add_calibrated_values(RESULTS['AXIS_BINARY'].copy(), how='sigmoid')
calibrated_file = os.path.join(start_path, PATHS['AXIS_BINARY'], 'Calibrated_RESULTS.parquet')
cal_df.to_parquet(calibrated_file)
RESULTS['AXIS_BINARY_CALIBRATED'] = cal_df

# Optional calibration plots, first for the loaded frame, then for the calibrated one.
if RUN_CALIBRATION_PLOTS:
    plot_dir = os.path.join(start_path, output_path)
    tree_utils.create_calibration_plots(df, output_path=plot_dir, write_out=True)
    tree_utils.create_calibration_plots(
        cal_df, output_path=plot_dir, write_out=True, suffix='_calibrated_mean'
    )

## Net benefit analysis

In [None]:
# Net-benefit curves per registered model: uncalibrated on train, uncalibrated
# on test, and calibrated on test. Skipped entirely unless RUN_NBA is set.
# NOTE(review): every plot title says "Binary model", also for the multiclass
# keys -- confirm that this is intended.
if RUN_NBA:
    for k in RESULTS:
        print(f"Processing {k}...")
        # '*_CALIBRATED' entries are plotted together with their raw counterpart.
        if 'CALIBRATED' in k:
            continue

        nba_dir = os.path.join(start_path, PATHS[k])

        # Uncalibrated probabilities, train split.
        tree_utils.net_benefit_curve_plot(
            RESULTS[k],
            output_path=nba_dir,
            true_col_prefix='Y_true',
            pred_col_prefix='Y_pred',
            xlim=[0, 0.5],
            ylim=[-0.1, 0.5],
            show_plot=False,
            plot_title="Binary model, not calibrated",
            file_suffix="_train",
            dataset="train",
        )

        # Uncalibrated probabilities, test split. This call used to pass
        # file_suffix="_calibrated", i.e. the same suffix and directory as the
        # calibrated plot below, which mislabels this output and presumably
        # lets the two collide; "_test" mirrors the "_train" call above.
        tree_utils.net_benefit_curve_plot(
            RESULTS[k],
            output_path=nba_dir,
            true_col_prefix='Y_true',
            pred_col_prefix='Y_pred',
            xlim=[0, 0.5],
            ylim=[-0.1, 0.5],
            show_plot=False,
            plot_title="Binary model, not calibrated",
            file_suffix="_test",
            dataset="test",
        )

        # Calibrated probabilities, test split.
        tree_utils.net_benefit_curve_plot(
            RESULTS[f'{k}_CALIBRATED'],
            output_path=nba_dir,
            true_col_prefix='Y_true',
            pred_col_prefix='Y_pred',
            xlim=[0, 0.5],
            ylim=[-0.1, 0.5],
            show_plot=False,
            plot_title="Binary model, calibrated",
            file_suffix="_calibrated",
            dataset="test",
            calibrated=True,
        )

## Performance

* ROC
* precision-recall 

In [None]:
# ROC plots for each registered model and for its calibrated counterpart.
if RUN_ROC:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_roc_plots(RESULTS[k], OutPath=out_dir, Target=k)
        tree_utils.make_roc_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
        )

In [None]:
# Precision-recall plots for each registered model and for its calibrated counterpart.
if RUN_PRC:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_precisionRecall_plots(RESULTS[k], OutPath=out_dir, Target=k)
        tree_utils.make_precisionRecall_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
        )

In [32]:
# F1 plots for each registered model and for its calibrated counterpart;
# ONLY_DATA is handed through to the helper as only_data.
if RUN_F1C:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_f1_plots(RESULTS[k], OutPath=out_dir, Target=k, only_data=ONLY_DATA)
        tree_utils.make_f1_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
            only_data=ONLY_DATA,
        )

In [33]:
# Recall plots for each registered model and for its calibrated counterpart;
# ONLY_DATA is handed through to the helper as only_data.
if RUN_RC:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_recall_plots(RESULTS[k], OutPath=out_dir, Target=k, only_data=ONLY_DATA)
        tree_utils.make_recall_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
            only_data=ONLY_DATA,
        )

In [34]:
# Precision plots for each registered model and for its calibrated counterpart;
# ONLY_DATA is handed through to the helper as only_data.
if RUN_PC:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_precision_plots(RESULTS[k], OutPath=out_dir, Target=k, only_data=ONLY_DATA)
        tree_utils.make_precision_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
            only_data=ONLY_DATA,
        )

In [35]:
# NPV plots for each registered model and for its calibrated counterpart;
# ONLY_DATA is handed through to the helper as only_data.
if RUN_NPVC:
    for k in RESULTS:
        # '*_CALIBRATED' entries are reached through their raw key below.
        if 'CALIBRATED' in k:
            continue
        out_dir = os.path.join(start_path, PATHS[k])
        tree_utils.make_npv_plots(RESULTS[k], OutPath=out_dir, Target=k, only_data=ONLY_DATA)
        tree_utils.make_npv_plots(
            RESULTS[f'{k}_CALIBRATED'],
            OutPath=out_dir,
            Target=k,
            suffix='_calibrated_mean',
            only_data=ONLY_DATA,
        )