The purpose here is to 
* extract performance metrics, ROC curves, precision-recall curves, f1-proba, prec-proba, spec-proba, recall-proba curves
* extract calibration curves
* extract calibration-corrections

Given: a `.parquet` with the probas for all classes, the fold numbers, the id's and the actual class values

```
@inproceedings{guo2017calibration,
  title={On calibration of modern neural networks},
  author={Guo, Chuan and Pleiss, Geoff and Sun, Yu and Weinberger, Kilian Q},
  booktitle={International conference on machine learning},
  pages={1321--1330},
  year={2017},
  organization={PMLR}
}
```

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm
from collections import defaultdict
from typing import Literal

from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils


In [None]:
plt.style.use('ggplot')

In [None]:
start_path = r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data'

In [None]:
os.chdir(start_path)

In [None]:
RUN_CALIBRATION_PLOTS = False

In [None]:
RESULTS = {}
PATHS = {}

## Calibration 

In [None]:
output_path = r"G_Output\2_Data\CustomTree\conduction"
df = pd.read_parquet(os.path.join(output_path, "CONDUCTION_MultiClass_wMorphoMap_RESULTS.parquet"))
RESULTS['CONDUCTION'] = df.copy()
PATHS['CONDUCTION'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
output_path = r"G_Output\2_Data\CustomTree\muscle"
df = pd.read_parquet(os.path.join(output_path, "MUSCLE_MultiClass_wMorphoMap_RESULTS.parquet"))
RESULTS['MUSCLE'] = df.copy()
PATHS['MUSCLE'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
output_path = r"G_Output\2_Data\CustomTree\axis"
df = pd.read_parquet(os.path.join(output_path, "AXIS_MultiClass_wMorphoMap_RESULTS.parquet"))
RESULTS['AXIS'] = df.copy()
PATHS['AXIS'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
output_path = r"G_Output\2_Data\CustomTree\conduction_BinaryClass"
df = pd.read_parquet(os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet"))
RESULTS['CONDUCTION_BINARY'] = df.copy()
PATHS['CONDUCTION_BINARY'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
output_path = r"G_Output\2_Data\CustomTree\muscle_BinaryClass"
df = pd.read_parquet(os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet"))
RESULTS['MUSCLE_BINARY'] = df.copy()
PATHS['MUSCLE_BINARY'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
output_path = r"G_Output\2_Data\CustomTree\axis_BinaryClass"
df = pd.read_parquet(os.path.join(output_path, "results_BinaryClass_wMorphoMap.parquet"))
RESULTS['AXIS_BINARY'] = df.copy()
PATHS['AXIS_BINARY'] = output_path

if RUN_CALIBRATION_PLOTS:
    tree_utils.create_calibration_plots(df, output_path=os.path.join(start_path, output_path), write_out=True)

In [None]:
from sklearn.calibration import IsotonicRegression, _SigmoidCalibration
from sklearn.linear_model import LinearRegression

In [None]:
def calibrater(y_true, y_preds, how: Literal=['isotonic', 'linear', 'sigmoid']):
    if how=='isotonic':
        calibrator = IsotonicRegression(out_of_bounds='clip')
    elif how=='linear':
        calibrator = LinearRegression(positive=True)
    elif how=='sigmoid':
        calibrator = _SigmoidCalibration()
    else:
        raise ValueError("method should be one of isotonic, linear or sigmoid")
        
    calibrator.fit(y_preds,y_true)    
    return calibrator

In [None]:
def add_calibrated_values(df, how='isotonic'):
    CALIBRATOR = defaultdict(lambda: defaultdict(list))

    pred_strings = [c for c in df.columns if c.startswith('Y_pred')]
    Classes = set([s.split("_")[3] for s in pred_strings])
    Models = set([s.split("_")[2] for s in pred_strings])
    Folds = df.Fold.unique().tolist()
    Repeats = df.Repeat.unique().tolist()
    
    for _class in Classes:
        for _mod in Models:
            new_col = f'Y_pred_{_mod}_{_class}_calibrated_mean'
            new_col_std = f'Y_pred_{_mod}_{_class}_calibrated_std'   
            df.loc[new_col] = np.nan  
            df.loc[new_col_std] = np.nan 
            for _Repeat in Repeats:
                tmp_calibrator_list = []
                for _Fold in Folds:       
                    conds = (df.Repeat==_Repeat) & (df.Fold==_Fold) & (df.Dataset=='test')
                    Y_true = df.loc[conds, f'Y_true_{_class}'].values
                    Y_pred = df.loc[conds, f'Y_pred_{_mod}_{_class}'].values
                    
                    Calibration_model = calibrater(Y_true, Y_pred, how=how)
                    
                    tmp_calibrator_list.append(Calibration_model)
                    CALIBRATOR[_class][_mod].append(Calibration_model)
                # now we collect the calibrated probas for all folds based on all the calibrations
                _conds = (df.Repeat==_Repeat)  & (df.Dataset=='test')
                calibrated_list = []
                _y_preds = df.loc[_conds, f'Y_pred_{_mod}_{_class}'].values
                for _Fold, _calibrater in enumerate(tmp_calibrator_list):
                    calibrated_list.append(_calibrater.predict(_y_preds))       
                    
                df.loc[_conds, new_col] = np.mean(calibrated_list, axis=0)
                df.loc[_conds, new_col_std] = np.std(calibrated_list, axis=0)
    df['indices'] = df['indices'].astype(int)
    df['Fold'] = df['Fold'].astype(int)
    df['Repeat'] = df['Repeat'].astype(int)
    return df, CALIBRATOR

In [20]:
out_df, _ = add_calibrated_values(RESULTS['AXIS_BINARY'].copy(), how='sigmoid')
out_df.to_parquet(os.path.join(start_path, PATHS['AXIS_BINARY'], 'Calibrated_RESULTS.parquet'))

ArrowInvalid: ("Could not convert 'Y_pred_XGB_Normal_calibrated_mean' with type str: tried to convert to int64", 'Conversion failed for column None with type object')

## Net benefit analysis

In [None]:
tree_utils.net_benefit_curve_plot(RESULTS['AXIS_BINARY'], 
                                  output_path=os.path.join(start_path, PATHS['AXIS_BINARY']), 
                                  true_col_prefix='Y_true',
                                  pred_col_prefix='Y_pred',
                                  xlim=[0, 0.5],
                                  ylim=[-0.1, 0.3],
                                  plot_title="Binary model, not calibrated",
                                  file_suffix="",
                                  show_plot=False)

## Performance