In [None]:
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.isotonic import IsotonicRegression as IR
from sklearn.linear_model import LogisticRegression as LR
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def isotonic(preds, labels, test_preds):
    """Calibrate scores with an isotonic regression fitted on a labelled set.

    The regression is fitted on ``(preds, labels)`` and then applied both to
    the scores it was fitted on and to ``test_preds``.  The estimator is
    created with ``out_of_bounds='clip'``.

    Parameters
    ----------
    preds : array-like
        Uncalibrated scores used to fit the calibrator.
    labels : array-like
        Targets paired element-wise with ``preds``.
    test_preds : array-like
        Uncalibrated scores to calibrate with the fitted model.

    Returns
    -------
    tuple
        ``(calibrated_preds, calibrated_test_preds)`` as returned by the
        estimator's ``transform``.
    """
    # Same conversion order as the arguments: preds, labels, test_preds.
    fit_scores, fit_targets, held_out_scores = (
        np.array(values) for values in (preds, labels, test_preds)
    )

    calibrator = IR(out_of_bounds='clip')
    calibrator.fit(fit_scores, fit_targets)

    # The fitted mapping is applied to new data with transform(); it is NOT
    # refitted on the held-out scores.
    return calibrator.transform(fit_scores), calibrator.transform(held_out_scores)

In [None]:
import heapq
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score

def Select_Threshold(df):
    """Choose a decision threshold for calibrated probabilities.

    Every threshold in ``np.arange(0, 1.05, 0.05)`` is used to binarise
    ``df['calibrated_prediction']`` (1 when the value is ``>= threshold``,
    else 0).  For each threshold the F1 score and the balanced accuracy
    against ``df['y_true']`` are recorded.  The returned threshold is, among
    the three thresholds with the highest F1 score, the one with the highest
    balanced accuracy.  If several of those candidates share the best
    balanced accuracy, the one with the higher F1 rank wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'calibrated_prediction'`` and ``'y_true'``.
        The frame is only read: no ``'y_pred'`` column is required, and none
        is written to it.

    Returns
    -------
    opt_threshold : float
        The selected threshold.
    df_varying_threshold : pandas.DataFrame
        One row per threshold with the columns ``'threshold'``,
        ``'f1_score'`` and ``'balanced_accuracy'``.
    """
    y_true = df["y_true"].values
    calibrated = df["calibrated_prediction"].values

    full_threshold_list = []
    for threshold in np.arange(0, 1.05, 0.05):
        # Binarise into a local array rather than into a column of the
        # caller's frame, so the input is left untouched.
        y_pred = (calibrated >= threshold).astype(int)

        f1_C1 = f1_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

        full_threshold_list.append([threshold, f1_C1, balanced_accuracy])

    df_varying_threshold = pd.DataFrame(
        full_threshold_list,
        columns=['threshold', 'f1_score', 'balanced_accuracy'],
    )

    f1_scores = df_varying_threshold["f1_score"].tolist()
    bal_acc_values = df_varying_threshold["balanced_accuracy"].tolist()
    thresholds = df_varying_threshold["threshold"].values

    # Row positions of the three highest F1 scores (ties keep the earlier row).
    top_f1_rows = heapq.nlargest(3, range(len(f1_scores)), key=f1_scores.__getitem__)

    # Pick the winner by row position, not by looking its balanced-accuracy
    # value up in the full list: a value lookup can land on an earlier
    # threshold that merely shares the same balanced accuracy but is not one
    # of the top-F1 candidates.
    best_row = max(top_f1_rows, key=bal_acc_values.__getitem__)
    opt_threshold = thresholds[best_row]

    return opt_threshold, df_varying_threshold

In [None]:
def generate_calibrated_preds(model_path, file, trial):
    """Calibrate one trial's predictions and write the results to CSV.

    Reads ``<model_path>/val/<file>`` and ``<model_path>/test/<file>``, fits
    the isotonic calibration on the validation ``'prediction'`` / ``'y_true'``
    columns, selects a decision threshold on the calibrated validation scores
    with ``Select_Threshold`` and applies that threshold to both splits.

    Files written (all without the index column):

    * ``<model_path>/val/val_isotonic_<trial>.csv``
    * ``<model_path>/val/Threshold_F1_BalAcc_<trial>.csv``
    * ``<model_path>/test/test_isotonic_<trial>.csv``

    (For the subgroup-threshold workflow the output names were suffixed with
    ``file`` instead of ``trial``.)

    Parameters
    ----------
    model_path : str
        Directory that contains the ``val`` and ``test`` sub-directories.
    file : str
        CSV file name present in both sub-directories.  Both files need a
        ``'prediction'`` column; the validation file also needs ``'y_true'``.
    trial : int
        Identifier used in the output file names.

    Side effects
    ------------
    Prints the selected threshold.
    """
    import os  # local import: keeps this cell independent of later import cells

    val_dir = os.path.join(model_path, "val")
    test_dir = os.path.join(model_path, "test")

    df_val = pd.read_csv(os.path.join(val_dir, file))
    df_test = pd.read_csv(os.path.join(test_dir, file))

    # calibration
    val_calibrated, test_calibrated = isotonic(
        df_val['prediction'], df_val['y_true'], df_test['prediction']
    )

    # calibrated validation
    df_val['calibrated_prediction'] = val_calibrated
    # Placeholder so the frame already carries a 'y_pred' column when it is
    # handed to Select_Threshold; it is overwritten below.
    # (`np.int` was removed from NumPy; the builtin `int` is the replacement.)
    df_val['y_pred'] = (val_calibrated >= 0.5).astype(int)

    opt_threshold, df_opt_threshold = Select_Threshold(df_val)
    print(opt_threshold)

    # Store the validation decisions at the selected threshold so the saved
    # validation file is consistent with the test file, instead of carrying
    # whatever was left in the column during threshold selection.
    df_val['y_pred'] = (val_calibrated >= opt_threshold).astype(int)

    df_val.to_csv(
        os.path.join(val_dir, 'val_isotonic_' + str(trial) + '.csv'), index=False
    )
    df_opt_threshold.to_csv(
        os.path.join(val_dir, 'Threshold_F1_BalAcc_' + str(trial) + '.csv'), index=False
    )

    # calibrated test
    df_test['calibrated_prediction'] = test_calibrated
    df_test['y_pred'] = (test_calibrated >= opt_threshold).astype(int)
    df_test.to_csv(
        os.path.join(test_dir, 'test_isotonic_' + str(trial) + '.csv'), index=False
    )

In [None]:
from os import listdir
from os.path import isfile, join
import fnmatch

# Put the test and validation csv files in csv_files/MODEL_NAME/test/ and
# csv_files/MODEL_NAME/val/ respectively.
# *************** change model name here ********************
# Built with join() so the path works with any OS path separator.
model_path = join("csv_files", "original")
mypath_val = join(model_path, "val")

# generate_calibrated_preds writes its results into the same val directory
# under these name prefixes; skip them so that running this cell again does
# not try to calibrate its own earlier outputs.
generated_prefixes = ("val_isotonic_", "Threshold_F1_BalAcc_")

# Take the input csv files in that directory.  listdir() returns names in
# arbitrary order, so sort them to keep the trial number -> file mapping stable.
onlyfiles = sorted(
    f for f in listdir(mypath_val)
    if fnmatch.fnmatch(f, '*.csv') and not f.startswith(generated_prefixes)
)
print(onlyfiles)

# for all csv files, compute calibration
for trial, csv_name in enumerate(onlyfiles):
    generate_calibrated_preds(model_path, csv_name, trial)
