In [None]:
from azureml.core import Workspace, Dataset, Datastore
# Identifiers of the Azure ML workspace this notebook connects to.
subscription_id = 'f8c5aac3-29fc-4387-858a-1f61722fb57a'
resource_group = 'forskerpl-n0ybkr-rg'
workspace_name = 'forskerpl-n0ybkr-mlw'

# Keyword arguments spell out which identifier goes where.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
  

import os
from os.path import join

import numpy as np

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import scipy
import pandas as pd

import torch

In [None]:
# Mount the 'XUE' folder of the "workspaceblobstore" datastore so that its
# files can be read through ordinary local paths.
datastore = Datastore.get(workspace, "workspaceblobstore")
dataset = Dataset.File.from_files(path=(datastore, 'XUE'))
mounted_path = dataset.mount()
mounted_path.start()
# Local directory of the mount; later cells join relative paths onto it.
# NOTE(review): no matching stop() call is visible in this notebook - confirm
# the mount is released somewhere.
mpath = mounted_path.mount_point

In [None]:
# Load the test-set targets and predicted probabilities of the five
# cross-validation folds (fold_0 .. fold_4) and stack them into one array each.
CV_MODEL_DIR = 'models/finetune_ANTI_PCSK9_censored_7_days_cv_5folds_all_data'
N_CV_FOLDS = 5


def _load_fold_array(fold, file_stem, key):
    """Return array `key` from `fold_<fold>/checkpoints/<file_stem>_test_999.npz`.

    The archive is opened in a `with` block so that its file handle is closed
    as soon as the array has been read into memory.
    """
    path = join(mpath, f'{CV_MODEL_DIR}/fold_{fold}/checkpoints/{file_stem}_test_999.npz')
    with np.load(path) as archive:
        return archive[key]


# One concatenate per array instead of re-copying the running result for every fold.
y_true = np.concatenate([_load_fold_array(fold, 'targets', 'targets') for fold in range(N_CV_FOLDS)])
y_pred = np.concatenate([_load_fold_array(fold, 'probas', 'probas') for fold in range(N_CV_FOLDS)])

In [None]:
# distribution of the predicted probabilities (log count on y axis) and color positives and negatives
def plot_probabilities(y_true, y_pred, title):
    """Plot overlaid histograms of the predicted probabilities per class.

    Positives (label 1, red) and negatives (label 0, blue) are binned on the
    same 100 bin edges, computed from all of `y_pred`, so that the bar heights
    of the two histograms are directly comparable. Counts are shown on a
    logarithmic y axis.

    Args:
        y_true: array-like of binary labels (1 = positive, 0 = negative).
        y_pred: array-like of predicted probabilities, aligned with `y_true`.
        title: title of the figure.
    """
    # Accept plain sequences as well as arrays; boolean masking needs ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Shared edges: letting each hist() call choose its own 100 bins would give
    # the two classes different bin widths whenever their value ranges differ.
    bin_edges = np.histogram_bin_edges(y_pred, bins=100)

    plt.figure()
    plt.hist(y_pred[y_true == 1], bins=bin_edges, color='r', alpha=0.5, label='positives', log=True)
    plt.hist(y_pred[y_true == 0], bins=bin_edges, color='b', alpha=0.5, label='negatives', log=True)
    plt.legend()
    plt.title(title)
    plt.xlabel('predicted probability')
    plt.ylabel('log count')
    plt.show()

In [None]:
def plot_ps_positive_negatives(ax, probas, targets, title, bins=20, bin_edges=None):
    """Draw per-class histograms of `probas` on `ax` with a log-scaled y axis.

    Args:
        ax: axes-like object providing `hist`, `set_yscale` and `set_title`.
        probas: array of predicted probabilities.
        targets: array of labels aligned with `probas`; entries equal to 0 are
            drawn as 'Negative' (blue), entries equal to 1 as 'Positive' (red).
        title: title set on `ax`.
        bins: number of bins used to derive the edges when `bin_edges` is None.
        bin_edges: explicit bin edges shared by both histograms; when None they
            are computed from all of `probas`.
    """
    edges = bin_edges
    if edges is None:
        edges = np.histogram_bin_edges(probas, bins=bins)

    # (label value, legend label, colour); negatives are drawn first.
    class_styles = ((0, 'Negative', 'b'), (1, 'Positive', 'r'))
    for label_value, legend_label, colour in class_styles:
        ax.hist(probas[targets == label_value], bins=edges, alpha=0.5, label=legend_label, color=colour)

    ax.set_yscale('log')
    ax.set_title(title)

In [None]:
# use y_true and y_pred to plot calibration curve
from sklearn.calibration import calibration_curve
def plot_calibration_curve(y_true, y_pred, title):
    """Show a calibration (reliability) bar chart of `y_pred` against `y_true`.

    The predictions are grouped with `calibration_curve(..., n_bins=10)`. One
    bar of width 0.1 is drawn per returned bin, positioned at the bin's mean
    predicted value with the bin's fraction of positives as its height. The
    dashed gray diagonal marks perfect calibration.

    Args:
        y_true: binary labels.
        y_pred: predicted probabilities aligned with `y_true`.
        title: title of the figure.
    """
    ax = plt.figure().gca()
    frac_positive, mean_predicted = calibration_curve(y_true, y_pred, n_bins=10)
    ax.bar(mean_predicted, frac_positive, width=0.1)
    ax.plot([0, 1], [0, 1], '--', color='gray')
    ax.set_xlabel('predicted probability')
    ax.set_ylabel('fraction of positives')
    ax.set_title(title)
    plt.show()


In [None]:
# Use platt scaling to calibrate the predicted probabilities
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def platt_scaling(y_true, y_pred):
    """Platt-scale predicted scores and return the result for a held-out half.

    The samples are split 50/50 with a fixed seed. A one-feature logistic
    regression (a sigmoid of the score) is fitted on the first half only and
    applied to the second half, so the calibrator never sees the samples it is
    evaluated on.

    The sigmoid is fitted directly instead of wrapping a prefit estimator in
    CalibratedClassifierCV: that stacked a second sigmoid on top of a logistic
    regression that had been fitted on all samples, and relied on the
    `base_estimator` keyword that newer scikit-learn releases no longer accept.

    Args:
        y_true: array-like of binary labels.
        y_pred: array-like of uncalibrated scores, one per label.

    Returns:
        Tuple `(y_pred_calibrated, y_test)`: calibrated probabilities of the
        positive class for the held-out half, and the labels of that half.
    """
    scores = np.asarray(y_pred).reshape(-1, 1)
    labels = np.asarray(y_true).ravel()
    # Split BEFORE fitting; fitting on everything leaks the evaluation half.
    X_train, X_test, y_train, y_test = train_test_split(scores, labels, test_size=0.5, random_state=42)
    # Default LogisticRegression settings, as before (includes L2 regularisation).
    calibrator = LogisticRegression()
    calibrator.fit(X_train, y_train)
    # Column 1 is the probability of the second class in calibrator.classes_.
    y_pred_calibrated = calibrator.predict_proba(X_test)[:, 1]
    return y_pred_calibrated, y_test


In [None]:
# Calibration curve before and after Platt scaling.
plot_calibration_curve(y_true, y_pred, 'Before Platt Scaling')

# platt_scaling returns a tuple (calibrated probabilities, labels) covering
# only its held-out half, so the "after" plots must be drawn against those
# labels rather than against the full y_true.
y_pred_calibrated, y_true_calibrated = platt_scaling(y_true, y_pred)
plot_calibration_curve(y_true_calibrated, y_pred_calibrated, 'After Platt Scaling')

# Distribution of the predicted probabilities before and after Platt scaling.
plot_probabilities(y_true, y_pred, 'Before Platt Scaling')
plot_probabilities(y_true_calibrated, y_pred_calibrated, 'After Platt Scaling')


In [None]:
# Mount the 'ExperimentRun' folder of the "workspaceartifactstore" datastore.
# This rebinds datastore, dataset, mounted_path and mpath: from here on,
# `mpath` points at this mount.
# NOTE(review): any earlier mount held in `mounted_path` is not stopped before
# being replaced here - confirm that is intended.
datastore = Datastore.get(workspace, "workspaceartifactstore")
dataset = Dataset.File.from_files(path=(datastore, 'ExperimentRun'))
mounted_path = dataset.mount()
mounted_path.start()
mpath = mounted_path.mount_point

In [None]:
# Load the test-set targets and predicted probabilities of fold 0 only from
# the experiment-run outputs. This replaces the y_true / y_pred loaded earlier.
RUN_CHECKPOINT_DIR = (
    'dcid.finetune_10_fold_1716159921_f84d2a6f/outputs/retry_001/'
    'finetune_ANTI_PCSK9_censored_7_days_pre_ANTI_PCSK9_ft_5_folds_intraining_calibration/'
    'fold_0/checkpoints'
)

# `with` closes each .npz archive as soon as its array has been read.
with np.load(join(mpath, f'{RUN_CHECKPOINT_DIR}/targets_test_999.npz')) as archive:
    y_true = archive['targets']
with np.load(join(mpath, f'{RUN_CHECKPOINT_DIR}/probas_test_999.npz')) as archive:
    y_pred = archive['probas']

In [None]:
# Calibration curve and probability histogram for the y_true / y_pred
# currently in scope, both titled 'Model with focal loss'.
plot_calibration_curve(y_true, y_pred, 'Model with focal loss')
plot_probabilities(y_true, y_pred, 'Model with focal loss')


In [None]:
# Collect the fine-tuning PIDs with and without a DIABETES outcome.
# NOTE(review): `mpath` is whichever mount was assigned most recently - confirm
# that features/ and outcomes/ are located under that mount.
pids_pretrain = torch.load(join(mpath, "features/med_diag_03/tokenized_fv_sep_pt_1_1M/pids_pretrain.pt"))
pids_finetune = torch.load(join(mpath, "features/med_diag_03/tokenized_fv_sep_pt_1_1M/pids_finetune.pt"))
outcomes = torch.load(join(mpath, 'outcomes/CHRONIC_DISEASES/CHRONIC_DISEASES.pt'))

# Sanity check: is every fine-tuning PID present in the outcomes table?
print(set(pids_finetune) <= set(outcomes['PID']))

# outcomes dict keys: dict_keys(['PID', 'OSTEONECROSIS', 'DIABETES', 'ARRHYTHMIA'])
diabetes_outcomes = {column: outcomes[column] for column in ('PID', 'DIABETES')}
outcomes_df = pd.DataFrame(diabetes_outcomes)
# Keep only the rows that belong to the fine-tuning cohort.
outcomes_df = outcomes_df[outcomes_df['PID'].isin(pids_finetune)]

# A non-missing DIABETES value marks a positive, a missing one a negative.
has_diabetes = outcomes_df['DIABETES'].notna()
pos_pids = outcomes_df.loc[has_diabetes, 'PID']
neg_pids = outcomes_df.loc[~has_diabetes, 'PID']

# Sanity check: positives and negatives together account for every fine-tuning PID.
print(len(pos_pids) + len(neg_pids) == len(pids_finetune))



In [None]:
# Split the fine-tuning PIDs into 10 folds: positives and negatives are
# shuffled and divided separately, then the i-th positive and negative chunks
# are merged, shuffled and saved as fold i.
N_FOLDS = 10
FOLD_DIR = 'DIABETES_10_folds'
# NOTE(review): the cells that read the folds back use
# '../DIABETES_10_folds/10_folds/fold_<i>.pt' - confirm the files written here
# end up at that location.

positive_pids_finetune = list(pos_pids)
negative_pids_finetune = list(neg_pids)

# Fixed seed; the order of the random calls below determines the folds.
np.random.seed(42)
np.random.shuffle(positive_pids_finetune)
np.random.shuffle(negative_pids_finetune)
positive_pids_finetune = np.array_split(positive_pids_finetune, N_FOLDS)
negative_pids_finetune = np.array_split(negative_pids_finetune, N_FOLDS)

# torch.save does not create missing directories.
os.makedirs(FOLD_DIR, exist_ok=True)
for i in range(N_FOLDS):
    fold_pids = positive_pids_finetune[i].tolist() + negative_pids_finetune[i].tolist()
    np.random.shuffle(fold_pids)
    torch.save(fold_pids, f'{FOLD_DIR}/fold_{i}.pt')




In [None]:
# Post-hoc calibration splits built from the 10 base folds (fold_0 .. fold_9).
# Every split takes 4 base folds for training, 1 for validation, 2 for testing
# and 3 for calibration, rotating by two base folds from one split to the next:
#   split    train      val   test   calibrate
#   fold_5   0 1 2 3    4     5 6    7 8 9
#   fold_1   2 3 4 5    6     7 8    9 0 1
#   fold_2   4 5 6 7    8     9 0    1 2 3
#   fold_3   6 7 8 9    0     1 2    3 4 5
#   fold_4   8 9 0 1    2     3 4    5 6 7
# NOTE(review): the first split is stored as 'fold_5' rather than 'fold_0';
# the name is kept unchanged - confirm it is intended.
FOLDS_DIR = '../DIABETES_10_folds/10_folds'
POST_HOC_DIR = '../DIABETES_10_folds/post_hoc_splits'
INTRAINING_DIR = '../DIABETES_10_folds/intraining_splits'


def _load_folds(fold_ids):
    """Load the given base folds and join their PIDs in the given order.

    A single fold is returned exactly as torch.load yields it; two or more
    folds are joined with np.concatenate and therefore come back as an array.
    """
    pids = torch.load(f'{FOLDS_DIR}/fold_{fold_ids[0]}.pt')
    for fold_id in fold_ids[1:]:
        pids = np.concatenate((pids, torch.load(f'{FOLDS_DIR}/fold_{fold_id}.pt')))
    return pids


# All PIDs in base-fold order; the same array is stored in each of the three directories.
pids = _load_folds(list(range(10)))
for target_dir in (FOLDS_DIR, POST_HOC_DIR, INTRAINING_DIR):
    # torch.save does not create missing directories.
    os.makedirs(target_dir, exist_ok=True)
    torch.save(pids, f'{target_dir}/pids.pt')

# split name -> (train, val, test, calibrate) base-fold ids.
# The dict order is the order in which the splits are written.
POST_HOC_SPLITS = {
    'fold_5': ((0, 1, 2, 3), (4,), (5, 6), (7, 8, 9)),
    'fold_1': ((2, 3, 4, 5), (6,), (7, 8), (9, 0, 1)),
    'fold_2': ((4, 5, 6, 7), (8,), (9, 0), (1, 2, 3)),
    'fold_3': ((6, 7, 8, 9), (0,), (1, 2), (3, 4, 5)),
    'fold_4': ((8, 9, 0, 1), (2,), (3, 4), (5, 6, 7)),
}

for split_name, (train_ids, val_ids, test_ids, calibrate_ids) in POST_HOC_SPLITS.items():
    pids_train = _load_folds(train_ids)
    pids_val = _load_folds(val_ids)
    pids_test = _load_folds(test_ids)
    pids_calibrate = _load_folds(calibrate_ids)

    split_dir = f'{POST_HOC_DIR}/{split_name}'
    os.makedirs(split_dir, exist_ok=True)
    torch.save(pids_train, f'{split_dir}/pids_train.pt')
    torch.save(pids_val, f'{split_dir}/pids_val.pt')
    torch.save(pids_test, f'{split_dir}/pids_test.pt')
    torch.save(pids_calibrate, f'{split_dir}/pids_calibrate.pt')



In [None]:
# In-training calibration splits built from the 10 base folds (fold_0 .. fold_9).
# Every split takes 6 base folds for training, 2 for validation and 2 for
# testing, rotating by two base folds from one split to the next:
#   split    train          val   test
#   fold_5   0 1 2 3 4 5    6 7   8 9
#   fold_1   2 3 4 5 6 7    8 9   0 1
#   fold_2   4 5 6 7 8 9    0 1   2 3
#   fold_3   6 7 8 9 0 1    2 3   4 5
#   fold_4   8 9 0 1 2 3    4 5   6 7
# NOTE(review): the first split is stored as 'fold_5' rather than 'fold_0';
# the name is kept unchanged - confirm it is intended.
FOLDS_DIR = '../DIABETES_10_folds/10_folds'
INTRAINING_DIR = '../DIABETES_10_folds/intraining_splits'


def _load_folds(fold_ids):
    """Load the given base folds and join their PIDs in the given order.

    A single fold is returned exactly as torch.load yields it; two or more
    folds are joined with np.concatenate and therefore come back as an array.
    """
    pids = torch.load(f'{FOLDS_DIR}/fold_{fold_ids[0]}.pt')
    for fold_id in fold_ids[1:]:
        pids = np.concatenate((pids, torch.load(f'{FOLDS_DIR}/fold_{fold_id}.pt')))
    return pids


# split name -> (train, val, test) base-fold ids.
# The dict order is the order in which the splits are written.
INTRAINING_SPLITS = {
    'fold_5': ((0, 1, 2, 3, 4, 5), (6, 7), (8, 9)),
    'fold_1': ((2, 3, 4, 5, 6, 7), (8, 9), (0, 1)),
    'fold_2': ((4, 5, 6, 7, 8, 9), (0, 1), (2, 3)),
    'fold_3': ((6, 7, 8, 9, 0, 1), (2, 3), (4, 5)),
    'fold_4': ((8, 9, 0, 1, 2, 3), (4, 5), (6, 7)),
}

for split_name, (train_ids, val_ids, test_ids) in INTRAINING_SPLITS.items():
    pids_train = _load_folds(train_ids)
    pids_val = _load_folds(val_ids)
    pids_test = _load_folds(test_ids)

    split_dir = f'{INTRAINING_DIR}/{split_name}'
    # torch.save does not create missing directories.
    os.makedirs(split_dir, exist_ok=True)
    torch.save(pids_train, f'{split_dir}/pids_train.pt')
    torch.save(pids_val, f'{split_dir}/pids_val.pt')
    torch.save(pids_test, f'{split_dir}/pids_test.pt')
