In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools
import copy

### Set the seeds, change to the project directory, and set the output directory

In [None]:
# Seed every random number generator imported by this notebook (NumPy, the
# stdlib `random` module and torch), not only NumPy, so that re-running from a
# fresh kernel draws the same random numbers from all of them.
SEED=90210
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
# Project root, published through the environment; later cells read it back.
os.environ['USER_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/'
# Derive the output directory from USER_PATH instead of repeating the literal.
# USER_PATH ends with a separator, so the joined value is identical to the
# previously hard-coded '<USER_PATH>output_directory' string.
os.environ['OUT_PATH']=osp.join(os.environ['USER_PATH'], 'output_directory')

In [None]:
# Switch the working directory to the project root before importing the
# project-local AnalysisFuncs helpers (presumably so the module resolves from
# there -- confirm against how the kernel's sys.path is set up).
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/')
from AnalysisFuncs import (
    getCorr,
    getGroundTruth,
    getMetrics,
    getPred_fromProb,
    getResiduals,
    getURange,
    loadFile,
    plotCalibrationPlots,
    plotCorr,
    plotCorr_w_Unobs,
    plotDistributionProbs,
    saveFile,
    trainHardPseudo,
)

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (apart from
# those excluded with %aimport) before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

### Create function to pickle functions

In [None]:
# Root directory of the benchmark-experiment logs, located under USER_PATH.
# os.environ is indexed directly so that an unset USER_PATH raises a KeyError
# naming the variable, instead of osp.join failing with a TypeError on None.
processed_data_path=osp.join(os.environ['USER_PATH'], 'HIRID_Repo', 'logs', 'benchmark_exp')

In [None]:
# Directories used below, all rooted at processed_data_path.
exp_dir = osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T')
second_stage_dir = osp.join(exp_dir, 'secondStage')
measured_run_dir = osp.join(exp_dir, '_depth_7_subsample-data_1.0_subsample-feat_1.0',
                            'Lactate_Measured', '1111')

# Second-stage train/test objects and the test representation, read through
# the project's loadFile helper.
train_second_stage = loadFile(second_stage_dir, 'train_second_stage.pkl')
test_second_stage = loadFile(second_stage_dir, 'test_second_stage.pkl')
test_X = loadFile(measured_run_dir, 'test_rep.pkl')

# Probability arrays for T and for D|T.
# NOTE(review): the file names keep their leading '/' exactly as before; how
# loadFile combines directory and name is not visible here -- confirm that the
# leading slash is intended.
calibrated_p_T = loadFile(osp.join(processed_data_path, 'probs_T'), '/probs.npy')
calibrated_p_D_T1 = loadFile(osp.join(processed_data_path, 'probs_D|T'), '/probs.npy')

In [None]:
# Check that calibrated_p_T really holds calibrated probabilities by drawing
# calibration plots against the T labels of the test set.
test_y_T = loadFile(osp.join(processed_data_path,'LGBM_w_feat_v2_cutoff_T', 
                '_depth_7_subsample-data_1.0_subsample-feat_1.0', 'Lactate_Measured', '1111'), 'test_label.pkl')
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
plotCalibrationPlots(calibrated_p_T, test_y_T, None, 'T', ax1, ax2, n_bins=10)
fig1.show()
fig2.show()
# plt.close() without an argument closes only the current figure (fig2) and
# leaves fig1 registered with pyplot, so both figures are closed explicitly.
plt.close(fig1)
plt.close(fig2)

In [None]:
# Same calibration check for the D|T probabilities. calibrated_p_D_T1 is
# restricted to the rows where test_y_T == 1 (test_y_T comes from the previous
# cell) before being compared with the D|T labels.
test_y_D_given_T = loadFile(osp.join(processed_data_path,'LGBM_w_feat_v2_cutoff_T', 
            '_depth_7_subsample-data_1.0_subsample-feat_1.0', 'Lactate_Above_Threshold', '1111'), 'test_label.pkl')
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
plotCalibrationPlots(calibrated_p_D_T1[test_y_T==1], test_y_D_given_T, None, 'D|T', ax1, ax2, n_bins=10)
fig1.show()
fig2.show()
# plt.close() without an argument closes only the current figure (fig2) and
# leaves fig1 registered with pyplot, so both figures are closed explicitly.
plt.close(fig1)
plt.close(fig2)

In [None]:
# Patient ids stored alongside the Lactate_Measured test data.
test_ids = loadFile(osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T',
                             '_depth_7_subsample-data_1.0_subsample-feat_1.0',
                             'Lactate_Measured', '1111'),
                    'test_patient_ids.pkl')

# The second-stage train and test splits must not share any patient id.
train_patient_ids = test_ids[train_second_stage['idxs']]
test_patient_ids = test_ids[test_second_stage['idxs']]
shared_patient_ids = np.intersect1d(train_patient_ids, test_patient_ids)
assert len(shared_patient_ids)==0

# Report the relative size of the two splits.
tr, t = len(train_second_stage['idxs']), len(test_second_stage['idxs'])
print(f"train %:{tr*100/(tr+t):.2f}, test %:{t*100/(tr+t):.2f}")

In [None]:
# Load the second-stage models; each task has its own 'predict_*' sub-folder
# and pickle file under the secondStage directory.
models_root = osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage')

LGBM_T = loadFile(osp.join(models_root, 'predict_T'), 'LGBM_T.pkl')
LGBM_D_given_T = loadFile(osp.join(models_root, 'predict_D_given_T'), 'LGBM_D_given_T.pkl')
LGBM_D_and_T = loadFile(osp.join(models_root, 'predict_D_and_T'), 'LGBM_D_and_T.pkl')
LGBM_D_given_T_ipw = loadFile(osp.join(models_root, 'predict_D_given_T_ipw'), 'LGBM_D_given_T_ipw.pkl')
LGBM_D_pseudo = loadFile(osp.join(models_root, 'predict_D_pseudo'), 'LGBM_D_pseudo.pkl')

In [None]:
models=['LGBM']
tasks=['T', 'D|T', 'D|T_ipw', 'D_and_T', 'D_pseudo']
# Classifiers in task-major order: one entry per (task, model) pair, with the
# model varying fastest within each task.
clf_list=[LGBM_T, LGBM_D_given_T, LGBM_D_given_T_ipw,
          LGBM_D_and_T, LGBM_D_pseudo]
# Fail early if clf_list drifts out of sync with tasks x models; otherwise the
# pairing below would silently drop or mis-assign classifiers.
assert len(clf_list)==len(tasks)*len(models), \
    f"expected {len(tasks)*len(models)} classifiers, got {len(clf_list)}"
# clf_dict[task][model] -> classifier. product(tasks, models) iterates in the
# same task-major order as a nested `for task / for model` loop.
clf_dict={task_name: {} for task_name in tasks}
for (task_name, model_name), clf in zip(product(tasks, models), clf_list):
    clf_dict[task_name][model_name]=clf

# Accumulators handed to getCorr (those it returns are rebound below).
dict_df_labels={}
dict_df_probs={}
dict_df_ids={}
dict_models={}
probs_path=osp.join(processed_data_path, 'secondStage')
alpha=0.1
figsize1=(5,50)
figsize2=(10,10)
# Empty metrics table with the columns passed on to getCorr.
df_pp=pd.DataFrame({'AUC' : [],'PR':[],'BalancedAcc':[],'modelName':[],'rowName':[]})
dict_df_labels, dict_df_probs, dict_models, df_pp = getCorr(models, tasks, 
        test_second_stage['X_T'], clf_dict, None, 
        None, dict_df_labels, dict_df_probs, 
        dict_models, df_pp, probs_path, calibrate=False, 
        figsize1=figsize1, figsize2=figsize2, alpha=alpha, test_second_stage=test_second_stage)

In [None]:
# Correlation-matrix plot over dict_df_probs, using the "spearman" method
# string; the second-stage folder is passed as the save location.
title, corr_method = 'Medical correlation matrix', "spearman"
figsize, top_adjust, title_en = (10, 10), 0.9, False
corr_savefig_path = osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage')
plotCorr(
    models, dict_df_probs, title, corr_method,
    figsize=figsize,
    top_adjust=top_adjust,
    title_en=title_en,
    savefig_path=corr_savefig_path,
)

In [None]:
# Display the shape of the loaded test_X object.
test_X.shape

In [None]:
# Correlation plot for the setting named in the title, u(x) = alpha * p_{Y_{T=1}},
# with alpha swept over np.arange(0, 1.1, 0.1). Both probability arrays are
# restricted to the second-stage test indices before being passed on.
title = (r'Correlation with $p_{Y}$'
         '\n'
         r'when $u(x)=\alpha p_{Y_{T=1}}$')
models = ['LGBM']
tasks = ['T', 'D|T', 'D|T_ipw', 'D_and_T', 'D_pseudo']
figsize, top_adjust = (7,5), 0.9
corr_method = stats.spearmanr
title_en = True
loc, legend_ncol = 'lower right', 2

second_stage_idxs = test_second_stage['idxs']
plotCorr_w_Unobs(
    dict_df_probs, models, title, tasks,
    alpha=np.arange(0,1.1,0.1),
    corr_method=corr_method,
    figsize=figsize,
    top_adjust=top_adjust,
    title_en=title_en,
    loc=loc,
    calibrated_p_T=calibrated_p_T[second_stage_idxs],
    calibrated_p_D_T1=calibrated_p_D_T1[second_stage_idxs],
    legend_ncol=legend_ncol,
    savefig_path=osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage'),
)

In [None]:
# Correlation plot for the setting named in the title (u(x) = alpha constant).
# No alpha or top_adjust is passed here, so plotCorr_w_Unobs uses its own
# defaults for both. The probability arrays are restricted to the second-stage
# test indices before being passed on.
title = (r'Correlation with $p_{Y}$'
         '\n'
         r'when $u(x)=\alpha$ is constant')
models = ['LGBM']
corr_method = stats.spearmanr
tasks = ['T', 'D|T', 'D|T_ipw', 'D_and_T', 'D_pseudo']
figsize = (7,5)
title_en = True
loc, legend_ncol = 'lower right', 2

second_stage_idxs = test_second_stage['idxs']
plotCorr_w_Unobs(
    dict_df_probs, models, title, tasks,
    corr_method=corr_method,
    figsize=figsize,
    title_en=title_en,
    loc=loc,
    calibrated_p_T=calibrated_p_T[second_stage_idxs],
    calibrated_p_D_T1=calibrated_p_D_T1[second_stage_idxs],
    legend_ncol=legend_ncol,
    savefig_path=osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage'),
)

In [None]:
# Mean / std / min of the values in dict_df_probs for every (model, task) pair.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
stats_columns=['model', 'task', 'mean', 'std', 'min']
stats_rows=[]
for m in models:
    for t in tasks:
        probs=dict_df_probs[m][t]
        probs_mean, probs_std = probs.mean(), probs.std()
        stats_rows.append({'model' : m,'task':t,'mean':probs_mean,
                'std':probs_std, 'min':probs.min()})
        print(f" mean and std for model {m} and task {t}: {probs_mean:.3f}, {probs_std:.3f}")
# Passing `columns` keeps the column order fixed, also when there are no rows.
df_probs_stats=pd.DataFrame(stats_rows, columns=stats_columns)

In [None]:
# Display the per-model / per-task summary table built in the previous cell.
df_probs_stats

In [None]:
# Plot the distribution of the values in dict_df_probs for each listed task.
title, figsize = 'Distribution of estimated probs', (10,7)
models = ['LGBM']
tasks = ['T', 'D|T', 'D|T_ipw', 'D_and_T', 'D_pseudo']
plotDistributionProbs(
    dict_df_probs, models, title, tasks,
    figsize=figsize,
)

In [None]:
# For every model, hand its 'D_and_T', 'D|T' and 'T' entries (in that order)
# to getResiduals.
for m in models:
    print(f"for model :{m}")
    model_probs = dict_df_probs[m]
    getResiduals(
        model_probs['D_and_T'],
        model_probs['D|T'],
        model_probs['T'],
    )