For the cutoff model and LGBM_w_feat only

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools
from copy import deepcopy

### Set the seeds, change to the project directory, and set the output directory

In [None]:
# Seed every random number generator imported by this notebook (Python's
# `random`, NumPy and PyTorch), not only NumPy, so reruns are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Project root on the shared filesystem; later cells read it back via os.environ.
os.environ['USER_PATH'] = '/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/'
# Derive the output directory from USER_PATH instead of repeating the literal,
# so the two paths cannot drift apart. The resulting value is unchanged.
os.environ['OUT_PATH'] = osp.join(os.environ['USER_PATH'], 'output_directory')

In [None]:
# Switch the working directory to the project root before importing the
# project-local AnalysisFuncs module (presumably located there — the import
# must stay after the chdir).
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/')
# Training / data-generation helpers used by the model cells below.
from AnalysisFuncs import trainEvalModel, trainHardPseudo, secondStageDataGen
# I/O, probability-clipping and plotting helpers.
from AnalysisFuncs import saveFile, loadFile, getClippedProbs, plotCalibrationPlots

In [None]:
# IPython magics: (re)load the autoreload extension and use mode 2, which
# reloads all modules before executing code, so edits to AnalysisFuncs are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

### Create function to pickle functions

In [None]:
# <USER_PATH>/HIRID_Repo/logs/benchmark_exp: the directory that later cells
# load artifacts from and save results under.
_user_path = os.environ.get('USER_PATH')
processed_data_path = osp.join(_user_path, 'HIRID_Repo', 'logs', 'benchmark_exp')

In [None]:
# Both files live in the same 'Lactate_Measured/1111' folder, so build the
# directory once and reuse it.
_lactate_measured_dir = osp.join(
    processed_data_path,
    'LGBM_w_feat_v2_cutoff_T',
    '_depth_7_subsample-data_1.0_subsample-feat_1.0',
    'Lactate_Measured',
    '1111',
)
test_X = loadFile(_lactate_measured_dir, 'test_rep.pkl')
test_ids = loadFile(_lactate_measured_dir, 'test_patient_ids.pkl')

In [None]:
# Probabilities stored under 'probs_T' and 'probs_D|T' (loaded as the
# calibrated p(T) and p(D|T=1), per the variable names).
# NOTE(review): the file name keeps its leading '/' exactly as before; this
# only works if loadFile concatenates rather than using os.path.join — confirm.
_probs_filename = '/probs.npy'
_p_T_dir = osp.join(processed_data_path, 'probs_T')
_p_D_T1_dir = osp.join(processed_data_path, 'probs_D|T')
calibrated_p_T = loadFile(_p_T_dir, _probs_filename)
calibrated_p_D_T1 = loadFile(_p_D_T1_dir, _probs_filename)

In [None]:
# All four loaded objects must have the same length.
_n_rows = len(calibrated_p_T)
assert all(len(_obj) == _n_rows for _obj in (test_X, test_ids, calibrated_p_D_T1))

In [None]:
# Check that these are indeed the calibrated probabilities: plot the loaded
# p(T) against the stored test labels for 'Lactate_Measured'.
# Two separate figures/axes are handed to the project plotting helper.
fig1,ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
test_y_T = loadFile(osp.join(processed_data_path,'LGBM_w_feat_v2_cutoff_T', 
                '_depth_7_subsample-data_1.0_subsample-feat_1.0', 'Lactate_Measured', '1111'), 'test_label.pkl')
plotCalibrationPlots(calibrated_p_T, test_y_T, None, 'T', ax1, ax2, n_bins=10)
fig1.show()
fig2.show()
# NOTE(review): plt.close() without arguments closes only the current figure
# (presumably fig2), so fig1 stays open — confirm this is intended.
plt.close()
# Number of test labels loaded for T.
print(len(test_y_T))

In [None]:
# Same calibration check for D|T, using the stored test labels for
# 'Lactate_Above_Threshold'.
fig1,ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
test_y_D_given_T = loadFile(osp.join(processed_data_path,'LGBM_w_feat_v2_cutoff_T', 
            '_depth_7_subsample-data_1.0_subsample-feat_1.0', 'Lactate_Above_Threshold', '1111'), 'test_label.pkl')
# Only rows with test_y_T == 1 are compared against the D|T labels; test_y_T
# comes from the preceding calibration cell.
# presumably the D|T labels exist only for those rows — TODO confirm lengths match
plotCalibrationPlots(calibrated_p_D_T1[test_y_T==1], test_y_D_given_T, None, 'D|T', ax1, ax2, n_bins=10)
fig1.show()
fig2.show()
# NOTE(review): plt.close() without arguments closes only the current figure.
plt.close()
# Number of test labels loaded for D|T.
print(len(test_y_D_given_T))

In [None]:
# Build the second-stage train/test splits from copies of the calibrated
# probabilities; the third return value is discarded.
train_second_stage, test_second_stage, _ = secondStageDataGen(
    test_ids,
    calibrated_p_T.copy(),
    calibrated_p_D_T1.copy(),
    train_size=0.5,
)

_splits = (train_second_stage, test_second_stage)
# D must be NaN wherever T == 0 ...
for _split in _splits:
    assert np.isnan(_split['D'][_split['T'] == 0]).all()
# ... and never NaN wherever T == 1.
for _split in _splits:
    assert not np.isnan(_split['D'][_split['T'] == 1]).any()
# The three label arrays of each split have the same length.
for _split in _splits:
    assert len(_split['T']) == len(_split['D']) == len(_split['D_and_T'])

In [None]:
# Are *all* D labels NaN among the train rows with T == 1?
np.all(np.isnan(train_second_stage['D'][train_second_stage['T'] == 1]))

In [None]:
# Display the train split returned by secondStageDataGen.
train_second_stage

In [None]:
# Attach feature matrices and the observed D labels to both splits:
#   X_T    - rows of test_X selected by the split's 'idxs'
#   X_D_T1 - the subset of those rows with T == 1
#   D_T1   - D for the T == 1 rows, cast to int
test_X_copy = test_X.copy()
for _split in (train_second_stage, test_second_stage):
    _tested = _split['T'] == 1
    _split['X_T'] = test_X_copy[_split['idxs'], :]
    _split['X_D_T1'] = _split['X_T'][_tested, :]
    _split['D_T1'] = _split['D'][_tested].astype(int)

In [None]:
# Display the train-split feature rows with T == 1.
train_second_stage['X_D_T1']

In [None]:
def _positive_fraction(values):
    """Return the fraction of entries in *values* that equal 1."""
    return len(values[values == 1]) / len(values)


# Print the share of positive labels for T, D|T=1 and D,T in each split.
for _header, _split in (("for test train set", train_second_stage),
                        ("for test test set", test_second_stage)):
    print(_header)
    for _key in ('T', 'D_T1', 'D_and_T'):
        print(f"{_key} % :{_positive_fraction(_split[_key])}")

In [None]:
# Load the feature names stored in the 'Lactate_Measured/1111' folder.
_feature_names_dir = osp.join(
    processed_data_path,
    'LGBM_w_feat_v2_cutoff_T',
    '_depth_7_subsample-data_1.0_subsample-feat_1.0',
    'Lactate_Measured',
    '1111',
)
feature_names = loadFile(_feature_names_dir, 'feature_names.pkl')

In [None]:
# Compare the number of feature names with the shape of test_X.
len(feature_names), test_X.shape

In [None]:
# Display the loaded feature names.
feature_names

In [None]:
# Hyper-parameters and untrained LightGBM classifiers for the second stage.
# TODO: load the best params from pickle if they exist; nothing is loaded
# here yet, so the settings below are always used.
random_state = 0

# Model for T.
bst_params_T_LGBM = {'random_state': random_state}
LGBM_T = LGBMClassifier(**bst_params_T_LGBM)

# Models for D|T (plain and inverse-propensity-weighted). random_state is set
# here too, matching the T and D,T models, so every classifier is seeded.
bst_params_D_given_T_LGBM = {'max_depth': 7, 'random_state': random_state}
LGBM_D_given_T = LGBMClassifier(**bst_params_D_given_T_LGBM)
LGBM_D_given_T_ipw = LGBMClassifier(**bst_params_D_given_T_LGBM)

# Model for the joint label D,T.
bst_params_D_and_T_LGBM = {'random_state': random_state}
LGBM_D_and_T = LGBMClassifier(**bst_params_D_and_T_LGBM)

# Calibration method passed to the training helpers in the cells below.
calibrate_method = 'sigmoid'

In [None]:
# Hand the T features/labels of both splits to trainEvalModel; the returned
# pair is bound as the model and its test probabilities.
model_descr = "predicting T using LGBM for second stage"
LGBM_T, test_proba_LGBM_T = trainEvalModel(
    LGBM_T,
    train_second_stage['X_T'],
    train_second_stage['T'],
    test_second_stage['X_T'],
    test_second_stage['T'],
    model_descr,
    calibrate=True,
    calibrate_method=calibrate_method,
)

In [None]:
# D|T model: only the T == 1 rows (X_D_T1 / D_T1) of each split are passed.
model_descr = "predicting D|T using LGBM for second stage"
LGBM_D_given_T, test_proba_LGBM_D_given_T = trainEvalModel(
    LGBM_D_given_T,
    train_second_stage['X_D_T1'],
    train_second_stage['D_T1'],
    test_second_stage['X_D_T1'],
    test_second_stage['D_T1'],
    model_descr,
    calibrate=True,
    calibrate_method=calibrate_method,
)

In [None]:
# Joint D,T model: all rows (X_T) with the D_and_T labels of each split.
model_descr = "predicting D,T using LGBM for second stage"
LGBM_D_and_T, test_proba_LGBM_D_and_T = trainEvalModel(
    LGBM_D_and_T,
    train_second_stage['X_T'],
    train_second_stage['D_and_T'],
    test_second_stage['X_T'],
    test_second_stage['D_and_T'],
    model_descr,
    calibrate=True,
    calibrate_method=calibrate_method,
)

In [None]:
# Display the T model returned by trainEvalModel.
LGBM_T

In [None]:
# Description reused by the IPW training cell further below.
model_descr="predicting D|T_IPW using LGBM for second stage"
# getClippedProbs is given the trained T model plus copies of the train
# features and T labels.
# presumably returns clipped p(T=1) for the T == 1 rows together with those
# rows' features — TODO confirm against AnalysisFuncs
train_X_D_given_T_probs_LGBM, train_cross_val_X_ipw = getClippedProbs(LGBM_T, train_second_stage['X_T'].copy(),
                                              train_second_stage['T'].copy())
# The returned features must match the T == 1 training rows, so that the
# probabilities line up with X_D_T1 / D_T1 when used as weights.
assert np.allclose(train_second_stage['X_D_T1'], train_cross_val_X_ipw)


In [None]:
# Display the shape of the train-split feature matrix used for T.
train_second_stage['X_T'].shape

In [None]:
# Train the D|T model with sample weights equal to the inverse of the
# probabilities returned by getClippedProbs. model_descr is the value set in
# the getClippedProbs cell above.
LGBM_D_given_T_ipw, test_proba_LGBM_D_given_T_ipw = trainEvalModel(LGBM_D_given_T_ipw, 
        train_second_stage['X_D_T1'].copy(), 
        train_second_stage['D_T1'].copy(), test_second_stage['X_D_T1'], test_second_stage['D_T1'],
        model_descr, calibrate=True, sample_weight=1/train_X_D_given_T_probs_LGBM.copy(),
        calibrate_method=calibrate_method)

#### Hard PseudoLabels

In [None]:
# Parameters for the hard-pseudo-label model: start from the D|T settings.
# A new dict is built (instead of aliasing the D|T dict) so later changes to
# one cannot leak into the other, and random_state — previously assigned but
# never used — is now actually applied.
random_state = 0
bst_params_D_pseudo_LGBM = {**bst_params_D_given_T_LGBM, 'random_state': random_state}
LGBM_D_pseudo = LGBMClassifier(**bst_params_D_pseudo_LGBM)

In [None]:
# Train the hard-pseudo-label model. Deep copies of the D|T model are passed
# in two positional slots so the original LGBM_D_given_T is not modified.
# The result rebinds LGBM_D_pseudo, replacing the classifier built in the
# setup cell above.
# NOTE(review): the meaning of the positional None arguments depends on the
# trainHardPseudo signature in AnalysisFuncs — confirm there.
LGBM_D_pseudo, test_probs_D_pseudo_LGBM = trainHardPseudo(deepcopy(LGBM_D_given_T), None, None, None,
        train_second_stage['X_T'], None, deepcopy(LGBM_D_given_T), test_second_stage['X_D_T1'].copy(),
                                                          test_second_stage['D_T1'].copy(), 
        model_descr="LGBM with hard pseudo labels for second stage", calibrate=True, 
        calibrate_method=calibrate_method,
        train_y_D=train_second_stage['D'].copy())


In [None]:
# Show the missing (NaN) D labels of the train split. NaN never compares
# equal to anything, including itself, so `== np.nan` always yields an
# all-False mask; np.isnan is required to select them.
train_second_stage['D'][np.isnan(train_second_stage['D'])]

In [None]:
# True when every D label among the train rows with T == 0 is NaN.
bool(np.isnan(train_second_stage['D'][train_second_stage['T'] == 0]).all())

In [None]:
# Display the root directory used for loading and saving artifacts.
processed_data_path

In [None]:
# Save each second-stage model into its own sub-folder of
# <processed_data_path>/LGBM_w_feat_v2_cutoff_T/secondStage.
_second_stage_dir = osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage')
_models_to_save = (
    ('predict_T', LGBM_T, 'LGBM_T.pkl'),
    ('predict_D_given_T', LGBM_D_given_T, 'LGBM_D_given_T.pkl'),
    ('predict_D_and_T', LGBM_D_and_T, 'LGBM_D_and_T.pkl'),
    ('predict_D_given_T_ipw', LGBM_D_given_T_ipw, 'LGBM_D_given_T_ipw.pkl'),
    ('predict_D_pseudo', LGBM_D_pseudo, 'LGBM_D_pseudo.pkl'),
)
for _subdir, _model, _filename in _models_to_save:
    saveFile(osp.join(_second_stage_dir, _subdir), _model, _filename)

In [None]:
# Display the root directory under which the models above were saved.
processed_data_path

In [None]:
# Save both second-stage splits (including the features attached earlier)
# in the same secondStage folder as the models.
_second_stage_dir = osp.join(processed_data_path, 'LGBM_w_feat_v2_cutoff_T', 'secondStage')
for _split, _filename in ((train_second_stage, 'train_second_stage.pkl'),
                          (test_second_stage, 'test_second_stage.pkl')):
    saveFile(_second_stage_dir, _split, _filename)