In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools
from copy import deepcopy

### set the seeds and change to current directory + set the output directory

In [2]:
SEED=90210
np.random.seed(SEED)
os.environ['USER_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/'
os.environ['OUT_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/output_directory'

In [3]:
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/')
from AnalysisFuncs import trainEvalModel, trainHardPseudo, plotCalibrationPlots, getClippedProbs
from AnalysisFuncs import saveFile, loadFile

In [4]:
%reload_ext autoreload
%autoreload 2

### Create function to pickle functions

In [5]:
def function_to_string(fn):
    return getsource(detect.code(fn)) 

In [6]:
processed_data_path=osp.join(os.environ.get('OUT_PATH'), 'ParksInspection','New_Dataset')

In [7]:
train_X = loadFile(osp.join(processed_data_path,'predict_T'), '/train_X.npy')
train_y_T = loadFile(osp.join(processed_data_path,'predict_T'), '/train_y_T.npy')
val_X = loadFile(osp.join(processed_data_path,'predict_T'), '/val_X.npy')
val_y_T = loadFile(osp.join(processed_data_path,'predict_T'), '/val_y_T.npy')
train_cross_val_X = loadFile(osp.join(processed_data_path,'predict_T'), '/train_cross_val_X.npy')
train_cross_val_y_T = loadFile(osp.join(processed_data_path,'predict_T'), '/train_cross_val_y_T.npy')
test_X = loadFile(osp.join(processed_data_path,'predict_T'), '/test_X.npy')
test_y_T = loadFile(osp.join(processed_data_path,'predict_T'), '/test_y_T.npy')

train_X_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/train_X_D_given_T.npy')
train_y_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/train_y_D_given_T.npy')
val_X_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/val_X_D_given_T.npy')
val_y_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/val_y_D_given_T.npy')
train_cross_val_X_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), 
                                       '/train_cross_val_X_D_given_T.npy')
train_cross_val_y_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), 
                                       '/train_cross_val_y_D_given_T.npy')
test_X_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/test_X_D_given_T.npy')
test_y_D_given_T = loadFile(osp.join(processed_data_path,'predict_D_given_T'), '/test_y_D_given_T.npy')



train_y_D_and_T = loadFile(osp.join(processed_data_path,'predict_D_and_T'), '/train_y_D_and_T.npy')
val_y_D_and_T = loadFile(osp.join(processed_data_path,'predict_D_and_T'), '/val_y_D_and_T.npy')
train_cross_val_y_D_and_T = loadFile(osp.join(processed_data_path,'predict_D_and_T'), 
                                       '/train_cross_val_y_D_and_T.npy')
test_y_D_and_T = loadFile(osp.join(processed_data_path,'predict_D_and_T'), '/test_y_D_and_T.npy')

In [8]:
# load df_cleaned and test_idxs
df_cleaned = loadFile(processed_data_path, 'df_cleaned.csv')
test_idxs = loadFile(processed_data_path, '/test_idxs.npy')
train_idxs = loadFile(processed_data_path, '/train_idxs.npy')
val_idxs = loadFile(processed_data_path, '/val_idxs.npy')
assert len(np.intersect1d(train_idxs, val_idxs))==0
assert len(np.intersect1d(test_idxs, val_idxs))==0
assert len(np.intersect1d(train_idxs, test_idxs))==0
tr=len(train_idxs)
v=len(val_idxs)
t=len(test_idxs)
print(f"train %:{tr*100/(tr+v+t):.2f}, val %:{v*100/(tr+v+t):.2f}, test %:{t*100/(tr+v+t):.2f}")

train %:64.01, val %:15.99, test %:19.99


  if await self.run_code(code, result, async_=asy):


In [9]:
# load the best params as pickle if they exist , if not use the default settings
random_state=0
bst_params_T_LR={'random_state': random_state, 
              'solver': 'liblinear', 
              'penalty':  'l1',
                }
bst_params_T_LGBM={'random_state': random_state, 
                  }
LR_T = LogisticRegression(**bst_params_T_LR)
LGBM_T = LGBMClassifier(**bst_params_T_LGBM)

bst_params_D_given_T_LR = {'random_state': random_state, 
                      'solver': 'liblinear', 
                      'penalty':  'l1',
                       }
# bst_params_D_given_T_LGBM={'random_state': random_state, 
#                   }
bst_params_D_given_T_LGBM = {'bagging_fraction': 0.9834006751148752, 
             'feature_fraction': 0.7609241608750359, 'max_depth': 7, 'min_child_samples': 50}

LR_D_given_T = LogisticRegression(**bst_params_D_given_T_LR)
LR_D_given_T_ipw = LogisticRegression(**bst_params_D_given_T_LR)
LGBM_D_given_T = LGBMClassifier(**bst_params_D_given_T_LGBM)
LGBM_D_given_T_ipw = LGBMClassifier(**bst_params_D_given_T_LGBM)
bst_params_D_and_T_LR = {'random_state': random_state, 
                      'solver': 'liblinear', 
                      'penalty':  'l1',
                     }
bst_params_D_and_T_LGBM={'random_state': random_state, 
                  }
LR_D_and_T = LogisticRegression(**bst_params_D_and_T_LR)
LGBM_D_and_T = LGBMClassifier(**bst_params_D_and_T_LGBM)

In [10]:
calibrate_method='sigmoid'
#use calibration method in the path to save the model
processed_data_path = osp.join(processed_data_path, calibrate_method)
processed_data_path

'/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/output_directory/ParksInspection/New_Dataset/sigmoid'

In [11]:
train_cross_val_X.shape, test_X.shape, train_cross_val_X.shape[0]+test_X.shape[0]

((167823, 337), (41941, 337), 209764)

In [12]:
model_descr="predicting T using Logistic Regression"
LR_T, test_proba_LR_T = trainEvalModel(LR_T, train_cross_val_X.copy(), train_cross_val_y_T.copy(), 
                                       test_X.copy(), test_y_T.copy(), 
                                       model_descr, calibrate=True, calibrate_method=calibrate_method)
model_descr="predicting T using LGBM"
LGBM_T, test_proba_LGBM_T = trainEvalModel(LGBM_T, train_cross_val_X.copy(), train_cross_val_y_T.copy(),
                                           test_X.copy(), test_y_T.copy(), 
                                       model_descr, calibrate=True, calibrate_method=calibrate_method)

AUC score :predicting T using Logistic Regression: 0.951
AUPR score :predicting T using Logistic Regression: 0.934
AUC score :predicting T using LGBM: 0.963
AUPR score :predicting T using LGBM: 0.951


In [13]:
#save models
saveFile(osp.join(processed_data_path,'predict_T'), LR_T, 'LR_T.pkl')
saveFile(osp.join(processed_data_path,'predict_T'), LGBM_T, 'LGBM_T.pkl')

In [14]:
model_descr="predicting D|T using Logistic Regression"
LR_D_given_T, test_proba_LR_D_given_T = trainEvalModel(LR_D_given_T, train_cross_val_X_D_given_T.copy(), 
                    train_cross_val_y_D_given_T.copy(), test_X_D_given_T.copy(), test_y_D_given_T.copy(),
                                        model_descr, calibrate=True, calibrate_method=calibrate_method)

AUC score :predicting D|T using Logistic Regression: 0.677
AUPR score :predicting D|T using Logistic Regression: 0.541


In [15]:
model_descr="predicting D|T using LGBM"
LGBM_D_given_T, test_proba_LGBM_D_given_T = trainEvalModel(LGBM_D_given_T, train_cross_val_X_D_given_T.copy(), 
                    train_cross_val_y_D_given_T.copy(), test_X_D_given_T.copy(), test_y_D_given_T.copy(),
                                            model_descr, calibrate=True, calibrate_method=calibrate_method)

AUC score :predicting D|T using LGBM: 0.687
AUPR score :predicting D|T using LGBM: 0.559


In [16]:
saveFile(osp.join(processed_data_path,'predict_D_given_T'), LR_D_given_T, 'LR_D_given_T.pkl')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), LGBM_D_given_T, 'LGBM_D_given_T.pkl')

In [17]:
model_descr="predicting D,T using Logistic Regression"
LR_D_and_T, test_proba_LR_D_and_T = trainEvalModel(LR_D_and_T, train_cross_val_X.copy(), 
                    train_cross_val_y_D_and_T.copy(), test_X.copy(), test_y_D_and_T.copy(),
                                        model_descr, calibrate=True, calibrate_method=calibrate_method)
model_descr="predicting D,T using LGBM"
LGBM_D_and_T, test_proba_LGBM_D_and_T = trainEvalModel(LGBM_D_and_T, train_cross_val_X.copy(), 
                    train_cross_val_y_D_and_T.copy(), test_X.copy(), test_y_D_and_T.copy(),
                                        model_descr, calibrate=True, calibrate_method=calibrate_method)

AUC score :predicting D,T using Logistic Regression: 0.843
AUPR score :predicting D,T using Logistic Regression: 0.485
AUC score :predicting D,T using LGBM: 0.850
AUPR score :predicting D,T using LGBM: 0.504


In [18]:
saveFile(osp.join(processed_data_path,'predict_D_and_T'), LR_D_and_T, 'LR_D_and_T.pkl')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), LGBM_D_and_T, 'LGBM_D_and_T.pkl')

In [19]:
print(LGBM_T)

CalibratedClassifierCV(base_estimator=LGBMClassifier(boosting_type='gbdt',
                                                     class_weight=None,
                                                     colsample_bytree=1.0,
                                                     importance_type='split',
                                                     learning_rate=0.1,
                                                     max_depth=-1,
                                                     min_child_samples=20,
                                                     min_child_weight=0.001,
                                                     min_split_gain=0.0,
                                                     n_estimators=100,
                                                     n_jobs=-1, num_leaves=31,
                                                     objective=None,
                                                     random_state=0,
                                                

In [None]:
model_descr="predicting D|T_IPW using Logistic Regression"
# pass dataframe and create D_given_T again
train_X_D_given_T_probs_LR, train_cross_val_X_ipw = getClippedProbs(LR_T,
                                            train_cross_val_X.copy(), train_cross_val_y_T.copy())
np.testing.assert_array_equal(train_cross_val_X_ipw, train_cross_val_X_D_given_T)
LR_D_given_T_ipw, test_proba_LR_D_given_T_ipw = trainEvalModel(LR_D_given_T_ipw, train_cross_val_X_ipw, 
                    train_cross_val_y_D_given_T, test_X_D_given_T.copy(), test_y_D_given_T.copy(),
                    model_descr, calibrate=True, calibrate_method=calibrate_method,
                    sample_weight=1/train_X_D_given_T_probs_LR.copy())
model_descr="predicting D|T_IPW using LGBM"
train_X_D_given_T_probs_LGBM, train_cross_val_X_ipw = getClippedProbs(LGBM_T, 
                                        train_cross_val_X.copy(), train_cross_val_y_T.copy())
LGBM_D_given_T_ipw, test_proba_LGBM_D_given_T_ipw = trainEvalModel(LGBM_D_given_T_ipw, train_cross_val_X_ipw,
                    train_cross_val_y_D_given_T,
                    test_X_D_given_T.copy(), test_y_D_given_T.copy(),
                    model_descr, calibrate=True, calibrate_method=calibrate_method,
                    sample_weight=1/train_X_D_given_T_probs_LGBM.copy())

In [21]:
saveFile(osp.join(processed_data_path,'predict_D_given_T_ipw'), LR_D_given_T_ipw, 'LR_D_given_T_ipw.pkl')
saveFile(osp.join(processed_data_path,'predict_D_given_T_ipw'), LGBM_D_given_T_ipw, 'LGBM_D_given_T_ipw.pkl')

#### Hard PseudoLabels

In [22]:
random_state=0
bst_params_D_pseudo_LR = bst_params_D_given_T_LR
bst_params_D_pseudo_LGBM = bst_params_D_given_T_LGBM
LR_D_pseudo = LogisticRegression(**bst_params_D_pseudo_LR)
LGBM_D_pseudo = LGBMClassifier(**bst_params_D_pseudo_LGBM)

In [23]:
#predict the outcomes
#assert that the pseudo labels are only binary in nature and not probs
train_cross_val_idxs=np.concatenate((train_idxs, val_idxs))
LR_D_pseudo, test_probs_D_pseudo_LR = trainHardPseudo(deepcopy(LR_D_given_T), df_cleaned.copy(), train_idxs.copy(), 
                        val_idxs.copy(), train_X.copy(), val_X.copy(), deepcopy(LR_D_given_T), 
                        test_X_D_given_T.copy(), test_y_D_given_T.copy(), 
        model_descr="Logistic Regression with hard pseudo labels", calibrate=True, calibrate_method=calibrate_method)

AUC score :Logistic Regression with hard pseudo labels: 0.672
AUPR score :Logistic Regression with hard pseudo labels: 0.537


In [24]:
LGBM_D_pseudo, test_probs_D_pseudo_LGBM = trainHardPseudo(deepcopy(LGBM_D_given_T), df_cleaned.copy(), 
                        train_idxs.copy(), val_idxs.copy(),
                        train_X.copy(), val_X.copy().copy(), deepcopy(LGBM_D_given_T), test_X_D_given_T.copy(), 
                        test_y_D_given_T.copy(), 
                        model_descr="LGBM with hard pseudo labels", calibrate=True, calibrate_method=calibrate_method)

Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
AUC score :LGBM with hard pseudo labels: 0.679
AUPR score :LGBM with hard pseudo labels: 0.546


In [25]:
#save models
saveFile(osp.join(processed_data_path,'predict_D_pseudo'), LR_D_pseudo, 'LR_D_pseudo.pkl')
saveFile(osp.join(processed_data_path,'predict_D_pseudo'), LGBM_D_pseudo, 'LGBM_D_pseudo.pkl')

In [26]:
train_cross_val_X.shape

(167823, 337)

In [27]:
train_cross_val_y_T.shape

(167823,)