In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools

### Set the random seed and define the project (USER_PATH) and output (OUT_PATH) directories

In [3]:
# Single seed shared by every random number generator this notebook imports.
SEED = 90210
np.random.seed(SEED)
# `random` and `torch` are imported at the top of the notebook but were never seeded;
# seed them as well so any step that draws from them is repeatable after a kernel restart.
random.seed(SEED)
torch.manual_seed(SEED)

# Project root and the output directory beneath it, published as environment variables
# (OUT_PATH is read back when processed_data_path is built).
os.environ['USER_PATH'] = '/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/'
# Derive OUT_PATH from USER_PATH instead of repeating the absolute path; the resulting
# string is identical to the previously hard-coded value.
os.environ['OUT_PATH'] = osp.join(os.environ['USER_PATH'], 'output_directory')

### Create a helper that returns a function's source code as a string

In [4]:
def function_to_string(fn):
    """Return the source text of ``fn`` as a string.

    The code object of ``fn`` is looked up with ``dill.detect.code`` and then
    passed to ``dill.source.getsource``, whose result is returned unchanged.
    """
    code_obj = detect.code(fn)
    return getsource(code_obj)


In [5]:
# Base directory of the processed 'ParksInspection' data; the predict_* sub-directories
# used by the load/save cells are joined onto it.
# Index os.environ directly: if OUT_PATH is missing this raises a clear KeyError,
# whereas os.environ.get() would return None and make osp.join fail with a TypeError.
processed_data_path = osp.join(os.environ['OUT_PATH'], 'ParksInspection')


In [6]:
# Set the WANDB_SILENT flag for this process.
# NOTE(review): presumably this silences wandb's console logging - confirm against the
# wandb environment-variable documentation.
os.environ.update({'WANDB_SILENT': "true"})


In [7]:
# Switch the working directory to the project root before importing the project-local
# AnalysisFuncs module (presumably so the module is found on the import path - confirm).
# The root is read from USER_PATH, which the seed/paths cell sets to this same directory,
# instead of repeating the absolute path here.
os.chdir(os.environ['USER_PATH'])
from AnalysisFuncs import (
    plotCorr, getPred_fromProb, getMetrics, loadModelData, getCorr, getGroundTruth, getURange,
    getResiduals, plotDistributionProbs, plotCorr_w_Unobs,
    saveFile, loadFile,
    hypertune,
)


In [8]:
# One sub-directory of the processed data per prediction target.
predict_T_dir = osp.join(processed_data_path, 'predict_T')
predict_D_given_T_dir = osp.join(processed_data_path, 'predict_D_given_T')
predict_D_and_T_dir = osp.join(processed_data_path, 'predict_D_and_T')

# --- predict_T: features and T targets for the train / val / train_cross_val / test splits ---
train_X = loadFile(predict_T_dir, '/train_X.npy')
train_y_T = loadFile(predict_T_dir, '/train_y_T.npy')
val_X = loadFile(predict_T_dir, '/val_X.npy')
val_y_T = loadFile(predict_T_dir, '/val_y_T.npy')
train_cross_val_X = loadFile(predict_T_dir, '/train_cross_val_X.npy')
train_cross_val_y_T = loadFile(predict_T_dir, '/train_cross_val_y_T.npy')
test_X = loadFile(predict_T_dir, '/test_X.npy')
test_y_T = loadFile(predict_T_dir, '/test_y_T.npy')

# --- predict_D_given_T: this target has its own feature arrays as well as targets ---
train_X_D_given_T = loadFile(predict_D_given_T_dir, '/train_X_D_given_T.npy')
train_y_D_given_T = loadFile(predict_D_given_T_dir, '/train_y_D_given_T.npy')
val_X_D_given_T = loadFile(predict_D_given_T_dir, '/val_X_D_given_T.npy')
val_y_D_given_T = loadFile(predict_D_given_T_dir, '/val_y_D_given_T.npy')
train_cross_val_X_D_given_T = loadFile(predict_D_given_T_dir, '/train_cross_val_X_D_given_T.npy')
train_cross_val_y_D_given_T = loadFile(predict_D_given_T_dir, '/train_cross_val_y_D_given_T.npy')
test_X_D_given_T = loadFile(predict_D_given_T_dir, '/test_X_D_given_T.npy')
test_y_D_given_T = loadFile(predict_D_given_T_dir, '/test_y_D_given_T.npy')

# --- predict_D_and_T: only target arrays are loaded for this task ---
train_y_D_and_T = loadFile(predict_D_and_T_dir, '/train_y_D_and_T.npy')
val_y_D_and_T = loadFile(predict_D_and_T_dir, '/val_y_D_and_T.npy')
train_cross_val_y_D_and_T = loadFile(predict_D_and_T_dir, '/train_cross_val_y_D_and_T.npy')
test_y_D_and_T = loadFile(predict_D_and_T_dir, '/test_y_D_and_T.npy')


In [10]:
# Instantiate the two base estimators with their library-default settings.
LR_model, LGBM_model = LogisticRegression(), LGBMClassifier()


In [None]:
# Hyper-parameter tuning of the logistic-regression model, once per target.
# Search space handed to hypertune: C from a uniform distribution on [0, 4],
# penalty and solver from the listed options.
params_dict_LR = {
    'C': stats.uniform(loc=0, scale=4),
    'penalty': ['l2', 'l1'],
    'solver': ['liblinear', 'saga'],
}

# Target T, using the shared cross-validation features.
bst_params_T_LR = hypertune(
    LR_model, params_dict_LR,
    train_cross_val_X.astype(float), train_cross_val_y_T,
    cv=5,
)
# Target D given T, using that task's own cross-validation features.
bst_params_D_given_T_LR = hypertune(
    LR_model, params_dict_LR,
    train_cross_val_X_D_given_T.astype(float), train_cross_val_y_D_given_T,
    cv=5,
)
# Target D and T, again using the shared cross-validation features.
bst_params_D_and_T_LR = hypertune(
    LR_model, params_dict_LR,
    train_cross_val_X.astype(float), train_cross_val_y_D_and_T,
    cv=5,
)


In [26]:
# Persist the tuned logistic-regression parameters, one file per target directory.
# NOTE(review): loadFile is called elsewhere in this notebook with file names that start
# with '/', whereas saveFile receives bare names here - confirm both helpers build the
# final path the same way.
saveFile(
    osp.join(processed_data_path, 'predict_T'),
    bst_params_T_LR,
    'bst_params_T_LR.pkl',
)
saveFile(
    osp.join(processed_data_path, 'predict_D_given_T'),
    bst_params_D_given_T_LR,
    'bst_params_D_given_T_LR.pkl',
)
saveFile(
    osp.join(processed_data_path, 'predict_D_and_T'),
    bst_params_D_and_T_LR,
    'bst_params_D_and_T_LR.pkl',
)


In [21]:
# Sanity checks: the cross-validation features and targets must not contain any NaN.
# Feature arrays are cast to float first, matching how they are passed to hypertune.
assert not np.isnan(train_cross_val_X.astype(float)).any()
assert not np.isnan(train_cross_val_y_T).any()
assert not np.isnan(train_cross_val_X_D_given_T.astype(float)).any()
assert not np.isnan(train_cross_val_y_D_given_T).any()
assert not np.isnan(train_cross_val_y_D_and_T).any()

In [24]:
# Hyper-parameter tuning of the LightGBM classifier, once per target.
# Search space handed to hypertune (list entries are discrete choices, scipy
# distributions are sampled).
params_dict_LGBM = dict(
    max_depth=[3, 4, 5, 6, 7],
    feature_fraction=stats.uniform(loc=0.5, scale=0.5),  # uniform on [0.5, 1.0]
    bagging_fraction=stats.uniform(loc=0.7, scale=0.3),  # uniform on [0.7, 1.0]
    # LightGBM only performs bagging when bagging_freq > 0 and its default is 0, so
    # without this entry the sampled bagging_fraction values would have no effect.
    bagging_freq=[1],
    min_child_samples=[10, 20, 30, 50],
)

# Target T, using the shared cross-validation features.
bst_params_T_LGBM = hypertune(
    LGBM_model, params_dict_LGBM,
    train_cross_val_X.astype(float), train_cross_val_y_T,
    cv=5,
)
# Target D given T, using that task's own cross-validation features.
bst_params_D_given_T_LGBM = hypertune(
    LGBM_model, params_dict_LGBM,
    train_cross_val_X_D_given_T.astype(float), train_cross_val_y_D_given_T,
    cv=5,
)
# Target D and T, again using the shared cross-validation features.
bst_params_D_and_T_LGBM = hypertune(
    LGBM_model, params_dict_LGBM,
    train_cross_val_X.astype(float), train_cross_val_y_D_and_T,
    cv=5,
)




cv results:{'mean_fit_time': array([0.64444294, 0.56282129, 0.64804201, 0.37479377, 0.44297709,
       0.37259903, 0.60973988, 0.44543204, 0.67726703, 0.67754774]), 'std_fit_time': array([0.02356516, 0.01728756, 0.01550124, 0.01682354, 0.00787061,
       0.01836181, 0.03355267, 0.01241207, 0.04920866, 0.04635398]), 'mean_score_time': array([0.02025628, 0.01740475, 0.02170081, 0.01221781, 0.01491022,
       0.01204166, 0.01994123, 0.01466799, 0.02204804, 0.02160196]), 'std_score_time': array([0.00018117, 0.0001399 , 0.00033846, 0.00020921, 0.00070359,
       0.000108  , 0.00024463, 0.0002574 , 0.00022573, 0.00031297]), 'param_bagging_fraction': masked_array(data=[0.8646440511781974, 0.863464954899069,
                   0.7892603819633417, 0.8432995351964049,
                   0.9776789914877982, 0.7060655192320977,
                   0.8420824135821131, 0.7354823277606799,
                   0.9834006751148752, 0.7793666836313881],
             mask=[False, False, False, False, False,



cv results:{'mean_fit_time': array([0.46495028, 0.4281496 , 0.50112128, 0.28396244, 0.3410223 ,
       0.28251171, 0.45989137, 0.34096017, 0.5060822 , 0.52924533]), 'std_fit_time': array([0.01114858, 0.01159806, 0.01520752, 0.00564613, 0.00230454,
       0.00378786, 0.00892352, 0.00584682, 0.01463139, 0.01628099]), 'mean_score_time': array([0.01379862, 0.01214352, 0.0149405 , 0.00901251, 0.01042948,
       0.00902534, 0.01378255, 0.01031876, 0.0151814 , 0.01513619]), 'std_score_time': array([2.19799825e-04, 1.65201925e-04, 2.20406442e-04, 1.84055091e-04,
       4.11017588e-05, 8.43620484e-05, 9.12520318e-05, 1.79546255e-04,
       1.36503380e-04, 3.33461924e-04]), 'param_bagging_fraction': masked_array(data=[0.8646440511781974, 0.863464954899069,
                   0.7892603819633417, 0.8432995351964049,
                   0.9776789914877982, 0.7060655192320977,
                   0.8420824135821131, 0.7354823277606799,
                   0.9834006751148752, 0.7793666836313881],
      



cv results:{'mean_fit_time': array([0.62428513, 0.56146998, 0.63621836, 0.37207789, 0.44146252,
       0.36917734, 0.6048048 , 0.44594116, 0.6774961 , 0.67265301]), 'std_fit_time': array([0.03758175, 0.01764236, 0.02291805, 0.00890473, 0.00724858,
       0.00691304, 0.02964446, 0.0093057 , 0.02951265, 0.02779237]), 'mean_score_time': array([0.01882381, 0.01637778, 0.02019868, 0.01218438, 0.0140759 ,
       0.01235943, 0.01843233, 0.01418514, 0.02020016, 0.0200357 ]), 'std_score_time': array([7.20874578e-04, 2.29156181e-04, 1.71179001e-04, 1.55679774e-04,
       1.55389049e-04, 1.27995600e-04, 1.75109732e-04, 2.16056142e-04,
       1.50751191e-04, 5.19385777e-05]), 'param_bagging_fraction': masked_array(data=[0.8646440511781974, 0.863464954899069,
                   0.7892603819633417, 0.8432995351964049,
                   0.9776789914877982, 0.7060655192320977,
                   0.8420824135821131, 0.7354823277606799,
                   0.9834006751148752, 0.7793666836313881],
      