In [None]:
import os
# Cap each numerical threading backend at one thread. These are set ahead of
# the numpy/pandas/sklearn imports that follow, presumably so the backends
# read them at import time. Shell equivalent: `export <NAME>=1`.
for _thread_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_var] = "1"

import csv
from IPython.display import display
import sys
import time
import pandas as pd
import numpy as np
import tqdm
import shap

import matplotlib.pyplot as plt
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, PredefinedSplit, cross_validate, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from sklearn.multioutput import MultiOutputClassifier
import imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb


In [None]:
# Execute ml_functions.py and pull its names into this kernel. The helpers used
# further down (make_path, make_feature_df, drop_diff, score_xg_multiclass,
# get_scores) are not defined in this notebook, so they presumably come from
# this script -- verify against ml_functions.py.
%run ml_functions.py

In [None]:
# One accumulator list per feature table and split. The loading loop appends
# one DataFrame per fold to each of these.
demo_train = []
demo_test = []
cond_train = []
cond_test = []
util_train = []
util_test = []
lab_cts_train = []
lab_cts_test = []
lab_train = []
lab_test = []
drug_train = []
drug_test = []
proc_train = []
proc_test = []

In [None]:
# Read the train/test CSV of every feature table for folds 1..5 and append the
# trimmed frames to the matching accumulator list. File locations come from
# make_path(table, split, fold), which is defined outside this notebook.

# The demo table drops its own set of bookkeeping columns (pasc_flag is not in
# that set); every other table drops index_date and pasc_flag.
_demo_drop = ['index_visit_occurrence_id', 'birth_date', 'index_month', 'index_year', 'index_date', 'observation_type']
_common_drop = ['index_date', 'pasc_flag']

# (table key for make_path, train accumulator, test accumulator, columns to drop)
_tables = [
    ("demo", demo_train, demo_test, _demo_drop),
    ("cond", cond_train, cond_test, _common_drop),
    ("util", util_train, util_test, _common_drop),
    ("lab_cts", lab_cts_train, lab_cts_test, _common_drop),
    ("lab", lab_train, lab_test, _common_drop),
    ("drug", drug_train, drug_test, _common_drop),
    ("proc", proc_train, proc_test, _common_drop),
]

for i in range(1, 6):
    for _key, _train_acc, _test_acc, _drop_cols in _tables:
        for _split, _acc in (("train", _train_acc), ("test", _test_acc)):
            _frame = pd.read_csv(make_path(_key, _split, i), index_col=[0])
            _acc.append(_frame.drop(columns=_drop_cols))

In [None]:
# Container for the per-fold data; each entry appended later is a 4-item list
# that the training loop unpacks as (X_train, y_train, X_test, y_test).
folds = list()

In [None]:
# Build the feature matrix / label pair for each of the five folds.
#
# `folds` is reset here so that re-running this cell rebuilds the list from
# scratch instead of appending a second copy of every fold (the training loop
# iterates over everything in `folds`, so duplicates would be retrained).
folds = []
for i in range(0, 5):
    # make_feature_df is defined outside this notebook; only elements [0] and
    # [1] of its return value are used (unpacked later as X and y).
    train_feature_df = make_feature_df(demo_train[i], cond_train[i], drug_train[i], util_train[i], lab_train[i], proc_train[i], lab_cts_train[i])
    test_feature_df = make_feature_df(demo_test[i], cond_test[i], drug_test[i], util_test[i], lab_test[i], proc_test[i], lab_cts_test[i])
    folds.append([train_feature_df[0], train_feature_df[1], test_feature_df[0], test_feature_df[1]])

In [None]:
# Train one XGBoost multiclass model per fold and score it on that fold's test
# split at two thresholds (0.5 -> results_list, 0.2 -> results_list_lower).
results_list = []
results_list_lower = []
for X_train_temp, y_train, X_test_temp, y_test in folds:
    # drop_diff is defined outside this notebook; presumably it keeps only the
    # columns shared by both frames -- confirm. The scaler below is fitted on
    # the train columns and applied by name to the test frame, so both frames
    # must expose the same columns.
    X_train, X_test = drop_diff(X_train_temp, X_test_temp)

    # Collapse the first two label columns into one class label by summing.
    y_train_multiclass = (y_train[:, 0] + y_train[:, 1])
    y_test_multiclass = (y_test[:, 0] + y_test[:, 1])

    cols = X_train.columns.values
    dim = X_train.shape[1]

    ct_scale = ColumnTransformer([('scale', StandardScaler(), cols)], remainder="passthrough")
    X_scaled_train = ct_scale.fit_transform(X_train)
    # Fix: reuse the statistics learned on the training split. The original
    # called fit_transform here, refitting the scaler on the test data, so the
    # test features were scaled differently from what the model was trained on.
    X_scaled_test = ct_scale.transform(X_test)

    xgb_estimator = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, colsample_bytree=0.7, objective='multi:softprob', num_class=3, n_jobs=16)
    xgb_estimator.fit(X_scaled_train, y_train_multiclass)

    results = score_xg_multiclass(xgb_estimator, X_scaled_test, y_test_multiclass, thresh=0.5)
    results_lower = score_xg_multiclass(xgb_estimator, X_scaled_test, y_test_multiclass, thresh=0.2)
    print(results[0])
    results_list.append(results)
    # Fix: the lower-threshold scores were computed but never stored, leaving
    # results_list_lower empty after the loop.
    results_list_lower.append(results_lower)

In [None]:
# Persist the cross-validation output: element [0] of each of the five fold
# results goes into one summary CSV; elements [1] and [2] are written per fold
# as y_test_<n>.csv and y_prob_<n>.csv (n = 1..5).
cv_summary_rows = [results_list[k][0] for k in range(5)]
pd.DataFrame(cv_summary_rows).to_csv("../results/cv_results.csv")

for i in range(5):
    fold_no = str(i + 1)
    pd.DataFrame(results_list[i][1]).to_csv("../specs/y_test_" + fold_no + ".csv")
    pd.DataFrame(results_list[i][2]).to_csv("../specs/y_prob_" + fold_no + ".csv")

In [None]:
# Score the first fold's stored probabilities against its stored labels at a
# 0.2 threshold. get_scores is defined outside this notebook.
fold0_truth = results_list[0][1]
fold0_prob = results_list[0][2]
get_scores(
    fold0_prob['y_prob_pasc_any'],
    fold0_prob['y_prob_misc'],
    fold0_prob['y_prob_non_misc'],
    fold0_truth['y_test_pasc_any'],
    fold0_truth['y_test_misc'],
    fold0_truth['y_test_non_misc'],
    0.2,
)

In [None]:
# Re-score at threshold 0.2 using xgb_estimator, X_scaled_test and
# y_test_multiclass as left behind by the final iteration of the training
# loop, i.e. this covers the last fold only and repeats the call made in-loop.
results_lower=score_xg_multiclass(xgb_estimator, X_scaled_test, y_test_multiclass, thresh=0.2)

In [None]:
# NOTE(review): none of the names passed here (y_prob_*, y_test_*, thresh) are
# assigned anywhere in the visible notebook cells. Unless ml_functions.py
# exports them via %run, this cell raises NameError on a fresh kernel --
# confirm the intended inputs (compare the explicit fold-0 get_scores call
# made earlier in this notebook).
get_scores(y_prob_pasc_any, y_prob_misc, y_prob_non_misc, y_test_pasc_any, y_test_misc, y_test_non_misc, thresh)