In [None]:
import numpy as np
import pandas as pd
import os
import datetime
import pyarrow.parquet as pq
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, plot_roc_curve, roc_curve, f1_score, average_precision_score
from sklearn.model_selection import GridSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin
import shap

# Load the pre-computed cohort feature tables, one pickle per cohort file.
# The files are read in the order listed and bound to Dataset_A ... Dataset_E.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
Dataset_A, Dataset_B, Dataset_C, Dataset_D, Dataset_E = (
    pd.read_pickle(cohort_path)
    for cohort_path in (
        "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/Cerebro_Cohort_Unsupervised_Features_All_Clean.pkl",
        "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/Renal_Cohort_Unsupervised_Features_All_Clean.pkl",
        "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/Heart_Cohort_Unsupervised_Features_All_Clean.pkl",
        "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/CR_and_Heart_Cohort_Unsupervised_Features_All_Clean.pkl",
        "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/All3_Cohort_Unsupervised_Features_All_Clean.pkl",
    )
)


In [None]:
# Read the ML dataset, put its columns in sorted order, then display only the
# rows whose 'train_test' flag equals 'test'.
Test = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/ML_dataset_180.pkl')
Test = Test.reindex(columns=sorted(Test.columns))
Test[Test['train_test'] == 'test']

In [None]:
# Creating Dataset_E for the ML pipeline: add a 'train_test' flag column.
#Dataset_E = Dataset_E.reset_index('medical_record_number')

# Fixed seed so the 50/50 assignment is identical after Restart & Run All.
TRAIN_TEST_SEED = 42

# Sample 50% of the rows (without replacement); these become the 'test' rows.
test_index = Dataset_E.sample(
    frac=0.5, replace=False, axis=0, random_state=TRAIN_TEST_SEED
).index

# Set the flag in a single column assignment instead of DataFrame.update(),
# which rewrites every column and may change their dtypes along the way.
# NOTE(review): assumes the index labels are unique (one row per label);
# duplicated labels would all be flagged together -- confirm.
Dataset_E['train_test'] = np.where(Dataset_E.index.isin(test_index), 'test', 'train')

# Kept for backward compatibility with cells that inspect the sampled half.
fifty_percent_sample = Dataset_E.loc[Dataset_E['train_test'] == 'test']

# Make sure the label column holds ints
Dataset_E['Complication'] = Dataset_E['Complication'].astype(int)

#Dataset_E.loc[(Dataset_E['train_test'] == 'test') & (Dataset_E['Complication'] == 1)]

#Dataset_E.to_pickle('All3Cohorts_Unsupervised_ML_pipeline.pkl')


In [None]:
# Choose Data Set
#
# Dataset_E is the cohort used here (swap in another Dataset_* to change cases
# and controls). Rows are kept only when they carry at least 80 non-NaN values;
# sparser rows are dropped.
df = Dataset_E.dropna(axis=0, thresh=80)

In [None]:
# Set Labels: rows with Complication == 1 are cases, Complication == 0 are controls
cases = df.loc[df["Complication"] == 1]
control = df.loc[df["Complication"] == 0]

# Sample data
# Upper bound on the number of controls and a fixed seed so that the
# down-sampling is reproducible across kernel restarts.
N_CONTROLS = 10000
CONTROL_SAMPLE_SEED = 42

#cases = cases.sample(300)
# Cap at the number of available controls: DataFrame.sample raises a ValueError
# when asked for more rows than exist (without replacement).
control = control.sample(n=min(N_CONTROLS, len(control)), random_state=CONTROL_SAMPLE_SEED)
df = pd.concat([control, cases])
#df

In [None]:
# imputation for other models than lgbm

def impute_df_mean(df):
    """Impute missing values in place, column group by column group.

    Column groups are identified by a keyword contained in the column name:

    * "Diagnosis", "Procedure", "Drug": NaN is replaced with 0.
    * "VitalSign", "LabValue": NaN is replaced with the mean of that column.

    Columns matching none of the keywords are left untouched (the previous
    implementation mean-filled *every* numeric column by calling
    ``df.fillna(df.mean())`` on the whole frame, which also fails on frames
    holding string columns in recent pandas versions).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """

    def _columns_containing(keyword):
        # Plain substring match on the stringified column label
        return [column for column in df.columns if keyword in str(column)]

    # Absence of a diagnosis / procedure / drug record is encoded as 0
    for keyword in ("Diagnosis", "Procedure", "Drug"):
        selected = _columns_containing(keyword)
        if selected:
            df[selected] = df[selected].fillna(0)

    # Measurements are imputed with the mean of their own column only
    for keyword in ("VitalSign", "LabValue"):
        selected = _columns_containing(keyword)
        if selected:
            measurements = df[selected]
            # numeric_only skips any non-numeric column instead of raising
            df[selected] = measurements.fillna(measurements.mean(numeric_only=True))

    return df

In [None]:
##### Parameter Settings #####
To_train = df

# Model Selection
model = 'lgbm'

# Impute (the imputation helper is meant for models other than lgbm)
#To_train = impute_df_mean(To_train)

if (model == 'lgbm'):
    classifier = lgb.LGBMClassifier()
    # Each setting appears exactly once, under LightGBM's primary parameter
    # name. The previous dict also carried the aliases 'subsample' (1),
    # 'colsample_bytree' (0.05), 'reg_alpha' (0.5) and 'reg_lambda' (0.5);
    # when a primary name and its alias are both given LightGBM keeps the
    # primary name and warns that the alias is ignored, so the effective
    # values below are unchanged.
    param = {'objective': 'binary', # for binary classification
        'boost_from_average': False,
        'is_unbalance': True,
        'boosting': 'gbdt', # traditional gradient boosting decision tree
        'learning_rate': 0.0001,
        'num_leaves': 250,
        'device': 'cpu', # you can use GPU to achieve faster learning
        'max_depth': 45, # <0 means no limit
        'max_bin': 512, # Small number of bins may reduce training accuracy but can deal with over-fitting
        'lambda_l1': 2, # L1 regularization
        'lambda_l2': 0, # L2 regularization
        # NOTE(review): 200 samples for bin construction is far below the
        # library default (200000) -- confirm this is intentional.
        'subsample_for_bin': 200, # number of samples for constructing bins
        'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
        'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
        'min_child_samples': 5, # minimum number of data needed in a leaf
        'feature_fraction': 0.5, # fraction of columns sampled per tree
        'metric' : 'auc',
        'bagging_fraction':0.8, # fraction of rows sampled when bagging
        'bagging_freq':10 # perform bagging every 10 iterations
        }

else:
    # Fail here rather than later with a NameError on `classifier` / `param`
    raise ValueError('Not supported model!')

 

In [None]:
##### Functions #####

def trainModel(classifier, train_features, train_targets, test_features,
               threshold=0.5, num_boost_round=800):
    """Train the selected model and return it with thresholded predictions.

    Relies on the notebook-level names ``model``, ``param``, ``feature_list``
    and ``lgb_test`` being defined before the call.

    Parameters
    ----------
    classifier : object
        Accepted for interface compatibility; not used by the lgbm branch.
    train_features, train_targets : array-like
        Training matrix and its binary labels.
    test_features : array-like
        Held-out matrix to predict on.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1, others 0.
    num_boost_round : int, default 800
        Number of boosting iterations passed to ``lgb.train``.

    Returns
    -------
    tuple
        ``(crf, test_pred, train_pred)`` where ``crf`` is the trained booster
        and the predictions are float arrays holding 0.0 / 1.0.

    Raises
    ------
    ValueError
        If ``model`` is not a supported model name (previously this path
        printed a message and then failed with a NameError on ``crf``).
    """
    if model != 'lgbm':
        raise ValueError("Not supported model!")

    train_data = lgb.Dataset(train_features, label=train_targets, feature_name=feature_list)
    # Monitor the metric on the very data being trained on (train_data) plus
    # the held-out set, so the training curve reflects the features the model
    # actually sees rather than a separately built copy.
    crf = lgb.train(param, train_data, valid_sets=[train_data, lgb_test],
                    num_boost_round=num_boost_round)

    # Vectorised thresholding; the float dtype matches the previous in-place
    # overwrite of the score arrays with 0 / 1.
    test_pred = (crf.predict(test_features) >= threshold).astype(float)
    train_pred = (crf.predict(train_features) >= threshold).astype(float)

    return crf, test_pred, train_pred

def evaluateModel(crf, train_targets, train_pred, test_targets, test_pred, test_features,
                  train_features=None):
    """Print ROC AUC, confusion matrix and threshold-based metrics.

    Parameters
    ----------
    crf : fitted model or None
        Used to obtain continuous scores for the ranking metrics (ROC AUC,
        average precision) via ``predict_proba`` when available, else
        ``predict``.
    train_targets, test_targets : array-like
        True binary labels (0 = control, 1 = case).
    train_pred, test_pred : array-like
        Hard 0/1 predictions; used for the confusion matrix and F1 score, and
        as a fallback for the ranking metrics when no scores can be computed.
    test_features : array-like or None
        Held-out features, scored with ``crf`` for the test ROC AUC / APS.
    train_features : array-like or None, default None
        Optional training features; when given, the train ROC AUC is computed
        from model scores too, otherwise from ``train_pred``.

    Notes
    -----
    ``confusion_matrix`` returns rows = true class, columns = predicted class,
    so with labels [0, 1] the layout is ``[[TN, FP], [FN, TP]]``. The earlier
    version indexed it as if the positive class came first, which swapped
    sensitivity and specificity and reported NPV as precision.
    """

    def _scores(features, fallback):
        # Continuous scores for ranking metrics; fall back to given predictions
        if crf is None or features is None:
            return fallback
        if hasattr(crf, "predict_proba"):
            return crf.predict_proba(features)[:, 1]
        return crf.predict(features)

    def _ratio(numerator, denominator):
        # Avoid a division-by-zero warning when a class is never seen/predicted
        return numerator / denominator if denominator else float("nan")

    train_scores = _scores(train_features, train_pred)
    test_scores = _scores(test_features, test_pred)

    roc_train = roc_auc_score(train_targets, train_scores)
    roc_test = roc_auc_score(test_targets, test_scores)

    # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent
    cm = confusion_matrix(test_targets, test_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print(" ROC of train:", roc_train, "\n", "ROC of test:", roc_test, "\n", "Confusion matrix:", "\n", cm)

    # Sensitivity/Recall = TP / (TP + FN)
    sensitivity = _ratio(tp, tp + fn)
    print('Sensitivity: ', sensitivity )

    # Specificity = TN / (TN + FP)
    specificity = _ratio(tn, tn + fp)
    print('Specificity: ', specificity)

    # Precision = TP / (TP + FP)
    precision = _ratio(tp, tp + fp)
    print('Precision: ', precision)

    # F1 Score (threshold-based, so it uses the hard predictions)
    print("F1 score: " , f1_score(test_targets, test_pred))

    # APS (ranking metric, so it uses the continuous scores when available)
    print("Average Precision Score: ", average_precision_score(test_targets, test_scores))
    

# LightGBM does not accept special JSON characters in feature names, so each
# character of a column label that is not alphanumeric becomes an underscore.
if model == 'lgbm':
    To_train.columns = [
        "".join(map(lambda ch: ch if ch.isalnum() else "_", str(label)))
        for label in To_train.columns
    ]
    

In [None]:
##### Split Data #####

# Columns that must not reach the model: the label itself and the string
# 'train_test' bookkeeping flag (a string column cannot be scaled or used as a
# numeric feature). errors="ignore" keeps the cell working when the flag column
# was never added.
NON_FEATURE_COLUMNS = ["Complication", "train_test"]

target = np.array(To_train["Complication"])
train = To_train.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
feature_list = list(train.columns)
features = np.array(train)

train_features, test_features, train_targets, test_targets = train_test_split(features, target, test_size = 0.25, random_state = 42)

# Feature Normalisation to a range between 0 and 1
# (the scaler is fitted on the training split only, then applied to the test split)
min_max_scaler = preprocessing.MinMaxScaler()
train_features = min_max_scaler.fit_transform(train_features)
test_features = min_max_scaler.transform(test_features)

# Build the LightGBM datasets AFTER scaling so that the validation data lives
# in the same feature space as the data the model is trained on.
lgb_train = lgb.Dataset(train_features,train_targets)
lgb_test = lgb.Dataset(test_features,test_targets)


In [None]:
# Fit the model on the training split and collect thresholded predictions
# for both splits.
crf, test_pred, train_pred = trainModel(
    classifier=classifier,
    train_features=train_features,
    train_targets=train_targets,
    test_features=test_features,
)

In [None]:
# Report the evaluation metrics for the fitted model.
evaluateModel(
    crf=crf,
    train_targets=train_targets,
    train_pred=train_pred,
    test_targets=test_targets,
    test_pred=test_pred,
    test_features=test_features,
)