In [None]:
import numpy as np
import pandas as pd
from time import gmtime, strftime
import gc
import seaborn as sns

import sklearn
from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, plot_roc_curve, roc_curve, f1_score, average_precision_score
from sklearn.model_selection import GridSearchCV

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin
import shap

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the three pre-built case/control cohorts.
# NOTE(review): this is an absolute, user-specific location; adjust COHORT_DIR
# when running the notebook on another machine.
COHORT_DIR = "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation"

Dataset_A = pd.read_pickle(f"{COHORT_DIR}/Cerebro_Cohort_Unsupervised_Features_All_Clean.pkl")
Dataset_B = pd.read_pickle(f"{COHORT_DIR}/Renal_Cohort_Unsupervised_Features_All_Clean.pkl")
Dataset_C = pd.read_pickle(f"{COHORT_DIR}/Heart_Cohort_Unsupervised_Features_All_Clean.pkl")

# Choose Data Set
df = Dataset_C

# Optional row filter: keep only rows with at least `thresh` non-NaN values.
#df = df.dropna(axis =0, thresh=30)

# Split by label (1 = complication, 0 = control)
cases = df.loc[df["Complication"] == 1]
control = df.loc[df["Complication"] == 0]

# Draw a balanced subsample. The seed makes the draw repeatable, so that a
# "Restart & Run All" reproduces the same cohort (the later train/test split
# is seeded as well).
N_PER_CLASS = 100
SAMPLE_SEED = 42
cases = cases.sample(N_PER_CLASS, random_state=SAMPLE_SEED)
control = control.sample(N_PER_CLASS, random_state=SAMPLE_SEED)
df = pd.concat([control, cases])


In [None]:
# Imputation for models other than LightGBM

def impute_df_mean(df):
    """Fill missing values in ``df`` in place and return the same frame.

    Two passes are applied, in this order:

    1. Columns whose name contains "Diagnosis", "Procedure" or "Drug" get
       their NaN replaced by 0.
    2. Every numeric column that still contains NaN gets them replaced by
       that column's mean. This covers the "VitalSign" and "LabValue"
       columns, and also any other numeric column, which matches what the
       previous whole-frame ``df.fillna(df.mean())`` did.

    Non-numeric columns are left untouched by the mean pass; the mean is
    computed with ``numeric_only=True`` so that such columns do not make
    ``DataFrame.mean`` raise on recent pandas versions. A column that is
    entirely NaN has a NaN mean and therefore stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    zero_fill_tags = ("Diagnosis", "Procedure", "Drug")
    zero_fill_cols = [
        col for col in df.columns
        if any(tag in str(col) for tag in zero_fill_tags)
    ]
    if zero_fill_cols:
        df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

    # Means are taken after the zero fill, exactly as before.
    column_means = df.mean(numeric_only=True)
    mean_fill_cols = list(column_means.index)
    if mean_fill_cols:
        df[mean_fill_cols] = df[mean_fill_cols].fillna(column_means)

    return df

In [None]:
# Impute the selected cohort; the result is the frame used for training.
# impute_df_mean works in place, so `To_train` and `df` refer to the same object.
# NOTE(review): imputation runs before the train/test split, so column means
# are computed over rows that later end up in the test set.
To_train = impute_df_mean(df)

# Model family switch read by trainModel
model = 'lgbm'



In [None]:
##### Split Data #####
# Separate the label column from the predictors and convert both to NumPy.
train = To_train.drop(columns="Complication")
target = To_train["Complication"].to_numpy(copy=True)
feature_list = train.columns.tolist()
features = train.to_numpy(copy=True)

# 75 % / 25 % hold-out split with a fixed seed.
(train_features, test_features,
 train_targets, test_targets) = train_test_split(
    features, target, test_size=0.25, random_state=42
)


In [None]:
# Rescale every feature to the range [0, 1]. The scaler learns its min/max
# from the training split only and is then applied to both splits.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(train_features)
train_features, test_features = (
    min_max_scaler.transform(split) for split in (train_features, test_features)
)

In [None]:
# Base LightGBM classifier with library-default hyper-parameters; it is the
# estimator handed to the grid search further down.
lgb_classifier = lgb.LGBMClassifier()

# The commented-out lines below are an earlier parameter / grid draft that is
# not used anywhere in the visible notebook.
#param = {'objective': 'binary', 'metric': ['auc', 'binary_logloss']}

#grid = [{'num_leaves': [31, 127],
        #'reg_alpha': [0.1, 0.5],
        #'min_data_in_leaf': [30, 50, 100, 300, 400],
        #'lambda_l1': [0, 1, 1.5],
        #'lambda_l2': [0, 1]
    #}]



In [None]:
##### Functions #####

def trainModel(classifier, train_features, train_targets, test_features):
    """Fit a classifier on the training split and predict both splits.

    Parameters
    ----------
    classifier : dict or estimator
        Either a dict of hyper-parameters (for example ``best_params_`` from a
        grid search), from which a fresh ``lgb.LGBMClassifier`` is built, or
        an estimator object exposing ``fit`` and ``predict``, which is used
        as given. In both cases the model is (re)fitted here.
    train_features, train_targets
        Training matrix and the matching labels.
    test_features
        Hold-out matrix to predict.

    Returns
    -------
    tuple
        ``(crf, test_pred, train_pred)``: the fitted model and its
        predictions on the test and training features.

    Notes
    -----
    The function used to ignore ``classifier`` and silently predict with the
    module-level ``lgb_classifier``, which only worked if another cell had
    fitted that object beforehand. The ``lgb.Dataset`` it built was never
    consumed and has been dropped.
    """
    if isinstance(classifier, dict):
        crf = lgb.LGBMClassifier(**classifier)
    else:
        crf = classifier
    crf.fit(train_features, train_targets)

    test_pred = crf.predict(test_features)
    train_pred = crf.predict(train_features)

    # `model` is the notebook-level switch. For 'lgbm' the outputs are cut at
    # 0.5, which turns probability-like outputs into 0/1 and leaves 0/1 labels
    # unchanged.
    if model == 'lgbm':
        train_pred = np.where(train_pred >= .5, 1, 0)
        test_pred = np.where(test_pred >= .5, 1, 0)

    return crf, test_pred, train_pred

In [None]:
def evaluateModel(crf, train_targets, train_pred, test_targets, test_pred, test_features):
    """Print evaluation metrics for a binary (0/1) classifier.

    Parameters
    ----------
    crf
        Fitted model. Not used by the computations; kept so existing calls
        keep working.
    train_targets, train_pred
        True labels and predictions on the training split (used for the
        training ROC AUC).
    test_targets, test_pred
        True labels and predictions on the test split (used for every other
        metric).
    test_features
        Not used by the computations; kept so existing calls keep working.

    Notes
    -----
    scikit-learn's confusion matrix has true labels on the rows and predicted
    labels on the columns, so with labels ``[0, 1]`` it reads
    ``[[TN, FP], [FN, TP]]``. The previous index arithmetic therefore printed
    specificity as "Sensitivity", sensitivity as "Specificity" and the
    negative predictive value as "Precision".

    ROC AUC and average precision are computed from whatever is passed as
    predictions; with hard 0/1 labels they describe a single operating point.
    """
    roc_train = roc_auc_score(train_targets, train_pred)
    roc_test = roc_auc_score(test_targets, test_pred)

    # Fixing the label order guarantees a 2x2 matrix even if one class is
    # absent from the test predictions.
    cm = confusion_matrix(test_targets, test_pred, labels=[0, 1])

    print(" ROC of train:", roc_train, "\n", "ROC of test:", roc_test, "\n", "Confusion matrix:", "\n", cm)

    tn, fp, fn, tp = cm.ravel()

    def _ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as NaN.
        return numerator / denominator if denominator else float("nan")

    # Sensitivity/Recall = TP / (TP + FN)
    sensitivity = _ratio(tp, tp + fn)
    print('Sensitivity: ', sensitivity )

    # Specificity = TN / (TN + FP)
    specificity = _ratio(tn, tn + fp)
    print('Specificity: ', specificity)

    # Precision = TP / (TP + FP)
    precision = _ratio(tp, tp + fp)
    print('Precision: ', precision)

    # F1 Score
    print("F1 score: " , f1_score(test_targets, test_pred))

    # APS
    print("Average Precision Score: ", average_precision_score(test_targets, test_pred))


In [None]:
def algorithm_pipeline(train_features, test_features, train_targets, test_targets, 
                       lgb_classifier, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    """Run an exhaustive grid search and predict the test features.

    Parameters
    ----------
    train_features, train_targets
        Data the grid search is fitted on.
    test_features
        Data to predict with the fitted search object.
    test_targets
        Accepted but not used inside this function.
    lgb_classifier
        Estimator to tune.
    param_grid
        Grid of hyper-parameter values forwarded to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring_fit : str, default 'neg_mean_squared_error'
        Scoring name forwarded to ``GridSearchCV``.
        NOTE(review): the default is a regression-style score; the call in
        this notebook overrides it with 'accuracy'.
    do_probabilities : bool, default False
        If true, return ``predict_proba`` output instead of ``predict``.

    Returns
    -------
    tuple
        ``(fitted search object, predictions for test_features)``.
    """
    search_settings = dict(
        estimator=lgb_classifier,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scoring_fit,
        verbose=2,
    )
    search = GridSearchCV(**search_settings).fit(train_features, train_targets)

    predict = search.predict_proba if do_probabilities else search.predict
    return search, predict(test_features)

In [None]:
# Hyper-parameter grid for the LightGBM classifier.
# NOTE(review): this is 3*3*5*3*3*3*2*4*2 = 19,440 combinations, i.e. 97,200
# fits with cv=5; consider shrinking it or caching the result.
param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8, 1],
    'max_depth': [-1,7,15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9, 1],
    'subsample_freq': [0, 20]
}

# `lgb_classifier` is rebound to the fitted search object below. If this cell
# is executed a second time, unwrap the underlying estimator first so that a
# GridSearchCV is never used as the estimator of another GridSearchCV.
base_estimator = (
    lgb_classifier.estimator
    if isinstance(lgb_classifier, GridSearchCV)
    else lgb_classifier
)

lgb_classifier, pred = algorithm_pipeline(train_features, test_features, train_targets, test_targets, base_estimator,
                                 param_grid, cv=5, scoring_fit='accuracy')


# Best hyper-parameter combination found by the search
classifier = lgb_classifier.best_params_

In [None]:
# Display the hyper-parameter combination selected by the grid search
classifier

In [None]:
# Obtain the model together with its predictions for the test and training split
crf, test_pred, train_pred = trainModel(
    classifier=classifier,
    train_features=train_features,
    train_targets=train_targets,
    test_features=test_features,
)

In [None]:
# Print ROC AUC, confusion matrix and the derived metrics
evaluateModel(
    crf=crf,
    train_targets=train_targets,
    train_pred=train_pred,
    test_targets=test_targets,
    test_pred=test_pred,
    test_features=test_features,
)