In [76]:
# Load the readmission dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("final_data.csv")

In [77]:
df.keys()

Index(['Unnamed: 0', 'Unnamed: 0.1', 'bed_day_dep', 'DRG', 'gender',
       'timestamp_in', 'timestamp_out', 'degree_of_urgency', 'patientID',
       'hosp_unique_ID', 'mainDiagICD10Chapter', 'mainDiagBlock',
       'diagnosis_code_1', 'age_group', 'bed_day_hosp', 'readmission', 'mors',
       'lead_to_readmission'],
      dtype='object')

Features that will not be used for prediction and set target variable

In [78]:
# category 1
features_not_predictive = ['Unnamed: 0','Unnamed: 0.1', 'bed_day_dep', 'timestamp_in', 'timestamp_out', 'hosp_unique_ID', 'mainDiagICD10Chapter', 'diagnosis_code_1', 'readmission', 'mors']

# category 2
features_to_remove = []
# category 3 (the rest)

# features to drop is category 1 & category 2
features_to_drop = features_not_predictive + features_to_remove
# and the outcome:
label = "lead_to_readmission"

In [79]:
df['gender'] = df.gender.astype('category')
df['age_group']= df.age_group.astype('category')
df['mainDiagBlock'] = df.mainDiagBlock.astype('category')

In [82]:
# Define the dataset and the target variable

X = df.drop(columns=features_to_drop+[label])
y = df[[label]]


In [84]:
# Split the patients (20% in test set, 60% in train and 10% in validation set)

from sklearn.model_selection import train_test_split

patient_list = pd.unique(df['patientID'])

patient_train, patient_test = train_test_split(patient_list, test_size=0.3, random_state=42) #70 percent test data
patient_val, patient_test = train_test_split(patient_test, test_size=0.6, random_state=42) 



In [85]:
#Split the dataset

train_data = X[X['patientID'].isin(patient_train)]
val_data = X[X['patientID'].isin(patient_val)]
test_data = X[X['patientID'].isin(patient_test)]

y = y.join(X['patientID'])

train_y = y[y['patientID'].isin(patient_train)]
val_y = y[y['patientID'].isin(patient_val)]
test_y = y[y['patientID'].isin(patient_test)]

train_y = train_y.drop('patientID', axis=1)
val_y = val_y.drop('patientID', axis=1)
test_y = test_y.drop('patientID', axis=1)



Unnamed: 0,lead_to_readmission
1,0.0
2,0.0
4,0.0
5,0.0
6,1.0
...,...
207253,0.0
207254,0.0
207255,0.0
207256,0.0


One hot encode categorical variables for train and test set 

In [87]:
oneHot_age_group =  pd.get_dummies(train_data['age_group'])

train_data = train_data.join(oneHot_age_group)
train_data  = train_data.drop('age_group', axis = 1)


In [88]:
oneHot_main_diag =  pd.get_dummies(train_data['mainDiagBlock'])

train_data = train_data.join(oneHot_main_diag)
train_data  = train_data.drop('mainDiagBlock', axis = 1)


In [89]:
oneHot_gender =  pd.get_dummies(train_data['gender'])

train_data = train_data.join(oneHot_gender)
train_data  = train_data.drop('gender', axis = 1)


In [91]:
oneHot_urgency =  pd.get_dummies(train_data['degree_of_urgency'],prefix=['planned','emergency'])

train_data = train_data.join(oneHot_urgency)
train_data  = train_data.drop('degree_of_urgency', axis = 1)



In [94]:
oneHot_age_group =  pd.get_dummies(test_data['age_group'])

test_data = test_data.join(oneHot_age_group)
test_data  = test_data.drop('age_group', axis = 1)


In [96]:
oneHot_main_diag =  pd.get_dummies(test_data['mainDiagBlock'])

test_data = test_data.join(oneHot_main_diag)
test_data  = test_data.drop('mainDiagBlock', axis = 1)


In [97]:
oneHot_gender =  pd.get_dummies(test_data['gender'])

test_data = test_data.join(oneHot_gender)
test_data  = test_data.drop('gender', axis = 1)


In [98]:
oneHot_urgency =  pd.get_dummies(test_data['degree_of_urgency'],prefix=['planned','emergency'])


test_data = test_data.join(oneHot_urgency)
test_data  = test_data.drop('degree_of_urgency', axis = 1)

In [129]:
# fit the model
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
lr.fit(train_data.drop(['patientID',train_data.keys()[-1], train_data.keys()[-2]],axis = 1), train_y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')

In [120]:
# Evaluate a model on train and test set
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from collections import defaultdict


metrics_dic = {"Accuracy": accuracy_score,
               "Precision": precision_score,
               "Recall": recall_score,
               "F1-score": f1_score,
               "AUC Score": roc_auc_score}


def evaluate_model(model, metrics_dic, X_train, y_train, X_test, y_test, sample_weight=None):
    metrics = defaultdict(list)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    for metric_name, metric_fun in metrics_dic.items():
        metrics[metric_name].append(metric_fun(y_train, pred_train, sample_weight=sample_weight))
        metrics[metric_name].append(metric_fun(y_test, pred_test))

    metrics["Dataset"] = ["Train", "Test"]
    # Result as a dataframe
    res = pd.DataFrame()
    for key in list(metrics.keys()):
        res[key] = metrics[key]
    res.set_index("Dataset", inplace=True)
    return res

In [130]:
lr_metrics = evaluate_model(lr, metrics_dic, train_data.drop(['patientID',train_data.keys()[-1], train_data.keys()[-2]],axis=1), train_y, test_data.drop(['patientID',train_data.keys()[-1], train_data.keys()[-2]],axis=1), test_y)



In [131]:
lr_metrics

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-score,AUC Score
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Train,0.634859,0.132601,0.645494,0.220007,0.639716
Test,0.629394,0.129268,0.620859,0.213983,0.625504


In [132]:
lr_metrics.to_csv("lr_metrics_no_urgency.csv", index=False)

In [146]:
#Produce confusion matrix and sensitivity/specificity
from sklearn.metrics import confusion_matrix,classification_report

pred = lr.predict(drop_gender)

#print(classification_report(test_y,pred))

cm1 = confusion_matrix(test_y,pred)

sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1]) 
print('Sensitivity : ', sensitivity1 )  
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1]) 
print('Specificity : ', specificity1)

print(cm1)

Sensitivity :  0.6301485091077851
Specificity :  0.6208592981305346
[[21725 12751]
 [ 1156  1893]]


  Note that passing sample_weight=None will output an array of ones.
