In [143]:
# Dependencies
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sbn
import sklearn.linear_model as sklearnLinearModels
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [225]:
# Import and View
df = pd.read_csv("./Resources/diabetic_data.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [226]:
df['race'] = df['race'].astype('category')
df['gender'] = df['gender'].astype('category')
df['race_new'] = df['race'].cat.codes
df['gen_new'] = df['gender'].cat.codes


In [227]:
enc = OneHotEncoder()
enc_df = pd.DataFrame(enc.fit_transform(df[['race_new', 'gen_new']]).toarray())
df = pd.concat([df, enc_df], axis=1)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,gen_new,0,1,2,3,4,5,6,7,8
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [228]:
# Clean dataset for readability.
# Remove columns with too many missing values
df = df.drop(columns=["weight", "payer_code", "medical_specialty"])
# Remove ID columns except for "patient_nbr"
df = df.drop(columns=["encounter_id", "admission_type_id", "discharge_disposition_id", "admission_source_id"])
# "weight" column is unusable data

# "group by patient_nbr"
df = df.groupby("patient_nbr").last().reset_index()
df

Unnamed: 0,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,gen_new,0,1,2,3,4,5,6,7,8
0,135,Caucasian,Female,[50-60),3,31,1,14,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,378,Caucasian,Female,[50-60),2,49,1,11,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,729,Caucasian,Female,[80-90),4,68,2,23,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,774,Caucasian,Female,[80-90),3,46,0,20,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,927,AfricanAmerican,Female,[30-40),5,49,0,5,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71513,189351095,Caucasian,Female,[80-90),1,73,1,11,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
71514,189365864,Other,Male,[60-70),3,56,1,8,0,0,...,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
71515,189445127,Caucasian,Female,[80-90),3,39,0,18,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
71516,189481478,Caucasian,Female,[40-50),14,69,0,16,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [229]:

# One-hot Encoding
df["readm_tf"] = df["readmitted"].apply(lambda x: True if x in ["<30", ">30"] else False)
#preEncodedDf = df.drop(columns = ["encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "payer_code", "medical_specialty", "readmitted", "readm_tf"])
#encodedDf = pd.get_dummies(preEncodedDf, columns = preEncodedDf.columns)
#encodedDf.head()

In [231]:
# Find correlation between medication and readmission

medications = df.columns[df.columns.str.contains("metformin|repaglinide|nateglinide|chlorpropamide|glimepiride|acetohexamide|glipizide|glyburide|tolbutamide|pioglitazone|rosiglitazone|acarbose|miglitol|troglitazone|tolazamide|examide|citoglipton|insulin|glyburide-metformin|glipizide-metformin|glimepiride-pioglitazone|metformin-rosiglitazone|metformin-pioglitazone").fillna(False)]
medicationDf_encoded = pd.get_dummies(df[medications]) 
med_correlation = medicationDf_encoded.corrwith(df["readm_tf"])
med_correlation = med_correlation.sort_values(ascending=False)
print(med_correlation)


pioglitazone_Steady            0.022661
glipizide_Steady               0.021732
insulin_Down                   0.018381
repaglinide_Steady             0.014618
metformin_No                   0.014117
                                 ...   
glipizide_No                  -0.025299
pioglitazone_No               -0.025593
examide_No                          NaN
citoglipton_No                      NaN
glimepiride-pioglitazone_No         NaN
Length: 69, dtype: float64


In [207]:
# drop meds with low correlation and 
#med_correlation = med_correlation[med_correlation > 0.01]
#medications = med_correlation.index
#medicationDf_encoded = medicationDf_encoded[medications]
#medicationDf_encoded

Unnamed: 0,pioglitazone_Steady,glipizide_Steady,insulin_Down,repaglinide_Steady,metformin_No,glipizide_Up,glimepiride_Steady,rosiglitazone_Steady,pioglitazone_Up,pioglitazone_Down,acarbose_Steady,readm_tf
0,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
71513,False,False,False,False,True,False,False,False,False,False,False,False
71514,False,False,False,False,True,False,False,False,False,False,False,False
71515,False,False,True,False,False,True,False,True,False,False,False,False
71516,False,False,True,False,False,False,False,False,False,False,False,True


In [232]:
# Encode procedures
procedureDf_encoded = df[['num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_diagnoses']]

procedureDf_encoded = pd.get_dummies(procedureDf_encoded)
procedureDf_encoded

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses
0,31,1,14,0,5
1,49,1,11,0,3
2,68,2,23,0,9
3,46,0,20,0,9
4,49,0,5,0,3
...,...,...,...,...,...
71513,73,1,11,0,9
71514,56,1,8,0,7
71515,39,0,18,0,9
71516,69,0,16,0,5


In [233]:
# Encode diagnosis
diagnosisDf = df[['diag_1', 'diag_2', 'diag_3']]


diagnosisDf_encoded = pd.get_dummies(diagnosisDf)

In [235]:
# Merge the dataframes
mergedDf = pd.concat([medicationDf_encoded, procedureDf_encoded], axis = 1)
mergedDf = pd.concat([procedureDf_encoded, diagnosisDf_encoded], axis = 1)
mergedDf.head()

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses,diag_1_10,diag_1_11,diag_1_110,diag_1_112,diag_1_114,...,diag_3_V61,diag_3_V62,diag_3_V63,diag_3_V64,diag_3_V65,diag_3_V66,diag_3_V70,diag_3_V72,diag_3_V85,diag_3_V86
0,31,1,14,0,5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,49,1,11,0,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,68,2,23,0,9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,46,0,20,0,9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,49,0,5,0,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# correlMatrix = encodedDf.corr()
# print(correlMatrix)

# COMMENTED OUT TO SAVE YOU TIME WHEN RUNNING ALL THE CELLS.
# ONLY RUN IF YOU NEED IT

In [236]:
# Logistic Regression Model and import libraries
import sklearn as skl
import sklearn.linear_model as sklearnLinearModels
import sklearn.model_selection

import sklearn.metrics

X = mergedDf
Y = df["readm_tf"]

X_train, X_test, Y_train, Y_test = skl.model_selection.train_test_split(X, Y, test_size=0.2, random_state=5)

model = sklearnLinearModels.LogisticRegression(max_iter = 1000)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("Accuracy:", skl.metrics.accuracy_score(Y_test, Y_pred))
print("Classification Report:\n", skl.metrics.classification_report(Y_test, Y_pred))
print("Confusion Matrix:\n", skl.metrics.confusion_matrix(Y_test, Y_pred))

Accuracy: 0.758668903803132
Classification Report:
               precision    recall  f1-score   support

       False       0.76      0.99      0.86     10846
        True       0.51      0.03      0.06      3458

    accuracy                           0.76     14304
   macro avg       0.64      0.51      0.46     14304
weighted avg       0.70      0.76      0.67     14304

Confusion Matrix:
 [[10748    98]
 [ 3354   104]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
