In [143]:
# Dependencies
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sbn
import sklearn.linear_model as sklearnLinearModels
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [165]:
# Import and View
df = pd.read_csv("./Resources/diabetic_data.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [145]:
# Display more information on the dataset
df.info()
print()
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [154]:
# Clean dataset for readability.
# Remove columns with too many missing values
df = df.drop(columns=["weight", "payer_code", "medical_specialty"])
# Remove ID columns except for "patient_nbr"
df = df.drop(columns=["encounter_id", "admission_type_id", "discharge_disposition_id", "admission_source_id"])
# "weight" column is unusable data

# "group by patient_nbr"
df = df.groupby("patient_nbr").last().reset_index()
df

Unnamed: 0,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,135,Caucasian,Female,[50-60),3,31,1,14,0,0,...,No,No,No,No,No,No,No,Ch,Yes,>30
1,378,Caucasian,Female,[50-60),2,49,1,11,0,0,...,No,No,No,No,No,No,No,No,No,NO
2,729,Caucasian,Female,[80-90),4,68,2,23,0,0,...,No,No,No,No,No,No,No,No,Yes,NO
3,774,Caucasian,Female,[80-90),3,46,0,20,0,0,...,No,No,No,No,No,No,No,Ch,Yes,NO
4,927,AfricanAmerican,Female,[30-40),5,49,0,5,0,0,...,No,No,No,No,No,No,No,No,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71513,189351095,Caucasian,Female,[80-90),1,73,1,11,0,0,...,No,No,No,No,No,No,No,No,No,NO
71514,189365864,Other,Male,[60-70),3,56,1,8,0,0,...,No,Steady,No,No,No,No,No,No,Yes,NO
71515,189445127,Caucasian,Female,[80-90),3,39,0,18,0,0,...,No,Down,No,No,No,No,No,Ch,Yes,NO
71516,189481478,Caucasian,Female,[40-50),14,69,0,16,0,0,...,No,Down,No,No,No,No,No,Ch,Yes,>30


In [166]:
# One-hot Encoding
df["readm_tf"] = df["readmitted"].apply(lambda x: True if x in ["<30", ">30"] else False)
#preEncodedDf = df.drop(columns = ["encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "payer_code", "medical_specialty", "readmitted", "readm_tf"])
#encodedDf = pd.get_dummies(preEncodedDf, columns = preEncodedDf.columns)
#encodedDf.head()

In [172]:
# Medications One-hot Encoding
meds = df[["metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]]
# Encode the medications
medicationDf_encoded = pd.get_dummies(meds)

medicationDf_encoded



Unnamed: 0,metformin_Down,metformin_No,metformin_Steady,metformin_Up,repaglinide_Down,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_Down,nateglinide_No,...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady
0,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
1,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
2,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
3,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
4,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,False,False,True,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
101762,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
101763,False,False,True,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False
101764,False,True,False,False,False,True,False,False,False,True,...,False,False,True,False,True,False,True,False,True,False


In [None]:
#medication analysis
meds

In [173]:
# Encode procedures
procedureDf_encoded = df[['num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_diagnoses']]

procedureDf_encoded = pd.get_dummies(procedureDf_encoded)
procedureDf_encoded

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses
0,41,0,1,0,1
1,59,0,18,0,9
2,11,5,13,0,6
3,44,1,16,0,7
4,51,0,8,0,5
...,...,...,...,...,...
101761,51,0,16,0,9
101762,33,3,18,0,9
101763,53,0,9,0,13
101764,45,2,21,0,9


In [170]:
# Encode diagnosis
diagnosisDf = df[['diag_1', 'diag_2', 'diag_3','readm_tf']]


diagnosisDf_encoded = pd.get_dummies(diagnosisDf)

In [10]:
# Merge the dataframes
mergedDf = pd.merge(medicationDf_encoded, procedureDf_encoded, on = "readm_tf")
#mergedDf = pd.merge(mergedDf, diagnosisDf_encoded, on = "change")
mergedDf.head()

In [11]:
# correlMatrix = encodedDf.corr()
# print(correlMatrix)

# COMMENTED OUT TO SAVE YOU TIME WHEN RUNNING ALL THE CELLS.
# ONLY RUN IF YOU NEED IT

In [12]:
# Logistic Regression Model and import libraries
import sklearn as skl
import sklearn.linear_model as sklearnLinearModels
import sklearn.model_selection

import sklearn.metrics

X = mergedDf
Y = df["readm_tf"]

X_train, X_test, Y_train, Y_test = skl.model_selection.train_test_split(X, Y, test_size=0.2, random_state=5)

model = sklearnLinearModels.LogisticRegression(max_iter = 1000)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("Accuracy:", skl.metrics.accuracy_score(Y_test, Y_pred))
print("Classification Report:\n", skl.metrics.classification_report(Y_test, Y_pred))
print("Confusion Matrix:\n", skl.metrics.confusion_matrix(Y_test, Y_pred))

NameError: name 'mergedDf' is not defined