In [1]:
# Dependencies
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sbn
import sklearn.linear_model as sklearnLinearModels
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [91]:
# Import and View
df = pd.read_csv("./Resources/diabetic_data.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [89]:
# Display more information on the dataset
df.info()
print()
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [92]:
# Clean dataset for readability.
# Remove any errors or faulty data.

# "weight" column is unusable data
df = df.drop(columns=["weight"])

# 

In [93]:
# One-hot Encoding
df["readm_tf"] = df["readmitted"].apply(lambda x: True if x in ["<30", ">30"] else False)
#preEncodedDf = df.drop(columns = ["encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "payer_code", "medical_specialty", "readmitted", "readm_tf"])
#encodedDf = pd.get_dummies(preEncodedDf, columns = preEncodedDf.columns)
#encodedDf.head()

In [95]:
# Medications One-hot Encoding
meds = df[["patient_nbr","readm_tf","metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]]
# Optimize
medicationDf_encoded = meds.applymap(lambda x: 1 if x in ["Down", "Up", "Steady"] else 0)

medicationDf_encoded



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000207868E1990>

In [62]:
# Reset index and patient number
medicationDf_encoded = medicationDf_encoded.reset_index()
medicationDf_encoded = medicationDf_encoded.drop(columns = ["patient_nbr"])
medicationDf_encoded = pd.get_dummies(medicationDf_encoded)
medicationDf_encoded

Unnamed: 0,readm_tf,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,...,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone
0,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,False,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,False,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,False,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
101762,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
101763,False,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
101764,False,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Create dataframes for the medications and encode
#medications = ["change","patient_nbr","metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone", "readm_tf"]
#medicationDf = df[medications]
#medicationDf_encoded = pd.get_dummies(medicationDf)
#medicationDf_encoded

In [32]:
# Encode procedures
procedures = ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_diagnoses', 'patient_nbr', 'readm_tf']
procedureDf = df[procedures]
# Group by patient number
procedureDf_encoded = procedureDf.groupby("patient_nbr").sum()
# Encode the readmission status
procedureDf_encoded["readm_tf"] = procedureDf_encoded["readm_tf"].apply(lambda x: True if x > 0 else False)
# Drop the patient number
procedureDf_encoded = procedureDf_encoded.drop(columns=["patient_nbr"])
procedureDf_encoded = pd.get_dummies(procedureDf)
procedureDf_encoded

KeyError: "['patient_nbr'] not found in axis"

In [21]:
# Encode diagnosis
diagnosis = ['diag_1', 'diag_2', 'diag_3','readm_tf']
diagnosisDf = df[diagnosis]
diagnosisDf_encoded = pd.get_dummies(diagnosisDf)

In [10]:
# Merge the dataframes
mergedDf = pd.merge(medicationDf_encoded, procedureDf_encoded, on = "readm_tf")
#mergedDf = pd.merge(mergedDf, diagnosisDf_encoded, on = "change")
mergedDf.head()

In [11]:
# correlMatrix = encodedDf.corr()
# print(correlMatrix)

# COMMENTED OUT TO SAVE YOU TIME WHEN RUNNING ALL THE CELLS.
# ONLY RUN IF YOU NEED IT

In [12]:
# Logistic Regression Model and import libraries
import sklearn as skl
import sklearn.linear_model as sklearnLinearModels
import sklearn.model_selection

import sklearn.metrics

X = mergedDf
Y = df["readm_tf"]

X_train, X_test, Y_train, Y_test = skl.model_selection.train_test_split(X, Y, test_size=0.2, random_state=5)

model = sklearnLinearModels.LogisticRegression(max_iter = 1000)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("Accuracy:", skl.metrics.accuracy_score(Y_test, Y_pred))
print("Classification Report:\n", skl.metrics.classification_report(Y_test, Y_pred))
print("Confusion Matrix:\n", skl.metrics.confusion_matrix(Y_test, Y_pred))

NameError: name 'mergedDf' is not defined