In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import plotly.express as px
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
import warnings

In [2]:
# Load the raw CSV into `df` and preview its first five rows.
DATA_CSV = "./Resources/diabetic_data.csv"
df = pd.read_csv(filepath_or_buffer=DATA_CSV)
df.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Display more information on the dataset.
# df.info() prints each column's name, non-null count and dtype to stdout.
df.info()

# Show a per-column summary as the cell's rich output instead of print(df),
# which renders the frame itself as plain text.
# include="all" summarises the object (string) columns as well as the numeric
# ones; .T lays the result out with one dataset column per row.
df.describe(include="all").T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [4]:
# Clean dataset for readability.
# Remove any errors or faulty data.

# The "weight" column is treated as unusable and removed from the frame.
# errors="ignore" keeps this cell safe to re-run: once "weight" is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["weight"], errors="ignore")

# 

In [5]:
# One-hot Encoding

# Binary target: True where "readmitted" is "<30" or ">30", False for any
# other value. Series.isin is the vectorised equivalent of a per-row lambda.
df["readm_tf"] = df["readmitted"].isin(["<30", ">30"])

# Feature frame: every column except the ones listed here, which include the
# raw target ("readmitted") and the derived target ("readm_tf").
preEncodedDf = df.drop(
    columns=[
        "encounter_id",
        "patient_nbr",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "payer_code",
        "medical_specialty",
        "readmitted",
        "readm_tf",
    ]
)

# NOTE(review): columns=preEncodedDf.columns asks get_dummies to expand every
# remaining column, including the int64 ones (e.g. time_in_hospital), into one
# indicator column per distinct value - confirm this is intended.
encodedDf = pd.get_dummies(preEncodedDf, columns=preEncodedDf.columns)
encodedDf.head()

Unnamed: 0,race_?,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_[0-10),...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,False,False,False,True,False,False,True,False,False,True,...,True,False,True,False,True,False,False,True,True,False
1,False,False,False,True,False,False,True,False,False,False,...,True,False,True,False,True,False,True,False,False,True
2,False,True,False,False,False,False,True,False,False,False,...,True,False,True,False,True,False,False,True,False,True
3,False,False,False,True,False,False,False,True,False,False,...,True,False,True,False,True,False,True,False,False,True
4,False,False,False,True,False,False,False,True,False,False,...,True,False,True,False,True,False,True,False,False,True


In [6]:
# correlMatrix = encodedDf.corr()
# print(correlMatrix)

# NOTE: The correlation matrix above is commented out to save time when running
# all the cells. Uncomment and run it only if you need it.

In [22]:
# Logistic Regression Model
# Features are the one-hot encoded frame; the target is the boolean readm_tf column.
X = encodedDf
Y = df["readm_tf"]

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=5000).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Compute each metric on the held-out split, then print them in a fixed order.
reports = [
    ("Accuracy:", accuracy_score(Y_test, Y_pred)),
    ("Classification Report:\n", classification_report(Y_test, Y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred)),
]
for label, value in reports:
    print(label, value)

Accuracy: 0.6299899280222074
Classification Report:
               precision    recall  f1-score   support

       False       0.63      0.74      0.68     21891
        True       0.62      0.51      0.56     18816

    accuracy                           0.63     40707
   macro avg       0.63      0.62      0.62     40707
weighted avg       0.63      0.63      0.62     40707

Confusion Matrix:
 [[16117  5774]
 [ 9288  9528]]


In [27]:
# Decision Trees Model "gini"

# Define the features and target here rather than relying on X and Y left over
# from the logistic-regression cell, so this cell also runs correctly on its own.
X = encodedDf
Y = df["readm_tf"]

# Same test_size and random_state as the other model cells, so each model is
# evaluated on the same held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Shallow tree (depth at most 3, at least 5 samples per leaf) split on Gini impurity.
model = DecisionTreeClassifier(criterion="gini", random_state=5, max_depth=3, min_samples_leaf=5)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.620909894860961


In [28]:
# Decision Trees Model "entropy"

# Define both the features and the target here (the original cell rebound only
# X), so this cell does not depend on a Y left over from an earlier cell.
X = encodedDf
Y = df["readm_tf"]

# Same test_size and random_state as the other model cells, so each model is
# evaluated on the same held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Shallow tree (depth at most 3, at least 5 samples per leaf) split on entropy.
model = DecisionTreeClassifier(criterion="entropy", random_state=5, max_depth=3, min_samples_leaf=5)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6212046772133242
