In [2]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import plotly.express as px
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
import warnings

In [3]:
# Import and View
# Read the raw CSV into `df` and show the first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="./Resources/diabetic_data.csv")
df.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# Display more information on the dataset
# First the per-column dtype / non-null summary, then a blank line followed by
# the frame's plain-text representation.
df.info()
print("", df, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
# Clean dataset for readability.
# Remove any errors or faulty data.

# "weight" column is unusable data
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because "weight" is already gone.
df = df.drop(columns=["weight"], errors="ignore")

# 

In [6]:
# One-hot Encoding
# Boolean target: True when "readmitted" is "<30" or ">30", False for any other value.
df["readm_tf"] = df["readmitted"].isin(["<30", ">30"])

# Columns kept out of the feature matrix (this includes the target and its source column).
nonFeatureCols = [
    "encounter_id",
    "patient_nbr",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "payer_code",
    "medical_specialty",
    "readmitted",
    "readm_tf",
]
preEncodedDf = df.drop(columns=nonFeatureCols)

# Every remaining column is named explicitly in `columns`, so numeric columns are
# expanded into one indicator per distinct value as well, not just the text columns.
encodedDf = pd.get_dummies(preEncodedDf, columns=preEncodedDf.columns)
encodedDf.head()

Unnamed: 0,race_?,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_[0-10),...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,False,False,False,True,False,False,True,False,False,True,...,True,False,True,False,True,False,False,True,True,False
1,False,False,False,True,False,False,True,False,False,False,...,True,False,True,False,True,False,True,False,False,True
2,False,True,False,False,False,False,True,False,False,False,...,True,False,True,False,True,False,False,True,False,True
3,False,False,False,True,False,False,False,True,False,False,...,True,False,True,False,True,False,True,False,False,True
4,False,False,False,True,False,False,False,True,False,False,...,True,False,True,False,True,False,True,False,False,True


In [7]:
# correlMatrix = encodedDf.corr()
# print(correlMatrix)

# The correlation matrix above is commented out to save time when running all cells.
# Uncomment and run it only if you need it.

In [25]:
# Logistic Regression Model
# Features are the full one-hot encoded frame; the target is the boolean "readm_tf" column.
X, Y = encodedDf, df["readm_tf"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)

model = LogisticRegression(max_iter=5000)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Score the held-out predictions before printing anything.
testAccuracy = accuracy_score(Y_test, Y_pred)
testReport = classification_report(Y_test, Y_pred)
testConfusion = confusion_matrix(Y_test, Y_pred)

print("Accuracy:", testAccuracy)
print("Classification Report:\n", testReport)
print("Confusion Matrix:\n", testConfusion)

# First (and, for a two-class target, presumably only) row of the fitted coefficients,
# kept as a list because the feature-selection cell below reads it; printed one per line.
coeffList = list(model.coef_[0])
for coefficient in coeffList:
    print(coefficient)

Accuracy: 0.6338311879728801
Classification Report:
               precision    recall  f1-score   support

       False       0.64      0.74      0.69     10969
        True       0.63      0.51      0.56      9385

    accuracy                           0.63     20354
   macro avg       0.63      0.62      0.62     20354
weighted avg       0.63      0.63      0.63     20354

Confusion Matrix:
 [[8109 2860]
 [4593 4792]]
-0.20684275899211307
0.1658564405608606
-0.06108233945099808
0.22751802463782117
0.036589780701154574
-0.014214903364486595
0.2902358253132286
0.21548241867521403
-0.3578939998962067
-0.7825253537070825
0.10826876230875808
0.09635749509193194
0.08400953742856865
0.1122436459580692
0.10460195432398442
0.17079782765229226
0.2255683649987189
0.18397216892266693
-0.15547015888567764
-0.13653783689410476
-0.04965451529444147
-0.048574453460184075
0.020435110385792526
-0.01494590314977179
0.04019037624403854
0.02454743974204035
0.07391774177240566
0.08587568321300164
0.1420

In [40]:
# Only Using Coefficients With A Value Greater Than Or Equal To 1
# Minimum (signed) coefficient a feature must have to be kept.
COEFF_THRESHOLD = 1

# Key every coefficient by its position in coeffList ("Coeff_<i>"). The position is
# later used as a column index into encodedDf, so coeffList is expected to come from
# a model fitted on encodedDf's columns in order.
coeffDict = {f"Coeff_{i}": coef for i, coef in enumerate(coeffList)}
print(coeffDict)

# Same mapping, ordered from the largest coefficient to the smallest.
sortedCoeffDict = dict(sorted(coeffDict.items(), key=lambda item: item[1], reverse=True))
print(sortedCoeffDict)

# NOTE(review): the comparison is signed, so features with strongly negative
# coefficients (e.g. <= -1) are discarded too. Confirm that is intended, or compare
# on abs(value) instead.
culledCoeffDict = {key: value for key, value in sortedCoeffDict.items() if value >= COEFF_THRESHOLD}
print(culledCoeffDict)

# Recover the integer column positions from the "Coeff_<i>" keys.
listOfFeatureIDs = [int(key.replace("Coeff_", "")) for key in culledCoeffDict]
# The loop variable is deliberately not called `id`: at notebook (global) scope that
# would rebind the builtin id() for every later cell in the kernel session.
for featureID in listOfFeatureIDs:
    print(featureID)

# Keep only the selected columns, in descending-coefficient order.
newDf = encodedDf.iloc[:, listOfFeatureIDs]
newDf.head()

{'Coeff_0': -0.20684275899211307, 'Coeff_1': 0.1658564405608606, 'Coeff_2': -0.06108233945099808, 'Coeff_3': 0.22751802463782117, 'Coeff_4': 0.036589780701154574, 'Coeff_5': -0.014214903364486595, 'Coeff_6': 0.2902358253132286, 'Coeff_7': 0.21548241867521403, 'Coeff_8': -0.3578939998962067, 'Coeff_9': -0.7825253537070825, 'Coeff_10': 0.10826876230875808, 'Coeff_11': 0.09635749509193194, 'Coeff_12': 0.08400953742856865, 'Coeff_13': 0.1122436459580692, 'Coeff_14': 0.10460195432398442, 'Coeff_15': 0.17079782765229226, 'Coeff_16': 0.2255683649987189, 'Coeff_17': 0.18397216892266693, 'Coeff_18': -0.15547015888567764, 'Coeff_19': -0.13653783689410476, 'Coeff_20': -0.04965451529444147, 'Coeff_21': -0.048574453460184075, 'Coeff_22': 0.020435110385792526, 'Coeff_23': -0.01494590314977179, 'Coeff_24': 0.04019037624403854, 'Coeff_25': 0.02454743974204035, 'Coeff_26': 0.07391774177240566, 'Coeff_27': 0.08587568321300164, 'Coeff_28': 0.14206785482712314, 'Coeff_29': -0.021762830006342295, 'Coeff_30

Unnamed: 0,diag_1_643,number_inpatient_11,diag_3_250.91,diag_1_282,diag_2_513,diag_1_583,diag_3_359,number_emergency_10,diag_2_E947,diag_3_712,...,diag_2_826,diag_3_481,diag_2_565,diag_1_526,diag_1_5,diag_1_359,diag_2_250.93,diag_1_445,diag_3_709,diag_2_E936
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [46]:
# Re-attempt Logistic Regression With "Better" Features
# Same target as before, but only the columns retained in newDf are used as features.
X, Y = newDf, df["readm_tf"]

# 80/20 split with a fixed seed so the evaluation rows are repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)

# fit() returns the estimator itself, so `model` is the trained classifier.
model = LogisticRegression(max_iter=5000).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Print each metric for the held-out rows under its heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(Y_test, Y_pred))

Accuracy: 0.5410730077626019
Classification Report:
               precision    recall  f1-score   support

       False       0.54      1.00      0.70     10969
        True       0.68      0.01      0.02      9385

    accuracy                           0.54     20354
   macro avg       0.61      0.50      0.36     20354
weighted avg       0.61      0.54      0.39     20354

Confusion Matrix:
 [[10931    38]
 [ 9303    82]]


In [9]:
# Decision Trees Model "gini"
# Bind the inputs explicitly instead of inheriting whatever X / Y an earlier cell left
# behind: run top to bottom, the previous cell sets X = newDf (the culled feature
# subset), so this cell would silently train on different data than the "entropy"
# tree below, which uses the full one-hot encoded frame.
X = encodedDf
Y = df["readm_tf"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Shallow tree (depth 3, at least 5 samples per leaf) split on Gini impurity.
model = DecisionTreeClassifier(criterion="gini", random_state=5, max_depth=3, min_samples_leaf=5)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Accuracy on the held-out rows.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.620909894860961


In [10]:
# Decision Trees Model "entropy"
X = encodedDf
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

model = DecisionTreeClassifier(criterion="entropy", random_state=5, max_depth=3, min_samples_leaf=5)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6212046772133242
