# Import Main Packagae and Data

In [221]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate   #break up dataset into train and test sets
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

df = pd.read_csv('diabetic_data.csv')
df


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,MC,?,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,MC,?,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,MC,?,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [222]:
pd.set_option('display.max_columns', 100)

# Checking for and Addressing Missingness

In [223]:
# Drop Patient Identifiers
df.drop(['encounter_id', 'patient_nbr'], axis = 1, inplace = True)

# The dataset denotes missing values with the '?'.
df = df.replace('?', np.nan)

# Dropping some columns that have large percentages of missing values.
df.drop(['weight','medical_specialty', 'payer_code'], axis = 1, inplace = True)

# According to the variable table/data dictionary, the 'A1Cresult' and 'max_glu_serum' do not have missing values. The 'nan' values in two columns indicate that no measurements were taken.
df[['A1Cresult', 'max_glu_serum']] = df[['A1Cresult', 'max_glu_serum']].fillna('not_measured')

# Dropping any additional rows with missing values.
df.dropna(inplace=True)

df


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,0,0,0,276,250.01,255,9,not_measured,not_measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,2,0,1,648,250,V27,6,not_measured,not_measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,0,0,0,8,250.43,403,7,not_measured,not_measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197,157,250,5,not_measured,not_measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,414,411,250,9,not_measured,not_measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,0,0,0,250.13,291,458,9,not_measured,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,0,0,1,560,276,787,9,not_measured,not_measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
101763,Caucasian,Male,[70-80),1,1,7,1,53,0,9,1,0,0,38,590,296,13,not_measured,not_measured,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,Caucasian,Female,[80-90),2,3,7,10,45,2,21,0,0,1,996,285,998,9,not_measured,not_measured,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [224]:
# Limitig the DataFrame to columns that may be important to our model
df = df.drop(columns=['diag_2', 'diag_3','metformin', 'repaglinide', 'nateglinide', 'chlorpropamide','glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone','tolazamide', 'examide', 'citoglipton','glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone','metformin-pioglitazone'])

# Encoding Categorical Values

In [225]:
df['readmitted'] = df['readmitted'].replace({'NO':0, '>30':0, '<30':1}).astype(int) # 1 = readmitted within 30 days

df['A1Cresult'] = df['A1Cresult'].replace({'not_measured':0, 'Norm':1, '>7':2, '>8':3}).astype(int)

df['diabetesMed'] = df['diabetesMed'].replace({'No':0, 'Yes':1}).astype(int)

df['insulin'] = df['insulin'].replace({'Down':1, 'No':2, 'Steady':3, 'Up':4}).astype(int)

race_dict = {'Caucasian':0,'AfricanAmerican':1,'Hispanic':2,'Other':3,'Asian':4}
df['race'] = df['race'].replace(race_dict)

gender_dict = {'Male':0, 'Female':1, 'Unknown/Invalid':2}
df['gender'] = df['gender'].replace(gender_dict)

age_dict = {'[70-80)':0,'[60-70)':1,'[80-90)':2,'[50-60)':3,'[40-50)':4,'[30-40)':5,'[90-100)':6,'[20-30)':7,'[10-20)':8,'[0-10)':9}
df['age'] = df['age'].replace(age_dict)

max_glu_dict = {'not_measured': 0,'Norm': 1,'>200': 2,'>300': 3}
df['max_glu_serum'] = df['max_glu_serum'].replace(max_glu_dict)

df['change'] = df['change'].replace({'No': 0, 'Ch':1})

  df['readmitted'] = df['readmitted'].replace({'NO':0, '>30':0, '<30':1}).astype(int) # 1 = readmitted within 30 days
  df['A1Cresult'] = df['A1Cresult'].replace({'not_measured':0, 'Norm':1, '>7':2, '>8':3}).astype(int)
  df['diabetesMed'] = df['diabetesMed'].replace({'No':0, 'Yes':1}).astype(int)
  df['insulin'] = df['insulin'].replace({'Down':1, 'No':2, 'Steady':3, 'Up':4}).astype(int)
  df['race'] = df['race'].replace(race_dict)
  df['gender'] = df['gender'].replace(gender_dict)
  df['age'] = df['age'].replace(age_dict)
  df['max_glu_serum'] = df['max_glu_serum'].replace(max_glu_dict)
  df['change'] = df['change'].replace({'No': 0, 'Ch':1})


In [226]:
# This column contains Diagnosis codes and there are too many to encode manually.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['diag_1'] = le.fit_transform(df['diag_1'])

In [227]:
#df.drop('diag_1', inplace=True, axis=1)

In [228]:
y = df['readmitted']
X = df.copy()
X = X.drop('readmitted', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y, random_state=42)


In [229]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Fit the model
lr = LogisticRegression(random_state=42, solver='newton-cg', max_iter=200)
lr.fit(X_train, y_train)

# Make predictions (using predict instead of predict_proba for evaluation)
y_preds = lr.predict(X_test)

# Print evaluation metrics
print('Metrics for Validation data:')
print('Area Under Curve:', '%.3f' % roc_auc_score(y_test, y_preds))
print('Accuracy:', '%.3f' % accuracy_score(y_test, y_preds)) 
print('Precision:', '%.3f' % precision_score(y_test, y_preds)) # False Positives
print('Recall:', '%.3f' % recall_score(y_test, y_preds)) # False Negatives
print('F1 Score:', '%.3f' % f1_score(y_test, y_preds))


Metrics for Validation data:
Area Under Curve: 0.508
Accuracy: 0.887
Precision: 0.485
Recall: 0.017
F1 Score: 0.033


In [230]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Get the predictions on test data
y_preds = gnb.predict(X_test)

print('Metrics for Validation data:')
print('Area Under Curve:', '%.3f' % roc_auc_score(y_test, y_preds))
print('Accuracy:', '%.3f' % accuracy_score(y_test, y_preds)) 
print('Precision:', '%.3f' % precision_score(y_test, y_preds)) # False Positives
print('Recall:', '%.3f' % recall_score(y_test, y_preds)) # False Negatives
print('F1 Score:', '%.3f' % f1_score(y_test, y_preds))

Metrics for Validation data:
Area Under Curve: 0.540
Accuracy: 0.854
Precision: 0.241
Recall: 0.135
F1 Score: 0.173


In [231]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train a model with class weighting
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_resampled, y_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
print('Metrics for Validation data:')
print('Area Under Curve:', '%.3f' % roc_auc_score(y_test, y_preds))
print('Accuracy:', '%.3f' % accuracy_score(y_test, y_preds)) 
print('Precision:', '%.3f' % precision_score(y_test, y_preds)) # False Positives
print('Recall:', '%.3f' % recall_score(y_test, y_preds)) # False Negatives
print('F1 Score:', '%.3f' % f1_score(y_test, y_preds))

Metrics for Validation data:
Area Under Curve: 0.540
Accuracy: 0.854
Precision: 0.241
Recall: 0.135
F1 Score: 0.173
