In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [3]:
# source = https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008
file_url = "https://raw.githubusercontent.com/Tobbs11/Diabetes-readmission-Project/main/diabetic_data.csv"
diabetes_df = pd.read_csv(file_url)

In [5]:
diabetes_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
diabetes_df.shape, diabetes_df.columns

((101766, 50),
 Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
        'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'time_in_hospital', 'payer_code', 'medical_specialty',
        'num_lab_procedures', 'num_procedures', 'num_medications',
        'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
        'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
        'tolazamide', 'examide', 'citoglipton', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
       dtype='object'))

In [5]:
diabetes_df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [6]:
# find number of missing values
diabetes_df.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [7]:
# drop highly missing and irrelevant columns
columns_to_drop = ['encounter_id', 'weight', 'payer_code', 'medical_specialty', 'max_glu_serum']
diabetes_df.drop(columns=columns_to_drop, inplace=True)

In [8]:
# Replace missing values
diabetes_df.replace('?', np.nan, inplace=True)
diabetes_df['race'].fillna('Other', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_df['race'].fillna('Other', inplace=True)


In [9]:
admission_type_id = { 
    1 : 'Emergency',
    2 : 'Urgent',
    3 : 'Elective',
    4 : 'Newborn',
    5 : 'Not Available',
    6 : 'NULL',
    7 : 'Trauma Center',
    8 : 'Not Mapped' 
}

In [10]:
discharge_disposition_id = { 
    1 : 'Discharged to home',
    2 : 'Discharged/transferred to another short term hospital',
    3 : 'Discharged/transferred to SNF',
    4 : 'Discharged/transferred to ICF',
    5 : 'Discharged/transferred to another type of inpatient care institution',
    6 : 'Discharged/transferred to home with home health service',
    7 : 'Left AMA', 
    8 : 'Discharged/transferred to home under care of Home IV provider', 
    9 : 'Admitted as an inpatient to this hospital', 
    10 : 'Neonate discharged to another hospital for neonatal aftercare',
    11 : 'Expired',
    12 : 'Still patient or expected to return for outpatient services',
    13 : 'Hospice / home',
    14 : 'Hospice / medical facility',
    15 : 'Discharged/transferred within this institution to Medicare approved swing bed',
    16 : 'Discharged/transferred/referred another institution for outpatient services', 
    17 : 'Discharged/transferred/referred to this institution for outpatient services',
    18 : 'NULL',
    19 : 'Expired at home. Medicaid only, hospice',
    20 : 'Expired in a medical facility. Medicaid only, hospice',
    21 : 'Expired, place unknown. Medicaid only, hospice',
    22 : 'Discharged/transferred to another rehab fac including rehab units of a hospital',
    23 : 'Discharged/transferred to a long term care hospital',
    24 : 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare',
    25 : 'Not Mapped',
    26 : 'Unknown/Invalid',
    27 : 'Discharged/transferred to a federal health care facility',
    28 : 'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital',
    29 : 'Discharged/transferred to a Critical Access Hospital (CAH)' ,
    30 : 'Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere',
}

In [11]:
admission_source_id = { 
    1 : 'Physician Referral', 
    2 : 'Clinic Referral', 
    3 : 'HMO Referral',
    4 : 'Transfer from a hospital',
    5 : 'Transfer from a Skilled Nursing Facility (SNF)', 
    6 : 'Transfer from another health care facility',
    7 : 'Emergency Room',
    8 : 'Court/Law Enforcement',
    9 :  'Not Available',
    10 : 'Transfer from critial access hospital',
    11 : 'Normal Delivery', 12 : 'Premature Delivery', 13 : 'Sick Baby', 14 : 'Extramural Birth',
    15 : 'Not Available', 17 : 'NULL', 18 : 'Transfer From Another Home Health Agency',
    19 : 'Readmission to Same Home Health Agency', 20 : 'Not Mapped', 21 : 'Unknown/Invalid',
    22 : 'Transfer from hospital inpt/same fac reslt in a sep claim',
    23 : 'Born inside this hospital', 24 : 'Born outside this hospital',
    25 : 'Transfer from Ambulatory Surgery Center',
    26 : 'Transfer from Hospice'
}

In [12]:
import math

def map_icd9(code):
    """
    ICD9 diagnosis codes
    https://onlinelibrary.wiley.com/doi/10.1155/2014/781670
    """
    try:
        code = float(code)
    except:
        return 'Unknown'
    if math.isnan(code):
        return 'Unknown'
    elif (390 <= code <= 459) or (code == 785):
        return 'Circulatory'
    elif (460 <= code <= 519) or (code == 786):
        return 'Respiratory'
    elif (520 <= code <= 579) or (code == 787):
        return 'Digestive'
    elif int(code) == 250:
        return 'Diabetes'
    elif 800 <= code <= 999:
        return 'Injury'
    elif 710 <= code <= 739:
        return 'Musculoskeletal'
    elif 580 <= code <= 629:
        return 'Genitourinary'
    elif 140 <= code <= 239:
        return 'Neoplasms'
    elif 290 <= code <= 319:
        return 'MentalDisorders'
    else:
        return 'Other'

In [13]:
# Apply mapping 
for col in ['diag_1', 'diag_2', 'diag_3']:
    diabetes_df[col] = diabetes_df[col].apply(map_icd9)
    
diabetes_df['admission_type'] = diabetes_df['admission_type_id'].map(admission_type_id)
diabetes_df['discharge_disposition'] = diabetes_df['discharge_disposition_id'].map(discharge_disposition_id)
diabetes_df['admission_source'] = diabetes_df['admission_source_id'].map(admission_source_id)

In [14]:
# Simplify A1C result to binary (hba1c measured or not)
diabetes_df['hba1c_measured'] = diabetes_df['A1Cresult'].apply(lambda x: 0 if x == 'None' else 1)

In [15]:
# Since we are primarily interested in readmission (early or not), 
# we define the readmission attribute (outcome) as binary:
# 1, if the patient was readmitted < or > 30 days of discharge or
# 0, no readmission at all.
diabetes_df['readmitted_binary'] = diabetes_df['readmitted'].apply(lambda x: 0 if x ==
                                                                   'NO' else 1)

In [16]:
# drop redefined columns
diabetes_df.drop(
    columns=['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'A1Cresult', 'readmitted'], 
    inplace=True
)

In [17]:
diabetes_df.columns

Index(['patient_nbr', 'race', 'gender', 'age', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'admission_type', 'discharge_disposition',
       'admission_source', 'hba1c_measured', 'readmitted_binary'],
      dtype='object')

### Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# check for int32, int64, and float
X = diabetes_df.drop(['readmitted_binary'], axis=1)
y = diabetes_df['readmitted_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_features = [col for col in X.columns if  X[col].dtype.kind in 'iuf']
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=True, handle_unknown='ignore'), categorical_features)
    ]
)

In [20]:
# Evaluate
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit
pipeline.fit(X_train, y_train)
print("Done training, predicting..")
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))

Done training, predicting..
              precision    recall  f1-score   support

           0       0.65      0.73      0.69     10952
           1       0.63      0.55      0.59      9402

    accuracy                           0.64     20354
   macro avg       0.64      0.64      0.64     20354
weighted avg       0.64      0.64      0.64     20354

AUC: 0.6989590990851146
