In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw CSV into the working DataFrame, then display its
# (rows, columns) dimensions as the cell output.
data_path = "diabetic_data.csv"
df = pd.read_csv(data_path)
df.shape

(101766, 50)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# Drop the identifier columns.
# Reassigning (instead of `inplace=True`) together with `errors="ignore"`
# keeps this cell idempotent: re-running it after the columns are already
# gone is a no-op rather than a KeyError.
identifier_cols = ['encounter_id', 'patient_nbr']
df = df.drop(columns=identifier_cols, errors="ignore")

In [6]:
df.shape  # started with 50 columns; 48 remain after dropping the two identifier columns

(101766, 48)

##### Redefining the target variable `readmitted` as a binary label: positive class (`<30`) vs. negative class (`>30`, `NO`), instead of a multiclass label

In [7]:
# Collapse `readmitted` into a binary target: 1 where the value is '<30',
# 0 for every other value.
# The membership guard makes the cell safe to re-run: once `readmitted` has
# been dropped, a second execution would otherwise raise a KeyError.
if 'readmitted' in df.columns:
    df['readmitted_binary'] = (df['readmitted'] == '<30').astype(int)
    # Remove the original multiclass column so only the binary target remains.
    df = df.drop(columns=['readmitted'])

In [8]:
# Class balance of the binary target; the output below shows the positive
# class (1) is the minority at roughly 11% of rows.
df['readmitted_binary'].value_counts()

readmitted_binary
0    90409
1    11357
Name: count, dtype: int64

In [9]:
# Column count is unchanged at 48: `readmitted_binary` was added and
# `readmitted` was dropped.
df.shape

(101766, 48)

In [10]:
# Count, per column, how many cells hold the '?' placeholder used for missing
# values, and list the columns from most to least affected.
# Note: only the literal '?' string is counted; real NaN cells are not.
placeholder_counts = df.eq('?').sum()
placeholder_counts.sort_values(ascending=False)

weight                      98569
medical_specialty           49949
payer_code                  40256
race                         2273
diag_3                       1423
diag_2                        358
diag_1                         21
tolazamide                      0
glyburide                       0
tolbutamide                     0
pioglitazone                    0
rosiglitazone                   0
acarbose                        0
miglitol                        0
troglitazone                    0
citoglipton                     0
examide                         0
acetohexamide                   0
insulin                         0
glyburide-metformin             0
glipizide-metformin             0
glimepiride-pioglitazone        0
metformin-rosiglitazone         0
metformin-pioglitazone          0
change                          0
diabetesMed                     0
glipizide                       0
nateglinide                     0
glimepiride                     0
chlorpropamide

In [11]:
# Drop the three columns with the largest share of '?' placeholders
# (per the counts above, out of 101,766 rows: weight ~97%,
# medical_specialty ~49%, payer_code ~40%).

high_missing_val_cols = ['weight', 'medical_specialty', 'payer_code']

# Reassign with errors="ignore" rather than dropping in place so that
# re-running this cell does not raise a KeyError on already-removed columns.
df = df.drop(columns=high_missing_val_cols, errors="ignore")

In [12]:
df.shape   # dropped 3 more columns (48 -> 45)

(101766, 45)

In [13]:
# Inspect five random rows. A fixed `random_state` makes the sample (and so
# this cell's output) reproducible across kernel restarts; the seed value
# itself is arbitrary.
df.sample(5, random_state=42)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_binary
52347,Caucasian,Male,[60-70),3,1,1,8,43,6,32,...,No,Up,No,No,No,No,No,Ch,Yes,0
54172,Asian,Female,[70-80),3,3,1,6,33,4,16,...,No,Up,No,No,No,No,No,Ch,Yes,1
95361,?,Male,[70-80),3,3,1,4,48,0,12,...,No,Steady,No,No,No,No,No,No,Yes,1
36063,?,Female,[60-70),2,3,4,14,32,0,14,...,No,No,No,No,No,No,No,Ch,Yes,0
464,Caucasian,Female,[60-70),6,25,4,9,70,6,33,...,No,Steady,No,No,No,No,No,Ch,Yes,0


##### What do `diag_1`, `diag_2`, and `diag_3` represent?

- `diag_1` represents the **primary diagnosis**, which is the main clinical reason for the patient’s hospital admission.
- `diag_2` and `diag_3` represent **secondary diagnoses**, capturing comorbid conditions identified during the hospital stay.

In this phase, we use only `diag_1` as a diagnosis feature because it directly reflects the primary reason for admission (`diag_2` and `diag_3` are still present in the DataFrame at this point).  
Since diagnosis codes are ICD-9 codes with very high cardinality (thousands of possible values), we postpone deeper diagnosis processing (such as grouping or mapping) to later iterations of the project.


In [14]:
# List every column still present after the drops above.
print(df.columns)  # all the columns we're left with

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_binary'],
      dtype='object')


##### Demographics
- `race`
- `gender`
- `age`
##### Encounter context (admission and discharge information)
- `admission_type_id`
- `admission_source_id`
- `discharge_disposition_id`
##### Prior healthcare utilization
- `number_outpatient`
- `number_emergency`
- `number_inpatient`
##### Hospital stay summary
- `time_in_hospital`
- `num_lab_procedures`
- `num_procedures`
- `num_medications`
##### Diagnosis information
- `diag_1`
##### Diabetes medication signals
- `change`
- `diabetesMed`
