In [50]:
from pathlib import Path
import pandas as pd

project_root = Path(".")  # repo root
data_raw_path = project_root / "data" / "raw" / "diabetic_data.csv"
data_processed_dir = project_root / "data" / "processed"
data_processed_dir.mkdir(parents=True, exist_ok=True)

print("Raw path:", data_raw_path)
print("Exists:", data_raw_path.exists())

df = pd.read_csv(data_raw_path)
print("Shape:", df.shape)
df.head()

Raw path: data/raw/diabetic_data.csv
Exists: True
Shape: (101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


Clean columns (drop + fill)

In [51]:
import pandas as pd
import numpy as np

print(df.shape)
df.columns

(101766, 50)


Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [52]:
# Drop columns with too many missing values
cols_to_drop = ["weight", "payer_code"]
df = df.drop(columns=cols_to_drop)

# Fill medical_specialty and race
df["medical_specialty"] = df["medical_specialty"].fillna("Unknown")
df["race"] = df["race"].fillna("Unknown")

print("Remaining columns:", len(df.columns))
df.isnull().sum().sort_values(ascending=False).head()

Remaining columns: 48


max_glu_serum    96420
A1Cresult        84748
encounter_id         0
troglitazone         0
acetohexamide        0
dtype: int64

Create binary target readmitted_30

In [53]:
# if readmitted within 30 days, else 0
df["readmitted_30"] = (df["readmitted"] == "<30").astype(int)

print(df["readmitted"].value_counts())
print(df["readmitted_30"].value_counts(normalize=True))

readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64
readmitted_30
0    0.888401
1    0.111599
Name: proportion, dtype: float64


Age to numeric

In [54]:
age_mapping = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25,
    "[30-40)": 35, "[40-50)": 45, "[50-60)": 55,
    "[60-70)": 65, "[70-80)": 75, "[80-90)": 85, "[90-100)": 95
}
df["age_numeric"] = df["age"].map(age_mapping)

Define feature lists

In [55]:
target_col = "readmitted_30"
id_cols = ["encounter_id", "patient_nbr"]

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in id_cols + [target_col]]

categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

print("Numeric cols:", len(numeric_cols))
print("Categorical cols:", len(categorical_cols))

Numeric cols: 12
Categorical cols: 35


One-hot encode categoricals

In [56]:
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("After encoding:", df_encoded.shape)
df_encoded.head()

After encoding: (101766, 2412)


Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted_>30,readmitted_NO
0,2278392,8222157,6,25,1,1,41,0,1,0,...,False,False,False,False,False,False,True,False,False,True
1,149190,55629189,1,1,7,3,59,0,18,0,...,False,False,False,False,False,False,False,True,True,False
2,64410,86047875,1,1,7,2,11,5,13,2,...,False,False,False,False,False,False,True,True,False,True
3,500364,82442376,1,1,7,2,44,1,16,0,...,False,False,False,False,False,False,False,True,False,True
4,16680,42519267,1,1,7,1,51,0,8,0,...,False,False,False,False,False,False,False,True,False,True


Train / val / test split (stratified)

In [57]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=[target_col] + id_cols)
y = df_encoded[target_col]

# 15% test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# From remaining 85%, take ~17.65% as val → overall ~15%
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape, "Pos rate:", y_train.mean())
print("Val:  ", X_val.shape, "Pos rate:", y_val.mean())
print("Test: ", X_test.shape, "Pos rate:", y_test.mean())

Train: (71233, 2409) Pos rate: 0.11159153763003103
Val:   (15268, 2409) Pos rate: 0.11160597327744302
Test:  (15265, 2409) Pos rate: 0.11162790697674418


Handle class imbalance with SMOTE (train only)

In [58]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("Original train dist:")
print(y_train.value_counts(normalize=True))

print("\nBalanced train dist:")
print(y_train_bal.value_counts(normalize=True))

print("Balanced train shape:", X_train_bal.shape)

Original train dist:
readmitted_30
0    0.888408
1    0.111592
Name: proportion, dtype: float64

Balanced train dist:
readmitted_30
0    0.5
1    0.5
Name: proportion, dtype: float64
Balanced train shape: (126568, 2409)


Save processed splits

In [59]:
from pathlib import Path

data_processed_dir = Path("data") / "processed"
data_processed_dir.mkdir(parents=True, exist_ok=True)

X_train_bal.to_csv(data_processed_dir / "X_train_bal.csv", index=False)
y_train_bal.to_csv(data_processed_dir / "y_train_bal.csv", index=False)
X_val.to_csv(data_processed_dir / "X_val.csv", index=False)
y_val.to_csv(data_processed_dir / "y_val.csv", index=False)
X_test.to_csv(data_processed_dir / "X_test.csv", index=False)
y_test.to_csv(data_processed_dir / "y_test.csv", index=False)

print("Saved processed datasets in data/processed/")

Saved processed datasets in data/processed/
