In [None]:
# 01 — Data Cleaning & Preprocessing Pipeline

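This notebook prepares the Diabetes dataset (`diabetic_data.csv`) for modeling:
- mark missing values (recorded as `?` in the raw file) as NaN
- create the binary target variable (`readmit_30`)
- drop columns that are not features
- encode categorical variables
- scale numeric variables
- split the data into train and test sets
- export the processed data for Phase 3 modeling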



In [12]:
!pip install scikit-learn





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\HP\AppData\Local\Programs\Python\Python314\python.exe -m pip install --upgrade pip


In [13]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn's global plotting style to "whitegrid".
sns.set(style="whitegrid")


In [14]:
# === Load Raw Dataset ===
# Read the raw CSV (path is relative to the notebook's working directory)
# into `df`, then preview the first five rows.
raw_csv_path = "diabetic_data.csv"
df = pd.read_csv(raw_csv_path)
df.head(5)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [15]:
# === Replace '?' with NaN ===
# Every cell equal to "?" becomes NaN so that pandas treats it as missing.
# `df` is rebound to the returned frame instead of being modified in place.
df = df.replace(to_replace="?", value=np.nan)


In [16]:
# === Binary Target: 30-day Readmission ===
# Only "<30" is the positive class; "NO" and ">30" are both mapped to 0.
readmit_codes = {"<30": 1, "NO": 0, ">30": 0}
df["readmit_30"] = df["readmitted"].map(readmit_codes)

# Show the class balance of the new target.
df["readmit_30"].value_counts()


readmit_30
0    90409
1    11357
Name: count, dtype: int64

In [17]:
# === Drop columns that are not features ===
# `readmitted` has already been collapsed into `readmit_30`; `encounter_id`
# and `patient_nbr` are excluded from the feature set.
# NOTE(review): if one patient_nbr can appear in several encounters, a random
# row-level split may put the same patient in both train and test -- confirm
# whether a grouped split is needed before patient_nbr is discarded.
drop_cols = ['encounter_id', 'patient_nbr', 'readmitted']
df = df.drop(columns=drop_cols)


In [18]:
# Separate the target from the features.
X = df.drop(columns="readmit_30")
y = df["readmit_30"]

# Group the feature columns by dtype. Numeric columns are scaled later; every
# other column is treated as categorical. The categorical group is defined as
# "everything that is not numeric" (rather than only `object` dtype) so that
# each feature lands in exactly one group. A column left out of both lists
# would be silently dropped, because ColumnTransformer drops unlisted columns
# by default.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()


In [19]:
# === One-Hot Encode categorical, scale numeric ===
# Numeric columns are standardised and categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising an error.
# (StandardScaler, OneHotEncoder, ColumnTransformer and train_test_split are
# imported in the imports cell at the top of the notebook.)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# Fit on the training rows only, so that test-set statistics (scaler means and
# variances, encoder category lists) do not leak into the features.
# The indices are drawn with the same test_size / random_state / stratify
# arguments as the "Split Data" cell, so they select exactly the rows that
# become X_train there. Keep the two sets of arguments in sync.
train_idx, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)
preprocessor.fit(X.iloc[train_idx])

# Transform every row with the train-fitted preprocessor. Row order is
# unchanged, so X_processed still lines up with y for the later split.
X_processed = preprocessor.transform(X)


In [20]:
# === Split Data ===
# Hold out 20% of the rows as a test set, stratified on the target so both
# parts keep the same class ratio; the seed is fixed for reproducibility.
split_parts = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts


In [21]:
# Print the numeric and categorical column lists and the target's name.
for heading, shown in (
    ("Numeric:", num_cols),
    ("Categorical:", cat_cols),
    ("Target:", y.name),
):
    print(heading, shown)


Numeric: ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
Categorical: ['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']
Target: readmit_30


In [22]:
# === Export for Phase 3 ===

import os
from scipy import sparse

# Make sure the output directory exists (no error if it already does).
os.makedirs("data/processed", exist_ok=True)

# Feature matrices are coerced to CSR format before saving.
X_train_sparse, X_test_sparse = (
    sparse.csr_matrix(part) for part in (X_train, X_test)
)

# Write the feature matrices as .npz files via scipy.
for npz_path, matrix in (
    ("data/processed/X_train.npz", X_train_sparse),
    ("data/processed/X_test.npz", X_test_sparse),
):
    sparse.save_npz(npz_path, matrix)

# Write the label vectors as .npy files via numpy.
for npy_path, labels in (
    ("data/processed/y_train.npy", y_train),
    ("data/processed/y_test.npy", y_test),
):
    np.save(npy_path, labels)
