In [34]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import json

In [2]:
df = pd.read_csv('diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
df.shape

(101766, 50)

 --- 2. Initial Column Cleaning and Dropping ---

 Drop columns with >35% missing data (weight, payer_code, medical_specialty)

 Drop unique identifiers (encounter_id, patient_nbr)

 patient_nbr is dropped for simplicity. A more advanced model
 
 would group by patient to prevent data leakage.

In [4]:
df = df.drop(columns=[
    'weight', 
    'payer_code', 
    'medical_specialty', 
    'encounter_id', 
    'patient_nbr'
])
print(f"Dropped columns. Shape after dropping: {df.shape}")

Dropped columns. Shape after dropping: (101766, 45)


 --- 3. Row Cleaning ---

 Drop 3 rows where gender is 'Unknown/Invalid'

In [5]:
df = df[df['gender'] != 'Unknown/Invalid']
# Drop rows with missing diagnosis codes (a small fraction)
df = df.dropna(subset=['diag_1', 'diag_2', 'diag_3'])
print(f"Dropped rows with missing data. Shape after dropping: {df.shape}")

Dropped rows with missing data. Shape after dropping: (101763, 45)


 --- 4. Fill Missing Categorical Data ---
 Fill missing 'race' with 'Unknown' category

In [6]:
df['race'] = df['race'].fillna('Unknown')
print("Filled missing 'race' values.")

Filled missing 'race' values.


--- 5. Feature Engineering (Diagnosis Codes) ---

Group ICD-9 codes into broader categories

In [7]:
def group_diag(diag_code):
    try:
        diag_code = float(diag_code)
        if 390 <= diag_code <= 459 or diag_code == 785:
            return 'Circulatory'
        elif 460 <= diag_code <= 519 or diag_code == 786:
            return 'Respiratory'
        elif 520 <= diag_code <= 579 or diag_code == 787:
            return 'Digestive'
        elif 250 <= diag_code < 251:
            return 'Diabetes'
        elif 800 <= diag_code <= 999:
            return 'Injury'
        elif 710 <= diag_code <= 739:
            return 'Musculoskeletal'
        elif 140 <= diag_code <= 239:
            return 'Neoplasms'
        else:
            return 'Other'
    except ValueError:
        # Handle non-numeric codes like 'E909' or 'V57'
        if 'E' in str(diag_code) or 'V' in str(diag_code):
            return 'Other'
        else:
            return 'Other'

In [8]:
df['diag_1_group'] = df['diag_1'].apply(group_diag)
df['diag_2_group'] = df['diag_2'].apply(group_diag)
df['diag_3_group'] = df['diag_3'].apply(group_diag)

In [9]:
# Drop the original columns
df = df.drop(columns=['diag_1', 'diag_2', 'diag_3'])
print("Engineered diagnosis codes into groups.")

Engineered diagnosis codes into groups.


 --- 6. Simplify Binary and Medication Columns ---
 
 Simplify binary columns

In [10]:
df['change'] = df['change'].map({'Ch': 1, 'No': 0})
df['diabetesMed'] = df['diabetesMed'].map({'Yes': 1, 'No': 0})

In [11]:
# Find medication columns (24 total)
med_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'
]

For medications, map 'No' to 0, and any other value ('Steady', 'Up', 'Down') to 1

This simplifies the problem to "Is the patient on this med?"

In [12]:
for col in med_cols:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 0 if x == 'No' else 1)
print("Simplified medication columns to binary (On/Off).")

Simplified medication columns to binary (On/Off).


 --- 7. Drop Zero-Variance Columns ---


 Check for columns with only one unique value

In [13]:
cols_to_drop = []
for col in df.columns:
    if df[col].nunique() == 1:
        cols_to_drop.append(col)
if cols_to_drop:
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped zero-variance columns: {cols_to_drop}")

Dropped zero-variance columns: ['examide', 'citoglipton']


In [14]:
# --- 8. Define Features (X) and Target (y) ---
X = df.drop('readmitted', axis=1)
y = df['readmitted']

 --- 9. Split Data (Stratified) ---
 
 Stratify by y is crucial for imbalanced classes

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Data split. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Data split. Train shape: (81410, 42), Test shape: (20353, 42)


 --- 10. Create Preprocessing Pipeline ---
 
 Identify numerical and categorical columns

In [None]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64', 'int32']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Its job is to automatically read all the column names in your X_train data and divide them into two separate lists:

# numerical_cols: This line tells pandas, "Go through all the columns in X_train and give me a list of only the columns that store numbers (like int64, float64, or int32)."

# categorical_cols: This line tells pandas, "Now, give me a list of all columns that store text." (In pandas, text columns are stored as the object data type).

# Why Do This?
# This is a crucial preparation step for your ColumnTransformer. You can't treat numbers and text the same way.

# Numerical columns (like time_in_hospital) need to be scaled (using StandardScaler).

# Categorical columns (like race or age) need to be one-hot encoded (using OneHotEncoder).

# By creating these two separate lists (numerical_cols and categorical_cols), you make it easy to tell your ColumnTransformer exactly which pipeline to apply to which columns.

In [None]:
# The engineered diag columns are 'object' type, so they will be in categorical_cols
# The simplified binary/med columns are 'int', so they will be in numerical_cols

# We need to make sure all columns are accounted for and in the right list.
# Let's be explicit:

In [35]:
# Find all columns that are not the target or the engineered/simplified ones
original_num_cols = [
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses'
]

In [18]:
original_cat_cols = [
    'race', 'gender', 'age', 'max_glu_serum', 'A1Cresult'
]

In [19]:
# Get the final list of columns that will be scaled (numerical)
num_cols_final = original_num_cols + med_cols + ['change', 'diabetesMed']

In [20]:
# Filter out any dropped columns (like examide, citoglipton)
num_cols_final = [col for col in num_cols_final if col in X_train.columns]

In [21]:
# Get the final list of columns that will be one-hot encoded (categorical)
cat_cols_final = original_cat_cols + ['diag_1_group', 'diag_2_group', 'diag_3_group']
cat_cols_final = [col for col in cat_cols_final if col in X_train.columns]

In [None]:
# Create the two sets to compare
actual_columns = set(X_train.columns)
my_columns = set(num_cols_final) | set(cat_cols_final) 

# Check if the sets are NOT equal
if actual_columns != my_columns:
    print("Error: Column mismatch!")
    print("Columns in data but not in lists:", actual_columns - my_columns)
    print("Columns in lists but not in data:", my_columns - actual_columns)
    # You could raise an error to stop the script, similar to assert
    raise ValueError("Column mismatch")
else:
    # If they are equal, print a success message and continue
    print("Column check passed: All features are accounted for.")

In [23]:
print(f"\nIdentified {len(num_cols_final)} numerical features.")
print(f"Identified {len(cat_cols_final)} categorical features.")


Identified 34 numerical features.
Identified 8 categorical features.


In [24]:
# Create pipelines
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


In [None]:
# Create the master preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols_final),
        ('cat', categorical_transformer, cat_cols_final)
    ],
    remainder='passthrough' 
)

In [None]:

y_encoder = LabelEncoder()

In [None]:

print("Fitting preprocessor on X_train...")
# Fit the preprocessor on X_train
X_train_processed = preprocessor.fit_transform(X_train)

# Only transform X_test
X_test_processed = preprocessor.transform(X_test)

Fitting preprocessor on X_train...
Transforming X_test...


In [None]:
print("Fitting target encoder on y_train...")
# Fit and transform y_train
y_train_processed = y_encoder.fit_transform(y_train)

# Only transform y_test
y_test_processed = y_encoder.transform(y_test)

Fitting target encoder on y_train...
Transforming y_test...


In [29]:
# Get feature names from the ColumnTransformer
feature_names = preprocessor.get_feature_names_out()

In [30]:
# Convert processed arrays back to DataFrames for saving
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)
y_train_df = pd.DataFrame(y_train_processed, columns=['readmitted_encoded'])
y_test_df = pd.DataFrame(y_test_processed, columns=['readmitted_encoded'])


In [31]:
# --- 13. Save Processed Data and Encoders ---
print("Saving processed data to CSV files...")
X_train_df.to_csv('X_train_processed.csv', index=False)
X_test_df.to_csv('X_test_processed.csv', index=False)
y_train_df.to_csv('y_train_encoded.csv', index=False)
y_test_df.to_csv('y_test_encoded.csv', index=False)


Saving processed data to CSV files...


In [32]:
print("Saving fitted preprocessor and encoders...")
joblib.dump(preprocessor, 'preprocessor.joblib')
joblib.dump(y_encoder, 'target_encoder.joblib')


Saving fitted preprocessor and encoders...


['target_encoder.joblib']

In [33]:
# Also save the class mapping
class_mapping = {index: label for index, label in enumerate(y_encoder.classes_)}
print(f"Target variable encoding: {class_mapping}")
with open('target_class_mapping.json', 'w') as f:
    json.dump(class_mapping, f)

Target variable encoding: {0: '<30', 1: '>30', 2: 'NO'}
