# 02 - Feature Engineering: Healthcare Readmission

In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv("diabetic_data.csv")
df = df.replace('?', np.nan)


In [2]:

target_candidates = [c for c in df.columns if 'readmit' in c.lower() or 'target' in c.lower()]
target_col = target_candidates[0] if target_candidates else None
if target_col:
    print("Detected target:", target_col)
else:
    raise ValueError("❌ Target column not found — please specify manually.")


Detected target: readmitted


In [3]:

X = df.drop(columns=[target_col], errors='ignore')
y = df[target_col].map({'NO':0, '>30':1, '<30':1}).fillna(0).astype(int)
X = X.drop(columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty'], errors='ignore')


In [4]:

categorical_cols = [c for c in X.columns if X[c].dtype == 'object']
numerical_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
print(f"Categorical cols: {len(categorical_cols)} | Numerical cols: {len(numerical_cols)}")


Categorical cols: 33 | Numerical cols: 11


In [5]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
]) if len(numerical_cols) > 0 else 'drop'

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
]) if len(categorical_cols) > 0 else 'drop'

transformers = []
if len(numerical_cols) > 0:
    transformers.append(('num', numeric_transformer, numerical_cols))
if len(categorical_cols) > 0:
    transformers.append(('cat', categorical_transformer, categorical_cols))

preprocessor = ColumnTransformer(transformers=transformers)
print("✅ Preprocessor created with valid columns.")


✅ Preprocessor created with valid columns.


In [6]:

X_transformed = preprocessor.fit_transform(X)
print("Transformed feature matrix shape:", X_transformed.shape)

import joblib
joblib.dump(preprocessor, "feature_preprocessor.pkl")
print("✅ Feature preprocessor saved as feature_preprocessor.pkl")


Transformed feature matrix shape: (101766, 2363)
✅ Feature preprocessor saved as feature_preprocessor.pkl


In [7]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, rf.predict(X_test)))


Accuracy: 0.6387442271789329
