# ML-Training: Diabetes Readmission Prediction

This notebook demonstrates a machine learning workflow for predicting 30-day hospital readmissions for diabetes patients. The steps include data loading, preprocessing, model training, evaluation, and experiment tracking.

**STEP 1: Load Gold data**

Load the prepared gold-level diabetes readmission dataset from Unity Catalog.

In [0]:
# STEP 1: Load Gold data
gold_df = spark.table("diabetes_readmissions.gold_diabetes_ready")

# Check schema
gold_df.printSchema()

root
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- admission_type_id: integer (nullable = true)
 |-- discharge_disposition_id: integer (nullable = true)
 |-- admission_source_id: integer (nullable = true)
 |-- time_in_hospital: integer (nullable = true)
 |-- payer_code: string (nullable = true)
 |-- medical_specialty: string (nullable = true)
 |-- num_lab_procedures: integer (nullable = true)
 |-- num_procedures: integer (nullable = true)
 |-- num_medications: integer (nullable = true)
 |-- number_outpatient: integer (nullable = true)
 |-- number_emergency: integer (nullable = true)
 |-- number_inpatient: integer (nullable = true)
 |-- diag_1: string (nullable = true)
 |-- diag_2: string (nullable = true)
 |-- diag_3: string (nullable = true)
 |-- number_diagnoses: integer (nullable = true)
 |-- max_glu_serum: string (nullable = true)
 |-- A1Cresult: string (nullable = true)
 |-- m

In [0]:
# STEP 2: Sample data to avoid memory issues
# Sample a fraction of the data to reduce memory usage for local processing.
gold_sample = gold_df.sample(fraction=0.25, seed=42)

print("Sample row count:", gold_sample.count())

Sample row count: 25369


In [0]:
# STEP 3: Convert to Pandas
# Convert the sampled Spark DataFrame to a Pandas DataFrame for scikit-learn processing.
pdf = gold_sample.toPandas()

pdf.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_30
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0
1,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,0
2,Caucasian,Male,[40-50),,1,1,7,1,,,51,0,8,0,0,0,197.0,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,0
3,Caucasian,Male,[60-70),,3,1,2,4,,,70,1,21,0,0,0,414.0,411.0,V45,7,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,0
4,Caucasian,Male,[70-80),,1,1,7,5,,,73,0,12,0,0,0,428.0,492.0,250,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,0


In [0]:
# STEP 4: Define X and y
# Separate the target variable (`readmitted_30`) and features for modeling.
# y = what we want to predict
y = pdf["readmitted_30"]
# X = all other columns (patient + hospital + treatment info)
X = pdf.drop(columns=["readmitted_30"])

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (25369, 47)
y shape: (25369,)


In [0]:
# STEP 5: Encode categorical columns
# Encode categorical columns in X using LabelEncoder for model compatibility.
from sklearn.preprocessing import LabelEncoder

for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].astype(str)
        X[col] = LabelEncoder().fit_transform(X[col])

In [0]:
# STEP 6.5: Split and scale features
# Split data into train and test sets, then scale features using StandardScaler.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [0]:
# STEP 6.5: Scale features
# Scale features using StandardScaler (redundant if already done above).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [0]:
# STEP 7: Train Logistic Regression (scaled)
# Train a logistic regression model using the scaled features.
from sklearn.linear_model import LogisticRegression

lr_scaled = LogisticRegression(
    max_iter=1000,
    solver="lbfgs"
)

lr_scaled.fit(X_train_scaled, y_train)

In [0]:
# STEP 8: Evaluate model
# Evaluate the trained model using ROC AUC score on the test set.
from sklearn.metrics import roc_auc_score

y_prob = lr_scaled.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, y_prob)

print("AUC:", auc)

AUC: 0.641453647480148


In [0]:
# STEP 9: Track with MLflow
# Log the model and metrics to MLflow for experiment tracking.
import mlflow
import mlflow.sklearn

mlflow.set_experiment("/Shared/diabetes_readmission_lr")

with mlflow.start_run(run_name="Logistic_Regression_Baseline"):
    mlflow.log_metric("AUC", auc)
    mlflow.sklearn.log_model(
        lr_scaled,
        "logistic_regression_model",
        input_example=X_train.iloc[:5]
    )
    print("MLflow run logged successfully")



MLflow run logged successfully


In [0]:
# STEP 10: Interpret coefficients
# Display the most important features based on logistic regression coefficients.
import pandas as pd

feature_importance_scaled = pd.DataFrame({
    "feature": X.columns,
    "coefficient": lr_scaled.coef_[0]
}).sort_values(by="coefficient", ascending=False)

feature_importance_scaled.head(10)

Unnamed: 0,feature,coefficient
15,number_inpatient,0.317947
46,diabetesMed,0.127849
5,discharge_disposition_id,0.095553
2,age,0.089377
14,number_emergency,0.081462
19,number_diagnoses,0.061756
7,time_in_hospital,0.055314
12,num_medications,0.036112
1,gender,0.033833
8,payer_code,0.032746
