In [None]:
# %% [markdown]
# # Inspect PKL Files for Diabetes Project
# This notebook loads every `.pkl` artifact the project needs and prints a summary of each.
# Use it to verify their contents before relying on them for evaluation (Member 5's work).

# %%
import joblib
import pandas as pd
import numpy as np

# %% [markdown]
# ## 1️⃣ Load Preprocessing Pipeline
# This shows which columns are transformed and what steps are applied.

# %%
# Path to the pickled preprocessing pipeline, relative to the working directory.
pipeline_path = "src/preprocessing_pipeline.pkl"

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load artifacts that come from a trusted source.
pipeline = joblib.load(pipeline_path)

# Confirmation banner, a blank line, then the pipeline's printed form.
print("✅ Preprocessing Pipeline Loaded!\n", pipeline, sep="\n")

# %% [markdown]
# ## 2️⃣ Load Trained Models
# This prints a summary of each model.

# %%
# Display name -> path of the pickled model file to inspect.
model_files = {
    "Logistic Regression": "models/logistic_regression_model.pkl",
    "Random Forest": "models/random_forest_model.pkl",
    "SVM": "models/svm_model.pkl",
    "XGBoost": "models/xgboost_model.pkl"
}

# Loaded model objects, keyed by the same display names as `model_files`.
models = {}

for name in model_files:
    path = model_files[name]
    # NOTE: joblib.load unpickles the file, and unpickling can execute
    # arbitrary code -- only load artifacts that come from a trusted source.
    loaded_model = joblib.load(path)
    models[name] = loaded_model
    # Banner first, then the model's printed form, one model at a time.
    print(f"\n✅ {name} Model Loaded:")
    print(loaded_model)

# %% [markdown]
# ## 3️⃣ Load Processed Data
# Check keys and shapes to verify training/validation splits.

# %%
# Path to the pickled processed-data container, relative to the working directory.
data_path = "data/processed_diabetes_data.pkl"

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load artifacts that come from a trusted source.
data = joblib.load(data_path)

print("\n✅ Processed Data Loaded!\n")
print("Keys in processed data:", data.keys())

# Report the shape of every entry. Entries that expose no `.shape` attribute
# (e.g. a plain list or a scalar) are summarized by type and length instead of
# raising AttributeError and aborting the rest of the report.
for key, value in data.items():
    if hasattr(value, "shape"):
        print(f"{key} shape: {value.shape}")
    else:
        length = len(value) if hasattr(value, "__len__") else "n/a"
        print(f"{key}: {type(value).__name__} has no .shape (length: {length})")

# %% [markdown]
# ## 4️⃣ Optional: Preview a few rows
# Shows the first few samples of X_train and y_train as a quick sanity check.

# %%
# Pull the training features and labels out of the loaded data container.
X_train = data['X_train']
y_train = data['y_train']

# Print a heading followed by the first five entries of each object.
# Slicing happens inside the loop so output appears in the same order as
# printing each pair one after the other.
for heading, samples in (
    ("\nFirst 5 rows of X_train:", X_train),
    ("\nFirst 5 labels of y_train:", y_train),
):
    print(heading)
    print(samples[:5])
