In [1]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

# Notebook-wide display defaults: large canvas for figures, wide DataFrame previews
pd.options.display.max_columns = 300
plt.rcParams.update({"figure.figsize": (20, 10)})

# Absolute form of the current working directory; later cells build file paths from it
ROOT = Path(".").resolve()
print("Working in:", ROOT)

Working in: C:\Uni\T3\SIT764\New folder


In [2]:
# Step 2: Load RF & encoders, derive class names

# NOTE: joblib.load unpickles arbitrary Python objects; only load files you trust.
rf = joblib.load("obesity_model.pkl")
encoders = joblib.load("encoders.pkl")

# Build class_names list in index order from the encoder mapping
label_map = encoders.get("nobeyesdad", {})  # {label: index}
if not label_map:
    raise RuntimeError("encoders.pkl must contain key 'nobeyesdad' mapping labels → indices (0..6).")

# Invert to index → label. Two labels sharing one index would silently
# overwrite each other, so detect that explicitly.
index_to_label = {int(idx): label for label, idx in label_map.items()}
if len(index_to_label) != len(label_map):
    raise RuntimeError(
        f"encoders.pkl 'nobeyesdad' maps several labels to the same index: {label_map}"
    )

# Indices must be exactly 0..N-1: a gap would leave a None placeholder in
# class_names and a negative index would wrap around to the end of the list.
n_classes = len(index_to_label)
if set(index_to_label) != set(range(n_classes)):
    raise RuntimeError(
        f"encoders.pkl 'nobeyesdad' indices must be contiguous 0..{n_classes - 1}, "
        f"got {sorted(index_to_label)}"
    )

class_names = [index_to_label[i] for i in range(n_classes)]

print("Class names (by index):", class_names)


Class names (by index): ['Insufficient Weight', 'Normal Weight', 'Obesity Type_I', 'Obesity Type_II', 'Obesity Type_III', 'Overweight Level_I', 'Overweight Level_II']


In [3]:
# Load preprocessed dataset and build X, y

# Adjust filename if yours differs
DATA_FILE = "Final_combined_dataset.csv"
# Use one resolved path for both the existence check and the read, so the two
# cannot disagree if the working directory changes after ROOT was computed.
data_path = ROOT / DATA_FILE
if not data_path.exists():
    raise FileNotFoundError(
        f"Could not find {DATA_FILE}. Place your preprocessed dataset (with NObeyesdad 0..6) in this folder."
    )

df = pd.read_csv(data_path)
print("Data shape:", df.shape)
print("Columns:", df.columns.tolist()[:30], "...")

# Target detection
TARGET_CANDIDATES = ["NObeyesdad", "nobeyesdad"]
target_col = next((c for c in TARGET_CANDIDATES if c in df.columns), None)
if target_col is None:
    raise ValueError("Target column 'NObeyesdad' (or 'nobeyesdad') not found in the dataset.")

# Prefer RF's feature order; fall back to df columns minus target
rf_feature_names = getattr(rf, "feature_names_in_", None)
if rf_feature_names is not None:
    feature_names = list(rf_feature_names)
else:
    # Derive from df: all non-target columns
    feature_names = [c for c in df.columns if c != target_col]

# Validate presence
missing = set(feature_names) - set(df.columns)
if missing:
    raise ValueError(f"Dataset is missing features expected by RF: {sorted(missing)}")

# Build y; fail with a clear message instead of an opaque integer-cast error
n_missing_target = int(df[target_col].isna().sum())
if n_missing_target:
    raise ValueError(
        f"Target column '{target_col}' has {n_missing_target} missing value(s); cannot cast to int."
    )
y = df[target_col].astype(int).copy()

# Build X: booleans → int, then everything → float (trees accept float)
X = df[feature_names].copy()
bool_cols = X.select_dtypes(include="bool").columns
X = X.astype({col: int for col in bool_cols})
X = X.apply(pd.to_numeric, errors="coerce")

# Zero-filling is kept as a best-effort default, but report how many cells it
# touches so silent data problems are visible in the notebook output.
n_filled = int(X.isna().sum().sum())
if n_filled:
    print(f"Warning: {n_filled} missing/non-numeric feature value(s) replaced with 0.0")
X = X.fillna(0.0).astype(float)

print("X shape:", X.shape, " y shape:", y.shape)


Data shape: (3213, 24)
Columns: ['age', 'height', 'weight', 'family_history_with_overweight', 'favc', 'fcvc', 'ncp', 'smoke', 'ch2o', 'scc', 'faf', 'tue', 'gender_Male', 'caec_Always', 'caec_Frequently', 'caec_Sometimes', 'calc_Frequently', 'calc_Sometimes', 'calc_no', 'mtrans_Bike', 'mtrans_Motorbike', 'mtrans_Public_Transportation', 'mtrans_Walking', 'nobeyesdad'] ...
X shape: (3213, 19)  y shape: (3213,)


In [4]:
# The RF's own predictions (not the ground truth) are the labels the surrogate is trained to reproduce
rf_preds = rf.predict(X.loc[:, feature_names])
print("RF preds sample:", rf_preds[:12])

RF preds sample: [5 1 1 2 6 5 1 1 1 1 0 1]


In [5]:
# Train the surrogate Decision Tree

surrogate = DecisionTreeClassifier(
    max_depth=4,          # keep readable (tune 3–5)
    min_samples_leaf=20,  # avoid tiny leaves (tune as needed)
    random_state=42
)
X_fit = X[feature_names]
surrogate.fit(X_fit, rf_preds)

# Predict once and reuse the result for every metric below
surrogate_preds = surrogate.predict(X_fit)

# Fidelity: agreement between surrogate and RF.
# This is an in-sample figure (same rows the tree was fitted on), so it is optimistic.
fidelity = (surrogate_preds == rf_preds).mean()
print(f"Surrogate fidelity to RF: {fidelity:.4f}")

# Held-out fidelity: cross_val_score fits clones of the estimator on each fold,
# so the fitted `surrogate` above is left untouched.
cv_fidelity = cross_val_score(surrogate, X_fit, rf_preds, cv=5, scoring="accuracy")
print(f"Surrogate fidelity (5-fold CV): {cv_fidelity.mean():.4f} ± {cv_fidelity.std():.4f}")

# Optional: compare both to ground truth y (just to report)
try:
    acc_rf_vs_y = accuracy_score(y, rf_preds)
    acc_surrogate_vs_y = accuracy_score(y, surrogate_preds)
    print(f"RF accuracy vs y:        {acc_rf_vs_y:.4f}")
    print(f"Surrogate acc vs y:      {acc_surrogate_vs_y:.4f}")
except Exception as e:
    # Best-effort report only; a label mismatch must not block the surrogate export
    print("Skipping ground-truth accuracy comparison:", e)


Surrogate fidelity to RF: 0.8382
RF accuracy vs y:        0.9701
Surrogate acc vs y:      0.8217


In [6]:
# Save bundle for your Streamlit app

# Write to the same path that is reported below, so the printed location is
# always where the file actually landed.
bundle_path = ROOT / "surrogate_dt.pkl"

bundle = {
    "model": surrogate,              # fitted DecisionTreeClassifier
    "feature_names": feature_names,  # column order used for fitting
    "class_names": class_names,      # label strings indexed by class index
    "fidelity": float(fidelity),     # in-sample agreement with the RF
}
joblib.dump(bundle, bundle_path)
print("Saved bundle →", bundle_path.resolve())

Saved bundle → C:\Uni\T3\SIT764\New folder\surrogate_dt.pkl


In [14]:
pip install graphviz


Collecting graphviz
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Export a clearer, depth-limited view (recommended for reports)
from sklearn.tree import export_graphviz
import graphviz

# export_graphviz matches class_names positionally against surrogate.classes_
# (ascending order). If the RF never predicted some class, classes_ is a subset
# of 0..N-1 and the full class_names list would mislabel nodes, so select the
# names for exactly the classes the surrogate was trained on.
tree_class_names = [class_names[int(c)] for c in surrogate.classes_]

dot = export_graphviz(
    surrogate,
    out_file=None,            # return the DOT source as a string
    feature_names=feature_names,
    class_names=tree_class_names,
    filled=True,
    rounded=True,
    special_characters=True,
    max_depth=4
)

# Add spacing hints to avoid crowding (only the graph header is touched)
dot = dot.replace("digraph Tree {", "digraph Tree {\n  ranksep=1.2;\n  nodesep=0.6;\n", 1)

# Export crisp SVG and PNG; cleanup=True removes the intermediate DOT file.
# NOTE(review): "@2x" is only part of the filename; no dpi attribute is set, so the
# PNG is rendered at Graphviz's default resolution. Confirm if 2x was intended.
graphviz.Source(dot, format="svg").render("surrogate_tree_depth4_gv", cleanup=True)
graphviz.Source(dot, format="png").render("surrogate_tree_depth4_gv@2x", cleanup=True)
print("Saved SVG/PNG via Graphviz.")


Saved SVG/PNG via Graphviz.
