
# Mushroom Toxicity — Logistic Regression with Dynamic Partial Inputs (MC Sampling)

This notebook:
1. Loads `data/mushrooms.csv` and precomputed RandomForest feature importances from `data/output/mushroom_feature_importances.csv` (this file must already exist; the notebook reads it and does not recompute it).
2. Keeps only features with **Importance ≥ 0.02** dynamically.
3. Trains a **Logistic Regression** model with **One-Hot Encoding**.
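4. Provides `predict_from_partial()` (created by `make_predictor()`), which accepts **partial** user inputs and uses **Monte Carlo sampling** over the missing features, drawing each one from its empirical distribution among the training rows that match the provided values, to return the mean probability of being poisonous. Note that `predict_from_partial()` returns only that mean; it does not return:
   - 5th–95th percentile interval
   - Final label using a configurable threshold (default 0.5)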
5. Saves artifacts: pipeline, kept-features, category choices.

> Assumption: Users will select values from predefined choices (no unseen categories), but the encoder is still set to handle unknowns safely.


In [12]:

import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Notebook-wide display option and file locations. All paths are relative to the
# directory the kernel was started in.
pd.set_option('display.max_columns', 200)
DATA_PATH = Path("data/mushrooms.csv")
# Same location the loading cell reads the importances from, so the config no
# longer advertises a stale path that is never used.
IMPORTANCE_CSV = Path("data/output/mushroom_feature_importances.csv")
ART_DIR = Path("mushroom_lr_artifacts")
# Create the artifact directory up front so the later save step cannot fail on a
# missing folder; exist_ok keeps the cell safe to re-run.
ART_DIR.mkdir(parents=True, exist_ok=True)


In [13]:
# === Load dataset ===
df = pd.read_csv(DATA_PATH)
assert "class" in df.columns, "Target column 'class' not found."

# === Load precomputed RF feature importances ===
# NOTE: this reassigns the IMPORTANCE_CSV defined in the config cell; the path
# below is the one that is actually read.
IMPORTANCE_CSV = Path("data/output/mushroom_feature_importances.csv")
importances = pd.read_csv(IMPORTANCE_CSV)

# Fail early with a readable message if the file does not have the expected
# columns, instead of a bare KeyError from the sort/filter below.
missing_cols = {"Feature", "Importance"} - set(importances.columns)
assert not missing_cols, f"{IMPORTANCE_CSV} is missing column(s): {sorted(missing_cols)}"

importances = importances.sort_values("Importance", ascending=False).reset_index(drop=True)

display(importances)

# === Select kept features by threshold ===
THRESH = 0.02  # 2%
kept_features = importances.loc[importances["Importance"] >= THRESH, "Feature"].tolist()
# Keep only names that are real predictor columns of the dataset (never the target).
kept_features = [f for f in kept_features if f in df.columns and f != "class"]
# An empty selection would only surface later as an obscure encoder/fit error.
assert kept_features, f"No dataset feature reaches Importance ≥ {THRESH}; nothing to train on."
print(f"Kept {len(kept_features)} features (Importance ≥ {THRESH}):")
kept_features


Unnamed: 0,Feature,Importance
0,odor,0.177295
1,gill-color,0.112875
2,gill-size,0.093022
3,spore-print-color,0.092381
4,ring-type,0.073294
5,stalk-root,0.063621
6,population,0.060087
7,bruises,0.057056
8,stalk-surface-above-ring,0.047542
9,gill-spacing,0.042421


Kept 13 features (Importance ≥ 0.02):


['odor',
 'gill-color',
 'gill-size',
 'spore-print-color',
 'ring-type',
 'stalk-root',
 'population',
 'bruises',
 'stalk-surface-above-ring',
 'gill-spacing',
 'stalk-surface-below-ring',
 'habitat',
 'stalk-shape']

In [14]:
# === Train Logistic Regression with One-Hot Encoding ===
X = df[kept_features].copy()
y = df["class"].map({"e": 0, "p": 1})  # poison=1

# .map() turns every label other than 'e'/'p' (including missing values) into NaN.
# Stop here with a clear message rather than failing later inside the split/fit.
unmapped_labels = df.loc[y.isna(), "class"].unique().tolist()
assert not unmapped_labels, f"Unexpected values in 'class': {unmapped_labels}"

# Stratified hold-out split so both classes keep their proportions in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# handle_unknown="ignore": a category not seen during fit is encoded as all zeros
# instead of raising at prediction time.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

pipe = Pipeline(steps=[
    ("onehot", ohe),
    ("clf", LogisticRegression(max_iter=200, solver="lbfgs"))
])

# === Step 1: Cross-validation on training data ===
# The encoder is part of the pipeline, so it is re-fitted inside every fold.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc")
print(f"5-Fold CV ROC-AUC Scores: {cv_scores}")
print(f"Mean CV ROC-AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# === Step 2: Fit the final model on full training set ===
pipe.fit(X_train, y_train)

# === Step 3: Evaluate on hold-out test set ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]  # column 1 = class 1 (poisonous)

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"\nFinal Test Accuracy: {acc:.4f}  |  ROC-AUC: {auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

5-Fold CV ROC-AUC Scores: [1. 1. 1. 1. 1.]
Mean CV ROC-AUC: 1.0000 ± 0.0000

Final Test Accuracy: 1.0000  |  ROC-AUC: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1052
           1       1.00      1.00      1.00       979

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031

Confusion Matrix:
 [[1052    0]
 [   0  979]]


In [15]:
# === Category choices & empirical distributions for MC sampling ===
# For every kept feature, record (a) the sorted list of values observed in the
# training split and (b) the relative frequency of each of those values.
category_choices = {}
empirical_probs = {}
for feature in kept_features:
    levels = sorted(X_train[feature].dropna().unique().tolist())
    shares = X_train[feature].value_counts(normalize=True)
    category_choices[feature] = levels
    empirical_probs[feature] = {level: float(shares.get(level, 0.0)) for level in levels}

# Show the first few features as a quick sanity check.
print("Sample choices:")
for feature, levels in list(category_choices.items())[:5]:
    print(feature, "->", levels)


Sample choices:
odor -> ['a', 'c', 'f', 'l', 'm', 'n', 'p', 's', 'y']
gill-color -> ['b', 'e', 'g', 'h', 'k', 'n', 'o', 'p', 'r', 'u', 'w', 'y']
gill-size -> ['b', 'n']
spore-print-color -> ['b', 'h', 'k', 'n', 'o', 'r', 'u', 'w', 'y']
ring-type -> ['e', 'f', 'l', 'n', 'p']


# Generate the model artifacts

In [21]:
from types import SimpleNamespace

# === 2) define the encapsulation logic (conditional Monte Carlo version) ===
def make_predictor(pipe, kept_features, category_choices, empirical_probs, df_train):
    """
    Build a predictor object that scores partially specified inputs.

    Missing features are imputed by Monte Carlo sampling: each missing feature is
    drawn independently from its empirical distribution among the rows of
    `df_train` that match every provided value. If fewer than 10 rows match, the
    distributions of the full `df_train` are used instead.

    Parameters
    ----------
    pipe : object exposing `predict_proba(DataFrame)`; column 1 of its result is
        taken as the positive-class probability.
    kept_features : list of feature names, in the column order passed to `pipe`.
    category_choices : dict mapping each kept feature to its list of allowed values.
    empirical_probs : accepted for backward compatibility but not used; sampling
        probabilities are derived from `df_train` conditioned on the known values.
    df_train : DataFrame containing the kept feature columns.

    Returns
    -------
    SimpleNamespace with `predict_from_partial`, `predict_summary_from_partial`,
    `pipe`, `kept_features` and `category_choices`.
    """

    def _known_values(partial_features):
        """Keep only entries the caller actually filled in (drops None and "")."""
        return {
            k: v for k, v in (partial_features or {}).items()
            if v is not None and v != ""
        }

    def _sampling_plan(known):
        """Return [(feature, categories, probabilities)] for each feature not in `known`."""
        subset = df_train
        for k, v in known.items():
            if k in subset.columns:
                subset = subset[subset[k] == v]

        # Too few matching rows: fall back to the unconditional training distribution.
        if len(subset) < 10:
            subset = df_train

        plan = []
        for col in kept_features:
            if col in known:
                continue
            cats = category_choices[col]
            freq = subset[col].value_counts(normalize=True).to_dict()
            p = np.array([freq.get(c, 0.0) for c in cats], dtype=float)
            # If none of the allowed categories occurs in the subset, sample uniformly.
            p = p / p.sum() if p.sum() > 0 else np.ones_like(p) / len(p)
            plan.append((col, cats, p))
        return plan

    def _simulate(partial_features, n_samples, random_state):
        """Return the positive-class probabilities of `n_samples` completed rows."""
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")

        rng = np.random.default_rng(random_state)
        known = _known_values(partial_features)
        fixed = {col: str(known[col]) for col in kept_features if col in known}
        # The filter and the per-feature probabilities do not change between
        # draws, so they are computed once instead of once per sample.
        plan = _sampling_plan(known)

        rows = []
        for _ in range(n_samples):
            row = dict(fixed)
            # Draw order (sample by sample, features in kept_features order) is kept
            # so that a given random_state yields the same rows as before.
            for col, cats, p in plan:
                row[col] = rng.choice(cats, p=p)
            rows.append(row)

        X_mc = pd.DataFrame(rows, columns=kept_features)
        return pipe.predict_proba(X_mc)[:, 1]

    def predict_from_partial(partial_features, n_samples=200, random_state=42):
        """
        Return the mean positive-class probability over the Monte Carlo samples.

        `partial_features` maps feature name -> value; entries that are None or ""
        (or a `partial_features` of None) count as missing.
        Raises ValueError if `n_samples` is smaller than 1.
        """
        return float(_simulate(partial_features, n_samples, random_state).mean())

    def predict_summary_from_partial(partial_features, n_samples=200, random_state=42, threshold=0.5):
        """
        Like `predict_from_partial`, but also report the spread and a label.

        Returns a dict with:
          "mean"  - mean positive-class probability over the samples,
          "p05" / "p95" - 5th and 95th percentile of the sampled probabilities,
          "label" - 1 if mean >= threshold else 0.
        """
        probs = _simulate(partial_features, n_samples, random_state)
        mean = float(probs.mean())
        p05, p95 = np.percentile(probs, [5, 95])
        return {
            "mean": mean,
            "p05": float(p05),
            "p95": float(p95),
            "label": int(mean >= threshold),
        }

    return SimpleNamespace(
        predict_from_partial=predict_from_partial,
        predict_summary_from_partial=predict_summary_from_partial,
        pipe=pipe,
        kept_features=kept_features,
        category_choices=category_choices
    )

In [22]:
import cloudpickle

# Wrap the fitted pipeline and its sampling metadata in one predictor object,
# then write that object to the artifact directory with cloudpickle.
bundle_path = ART_DIR / "mushroom_predict_bundle.pkl"
pp = make_predictor(
    pipe,
    kept_features,
    category_choices,
    empirical_probs,
    X_train,
)
with open(bundle_path, "wb") as bundle_file:
    cloudpickle.dump(pp, bundle_file)

print(f"✅ encapsulation success: {bundle_path}")


✅ encapsulation success: mushroom_lr_artifacts\mushroom_predict_bundle.pkl
