# In [None]:
# Cell 1: Title and Overview
# Bagging From Scratch - Demo

"""
This notebook demonstrates a custom implementation of Bagging (Bootstrap Aggregating)
from scratch and compares it to sklearn's built-in BaggingClassifier.

Dataset: Wine (class 1 vs class 2 only)
Features: 'alcohol', 'od280/od315_of_diluted_wines'

Visualized using:
- ROC curves
- Decision boundary plots
"""

# Cell 2: Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from bagging_classifier import BaggingFromScratch
from utils import plot_decision_region

# Cell 3: Data Preparation
wine = load_wine()

# Keep only the samples whose label is not 0 (the demo compares class 1 vs class 2).
mask = wine.target != 0

# Column positions of the two features used throughout the demo.
feature_cols = [
    wine.feature_names.index(feature_name)
    for feature_name in ('alcohol', 'od280/od315_of_diluted_wines')
]
X = wine.data[mask][:, feature_cols]

# Re-encode the remaining labels with LabelEncoder so they become 0 and 1.
y = LabelEncoder().fit_transform(wine.target[mask])

# Stratified split holding out 20% of the samples, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Cell 4: Train Custom Bagging Model
n_estimators = 500  # ensemble size passed to the custom model
bag = BaggingFromScratch(n_estimators=n_estimators)
bag.fit(x_train, y_train)

# Training-split predictions and accuracy.
y_train_pred = bag.predict(x_train)
train_acc_scratch = accuracy_score(y_train, y_train_pred)

# Test-split predictions and accuracy.
y_test_pred = bag.predict(x_test)
test_acc_scratch = accuracy_score(y_test, y_test_pred)

# ROC inputs: column 1 of predict_proba is used as the score
# (presumably the class-1 probability -- confirm against BaggingFromScratch).
proba_scratch = bag.predict_proba(x_test)
y_proba_scratch = proba_scratch[:, 1]
fpr_s, tpr_s, _ = roc_curve(y_test, y_proba_scratch)
auc_scratch = auc(fpr_s, tpr_s)

# Cell 5: Compare with sklearn Bagging
base_tree = DecisionTreeClassifier(criterion='entropy', random_state=1)
bag_sklearn = BaggingClassifier(
    estimator=base_tree,
    n_estimators=n_estimators,
    bootstrap=True,
    random_state=1,
).fit(x_train, y_train)  # fit() returns the fitted estimator itself

# Mean accuracy on each split, evaluated in train-then-test order.
train_acc_sklearn, test_acc_sklearn = (
    bag_sklearn.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

# ROC curve and AUC computed from column 1 of the predicted probabilities.
proba_sklearn = bag_sklearn.predict_proba(x_test)
y_proba_sklearn = proba_sklearn[:, 1]
fpr_b, tpr_b, _ = roc_curve(y_test, y_proba_sklearn)
auc_sklearn = auc(fpr_b, tpr_b)

# Fit the lone decision tree by itself so its train/test accuracy gap
# (overfitting) can be compared with the bagged ensembles.
base_tree.fit(x_train, y_train)
train_acc_base, test_acc_base = (
    base_tree.score(x_train, y_train),
    base_tree.score(x_test, y_test),
)

# Cell 6: Print Scores
# Each row: (label already padded so the numbers line up, train accuracy, test accuracy).
score_rows = (
    ("Base Decision Tree - Train/Test:     ", train_acc_base, test_acc_base),
    ("Custom Bagging Model - Train/Test:   ", train_acc_scratch, test_acc_scratch),
    ("Sklearn Bagging Model - Train/Test:  ", train_acc_sklearn, test_acc_sklearn),
)
report_lines = ["Accuracy Comparison"]
report_lines.extend(
    f"{row_label}{row_train:.3f} / {row_test:.3f}"
    for row_label, row_train, row_test in score_rows
)
# One print call emits the same four lines as separate prints would.
print("\n".join(report_lines))

# Cell 7: ROC Curve Plot
plt.figure(figsize=(6, 6))

# One entry per model curve: (fpr, tpr, legend text, line style, colour).
roc_curves = (
    (fpr_s, tpr_s, f'Scratch Bagging (AUC = {auc_scratch:.3f})', '--', 'tomato'),
    (fpr_b, tpr_b, f'Sklearn Bagging (AUC = {auc_sklearn:.3f})', '-', 'steelblue'),
)
for curve_fpr, curve_tpr, curve_label, curve_style, curve_color in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, linestyle=curve_style, color=curve_color)

# Unlabelled diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Cell 8: Decision Boundary Plots
plt.figure(figsize=(10, 4))

# Side-by-side panels, one per fitted ensemble, both drawn over the training data.
boundary_panels = (
    (bag, "Scratch Bagging Model"),
    (bag_sklearn, "Sklearn Bagging Model"),
)
for panel_number, (panel_model, panel_title) in enumerate(boundary_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plot_decision_region(x_train, y_train, panel_model)
    plt.title(panel_title)
    plt.xlabel('Alcohol')
    plt.ylabel('OD280/OD315')

plt.tight_layout()
plt.show()