# In [None]:
# Cell 1: Title
# Majority Vote Classifier Demo

"""
This notebook demonstrates a custom ensemble method: the MajorityVoteClassifier.
It aggregates the predictions of several base classifiers by voting on either their predicted class labels or their predicted class probabilities.

We'll evaluate its performance using:
- ROC AUC
- Precision-Recall curves
- Decision boundary plots
"""

# Cell 2: Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from majority_vote_classifier import MajorityVoteClassifier
from utils import plot_decision_region

# Cell 3: Data Preparation
# Load iris and cut it down to a smaller problem: keep rows 50 onward and
# only feature columns 1 and 2, so every sample is a 2-D point.
# NOTE(review): with scikit-learn's iris ordering this should leave two of
# the three classes - confirm if the dataset or slice ever changes.
iris = load_iris()
X = iris.data[50:, [1, 2]]
Y = iris.target[50:]

# Re-encode the remaining class labels as consecutive integers from 0.
labelenc = LabelEncoder()
y = labelenc.fit_transform(Y)

# Hold out half of the samples for testing. The split is stratified on y
# and uses a fixed seed so that it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=1,
)

# Cell 4: Classifier Setup
# Three base learners. Logistic regression and k-nearest neighbours are
# wrapped in a pipeline that standardizes the features first; the depth-1
# decision tree is fitted on the features as they are.
clf1 = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=0.001, solver='lbfgs', random_state=1),
)
clf2 = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=1)
clf3 = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=2),
)

# Declare each display name next to its estimator, then derive the two
# parallel lists that the following cells pair up again with zip().
named_classifiers = [
    ('Logistic Regression', clf1),
    ('Decision Tree', clf2),
    ('KNN', clf3),
]
classifiers = [model for _, model in named_classifiers]
clf_labels = [name for name, _ in named_classifiers]

# Cell 5: Individual Classifier Performance
# Score every base classifier with 10-fold cross-validation on the training
# split only, and report the mean and standard deviation of the fold ROC AUCs.
print("10-fold Cross Validation (ROC AUC):")
for clf, label in zip(classifiers, clf_labels):
    scores = cross_val_score(clf, x_train, y_train, scoring='roc_auc', cv=10)
    mean_auc, std_auc = scores.mean(), scores.std()
    print(f"{label}: ROC AUC = {mean_auc:.2f} (+/- {std_auc:.2f})")

# Cell 6: Majority Voting
# Build the ensemble from the base classifiers, then register it and its
# display name after them. clf_labels is extended in place, so every later
# cell that zips over it also sees the ensemble's label.
mv_clf = MajorityVoteClassifier(classifiers=classifiers)
classifiers_mv = [*classifiers, mv_clf]
clf_labels += ["Majority Voting"]

# Same 10-fold ROC AUC report as for the base classifiers, now with the
# ensemble as the final row.
print("\nWith Majority Voting Added:")
for clf, label in zip(classifiers_mv, clf_labels):
    scores = cross_val_score(clf, x_train, y_train, scoring='roc_auc', cv=10)
    mean_auc, std_auc = scores.mean(), scores.std()
    print(f"{label}: ROC AUC = {mean_auc:.2f} (+/- {std_auc:.2f})")

# Cell 7: ROC Curve
# One colour and one line style per plotted model, matched by position.
colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']

plt.figure(figsize=(6, 6))

for clf, label, clr, ls in zip(classifiers_mv, clf_labels, colors, linestyles):
    # Fit on the training split, then score the held-out test split.
    clf.fit(x_train, y_train)
    # Column 1 of predict_proba is used as the score for the positive class.
    y_prob = clf.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr,
        tpr,
        color=clr,
        linestyle=ls,
        label=f'{label} (AUC = {roc_auc:.2f})',
    )

# Diagonal reference line labelled 'Random'.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random')

plt.title('ROC Curve on Test Set')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(alpha=0.5)
plt.legend()
plt.show()

# Cell 8: Precision-Recall Curve
plt.figure(figsize=(6, 6))

for clf, label, clr, ls in zip(classifiers_mv, clf_labels, colors, linestyles):
    # Fit on the training split, then score the held-out test split.
    clf.fit(x_train, y_train)
    # Column 1 of predict_proba is used as the score for the positive class.
    y_prob = clf.predict_proba(x_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # NOTE(review): auc() integrates the curve with the trapezoidal rule;
    # scikit-learn also offers average_precision_score as a non-interpolated
    # summary of the same curve - consider which one this demo should report.
    pr_auc = auc(recall, precision)
    plt.plot(
        recall,
        precision,
        color=clr,
        linestyle=ls,
        label=f'{label} (AUC = {pr_auc:.2f})',
    )

plt.title('Precision-Recall Curve on Test Set')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(alpha=0.5)
plt.legend()
plt.show()

# Cell 9: Decision Regions
# Draw one decision-region panel per model on a 2x2 grid.
plt.figure(figsize=(10, 8))
for idx, (clf, label) in enumerate(zip(classifiers_mv, clf_labels)):
    # Fit on the training split; the regions are drawn over the same
    # training points.
    clf.fit(x_train, y_train)
    # Subplot positions are 1-based, enumerate() is 0-based.
    plt.subplot(2, 2, idx + 1)
    plot_decision_region(x_train, y_train, clf)
    plt.title(label)

# Adjust the layout once, after every panel exists. Calling tight_layout()
# inside the loop recomputed the whole figure layout for each subplot, and
# only the last of those calls had any effect on the final figure.
plt.tight_layout()

# Added after tight_layout() and placed slightly above the axes area
# (y=1.02), as in the original cell.
plt.suptitle("Decision Regions", fontsize=14, y=1.02)
plt.show()