<a href="https://www.kaggle.com/code/paragghatage/bagging-pasting-partitioning?scriptVersionId=262252892" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# ---------- Helper ----------
def majority_vote(preds):
    """Take majority vote from multiple classifiers.

    Parameters
    ----------
    preds : array-like of shape (n_classifiers, n_samples)
        One row of predicted labels per classifier. Labels may be any
        values NumPy can sort (ints including negatives, floats, strings).

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The most frequent label in each column. On a tie the smallest
        label wins, because ``np.unique`` returns labels in sorted order
        and ``argmax`` picks the first maximum.

    Raises
    ------
    ValueError
        If ``preds`` is not two-dimensional.
    """
    preds = np.asarray(preds)
    if preds.ndim != 2:
        raise ValueError(
            "preds must be 2-D (n_classifiers, n_samples), "
            f"got an array with {preds.ndim} dimension(s)"
        )

    final = []
    for votes in preds.T:  # one column = all classifiers' votes for a sample
        # np.unique (unlike np.bincount) is not restricted to
        # non-negative integer labels.
        labels, counts = np.unique(votes, return_counts=True)
        final.append(labels[counts.argmax()])
    return np.array(final)

# ---------- Core Functions ----------
def partitioning(X_train, y_train, X_test, n_trees=5, random_state=42):
    """Partitioning = shuffle once, give each tree its own disjoint chunk.

    The training indices are shuffled with ``random_state`` and cut into
    ``n_trees`` consecutive chunks of ``N // n_trees`` samples; the last
    chunk also takes the remainder. One decision tree is fitted per chunk
    and the trees' predictions on ``X_test`` are combined by majority vote.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels (converted with ``np.asarray`` so
        lists and similar containers can be index-sliced).
    X_test : array-like
        Samples to predict.
    n_trees : int, default 5
        Number of partitions / trees; must be between 1 and ``len(X_train)``.
    random_state : int or None, default 42
        Seed for the shuffle; tree ``i`` is seeded with ``random_state + i``.
        ``None`` leaves both the shuffle and the trees unseeded.

    Returns
    -------
    numpy.ndarray
        Majority-vote prediction for every row of ``X_test``.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1 or larger than the training set
        (some partition would be empty).
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    N = len(X_train)
    if not 1 <= n_trees <= N:
        raise ValueError(
            f"n_trees must be between 1 and the number of training "
            f"samples ({N}), got {n_trees}"
        )

    idxs = np.arange(N)
    np.random.RandomState(random_state).shuffle(idxs)

    chunk_size = N // n_trees
    preds = []
    for i in range(n_trees):
        start = i * chunk_size
        # The final partition absorbs the N % n_trees leftover samples.
        end = start + chunk_size if i < n_trees - 1 else N
        part_idx = idxs[start:end]
        # Distinct seed per tree; stay unseeded when the caller passed None.
        tree_seed = None if random_state is None else random_state + i
        clf = DecisionTreeClassifier(random_state=tree_seed)
        clf.fit(X_train[part_idx], y_train[part_idx])
        preds.append(clf.predict(X_test))
    return majority_vote(preds)

def pasting(X_train, y_train, X_test, n_trees=5, k=None, random_state=42):
    """Pasting = sample k without replacement per tree.

    Each tree is fitted on ``k`` distinct training samples drawn from a
    single random stream seeded with ``random_state``; different trees may
    share samples. Predictions on ``X_test`` are combined by majority vote.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels (converted with ``np.asarray`` so
        lists and similar containers can be index-sliced).
    X_test : array-like
        Samples to predict.
    n_trees : int, default 5
        Number of trees; must be at least 1.
    k : int or None, default None
        Samples drawn per tree. ``None`` means ``len(X_train) // n_trees``.
        Must end up between 1 and ``len(X_train)``.
    random_state : int or None, default 42
        Seed for the sampler; tree ``i`` is seeded with ``random_state + i``.
        ``None`` leaves both the sampler and the trees unseeded.

    Returns
    -------
    numpy.ndarray
        Majority-vote prediction for every row of ``X_test``.

    Raises
    ------
    ValueError
        If ``n_trees`` < 1 or the per-tree sample size is outside
        ``1..len(X_train)``.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    N = len(X_train)
    if n_trees < 1:
        raise ValueError(f"n_trees must be at least 1, got {n_trees}")
    # Only an omitted k selects the default; an explicit 0 is an error
    # rather than being silently replaced.
    if k is None:
        k = N // n_trees
    if not 1 <= k <= N:
        raise ValueError(
            f"k must be between 1 and the number of training samples "
            f"({N}), got {k}"
        )

    rng = np.random.RandomState(random_state)

    preds = []
    for i in range(n_trees):
        idx = rng.choice(N, size=k, replace=False)
        # Distinct seed per tree; stay unseeded when the caller passed None.
        tree_seed = None if random_state is None else random_state + i
        clf = DecisionTreeClassifier(random_state=tree_seed)
        clf.fit(X_train[idx], y_train[idx])
        preds.append(clf.predict(X_test))
    return majority_vote(preds)

def bagging(X_train, y_train, X_test, n_trees=5, random_state=42):
    """Bagging = bootstrap sample N with replacement per tree.

    Every tree is fitted on a bootstrap resample of the training set that
    has the same size as the original; predictions on ``X_test`` are
    combined by majority vote.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Samples to predict.
    n_trees : int, default 5
        Number of bootstrap rounds / trees; must be at least 1.
    random_state : int or None, default 42
        Round ``i`` uses ``random_state + i`` for both the resample and the
        tree. ``None`` leaves both unseeded.

    Returns
    -------
    numpy.ndarray
        Majority-vote prediction for every row of ``X_test``.

    Raises
    ------
    ValueError
        If ``n_trees`` < 1 (there would be no predictions to vote on).
    """
    if n_trees < 1:
        raise ValueError(f"n_trees must be at least 1, got {n_trees}")

    N = len(X_train)
    preds = []
    for i in range(n_trees):
        # Distinct seed per round; stay unseeded when the caller passed None.
        round_seed = None if random_state is None else random_state + i
        boot_X, boot_y = resample(X_train, y_train, n_samples=N,
                                  replace=True, random_state=round_seed)
        clf = DecisionTreeClassifier(random_state=round_seed)
        clf.fit(boot_X, boot_y)
        preds.append(clf.predict(X_test))
    return majority_vote(preds)

# ---------- Test on Dataset ----------
# Breast-cancer data with a stratified 70/30 train/test split.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline: one decision tree fitted on the whole training split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
baseline_acc = accuracy_score(y_test, dt.predict(X_test))

# Ensemble predictions: 10 trees each, same seed for every strategy.
part_preds = partitioning(X_train, y_train, X_test,
                          n_trees=10, random_state=42)
paste_preds = pasting(X_train, y_train, X_test,
                      n_trees=10, k=len(X_train)//10, random_state=42)
bag_preds = bagging(X_train, y_train, X_test,
                    n_trees=10, random_state=42)

# Score every ensemble against the held-out labels.
part_acc = accuracy_score(y_test, part_preds)
paste_acc = accuracy_score(y_test, paste_preds)
bag_acc = accuracy_score(y_test, bag_preds)

# ---------- Results ----------
for label, acc in (
    ("Single Tree Accuracy:", baseline_acc),
    ("Partitioning Accuracy:", part_acc),
    ("Pasting Accuracy:", paste_acc),
    ("Bagging Accuracy:", bag_acc),
):
    print(label, acc)


Single Tree Accuracy: 0.9181286549707602
Partitioning Accuracy: 0.9298245614035088
Pasting Accuracy: 0.9239766081871345
Bagging Accuracy: 0.935672514619883
