In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier , RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report

In [2]:
# set plot style design 
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [3]:
# load dataset
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names
print("data set shape ",X.shape)
print("Data Set target Name ",target_names)

data set shape  (150, 4)
Data Set target Name  ['setosa' 'versicolor' 'virginica']


In [4]:
# split data 
# 1 train data -> bootstrap on train only 
# 2 test data 
X_train , X_test , y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=42,stratify=y)
print(f"Training Set : {X_train.shape[0]} sample")
print(f"Testing Set : {X_test.shape[0]} sample")

Training Set : 105 sample
Testing Set : 45 sample


In [5]:
# manual bootstrap sampling 
def create_bootstrap_sample(X,y,random_state=None):
    # create a one bootstrap sample
    n = len(y)
    if random_state is not None:
        np.random.seed(random_state)
    indices = np.random.choice(n,size=n,replace=True)
    X_boot = X[indices]
    y_boot = y[indices]
    # find out_of_bag OOB (sample)
    oob_mask = np.ones(n,dtype=bool)
    oob_mask[indices] = False
    X_oob = X[oob_mask]
    y_oob = y[oob_mask]

    return X_boot , y_boot , indices ,(X_oob,y_oob)

In [6]:
# create  and show  one example bootstrap sample
X_boot , y_boot , boot_indices ,(X_oob,y_oob) = create_bootstrap_sample(
    X_train,
    y_train,
    random_state=42
) 

In [7]:
# print all data 
print("Bootstrap sample")
print(f" Total Sample = {len(y_boot)}")
print(f" Unique Original indices = {len(np.unique(boot_indices))}")
total_oob = len(y_oob)/len(y_train)
print(f" OOB find out_of_bag  = {len(y_oob)} | {total_oob:.1%} of train")

Bootstrap sample
 Total Sample = 105
 Unique Original indices = 64
 OOB find out_of_bag  = 41 | 39.0% of train


In [8]:
# show class Distribution comparison 
orig_dist = np.bincount(y_test)/len(y_train)
print("Original Train => ",np.round(orig_dist,3))
boot_dist = np.bincount(y_boot)/len(y_boot)
print("bootstrap => ",np.round(boot_dist,3))

Original Train =>  [0.143 0.143 0.143]
bootstrap =>  [0.305 0.419 0.276]


In [18]:
# Manual Bagging - Example

def manual_bagging_predict(X_train,y_train,X_test,n_Trees=10,random_state=42):
    np.random.seed(random_state)
    predictions = [] # will store predictions from each tree
    for i in range(n_Trees):
        #bootstrap sample
        X_boot,y_boot,_,_ = create_bootstrap_sample(X_train,y_train,random_state=i)
        # train a tree 
        tree = DecisionTreeClassifier(random_state=i)
        tree.fit(X_boot,y_boot)
        # predict an test set
        pred = tree.predict(X_test)
        predictions.append(pred)
    #print(predictions)
     # Majority Vote 
    predictions = np.array(predictions)
    final_pred = np.apply_along_axis(lambda x : np.bincount(x).argmax(),axis=0, arr = predictions)
    return final_pred

# run bagging 

print("10 trees ")
manual_bag_pred = manual_bagging_predict(X_train,y_train,X_test,n_Trees=30)
manual_acc = accuracy_score(y_test,manual_bag_pred)
print(f"accuracy score {manual_acc:.4f}")

10 trees 
accuracy score 0.8889


In [25]:
# scikit-learn - bagging 
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=30, # 100% boostrap sampling
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
bagging.fit(X_train,y_train)
bag_pred = bagging.predict(X_test)
bag_acc = accuracy_score(y_test,bag_pred)
print(f'sklearn bagging accuracy score {bag_acc:.4f}')

sklearn bagging accuracy score 0.9111


In [29]:
# random forest 
rf = RandomForestClassifier(
    n_estimators = 30,
    max_features="sqrt", # key subset of features 
    bootstrap=True,
    oob_score=True,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test,rf_pred)
rf_oob = rf.oob_score_

print(f'accuracy score = {rf_acc:.4f}')
print(f'OOB accuracy score = {rf_oob:.4f}')

accuracy score = 0.8889
OOB accuracy score = 0.9524


In [31]:
# comparison table
result = pd.DataFrame({
    'model':['Single Decision Tree','Manual Bagging 30 Tree','sklearn bagging 30','random forest(30) '],
    'Test Accuracy':[
        accuracy_score(y_test,DecisionTreeClassifier(random_state=42).fit(X_train,y_train).predict(X_test)),
        manual_acc,
        bag_acc,
        rf_acc
    ]
})
print('Performance comparison')
print(result.round(4).to_string(index=False))

Performance comparison
                 model  Test Accuracy
  Single Decision Tree         0.9333
Manual Bagging 30 Tree         0.8889
    sklearn bagging 30         0.9111
    random forest(30)          0.8889
