# Bagging Classifier

In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [29]:
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### With individual model

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

dt = DecisionTreeClassifier()
svc = SVC()

dt.fit(X_train, y_train)
svc.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_svc = svc.predict(X_test)

print(f'''
Accuracy 
Decision Tree : {accuracy_score(y_test, y_pred_dt)}
SVC : {accuracy_score(y_test, y_pred_svc)}
''')


Accuracy 
Decision Tree : 0.91
SVC : 0.925



# Bagging
#### Row sampling with replacement

In [88]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=True,      # sample_replacement = True
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

Accuracy : 0.925


In [57]:
# using SVC

bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=True,      # sample_replacement = True
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

Accuracy : 0.885


#### Not always better score - Tweak the hyperparametrs and see

# Pasting
#### Row sampling without replacement

In [59]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=False,     # sample_replacement = False
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

Accuracy : 0.92


# Random Subspaces
#### Column sampling (with or without replacement)

In [64]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=1.0,            # all rows taken
    bootstrap=False,            # wont get same sample twice for a model
    max_features=0.5,           # 50% of all features taken 
    bootstrap_features=True,    # feature replacement = True 
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

Accuracy : 0.925


# Random Patches
#### Row + Column sampling (with or without replacement)

In [104]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,            # 25% of rows taken
    max_features=0.5,            # 50% of all features taken 
    bootstrap_features=True,     # feature replacement = True 
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

Accuracy : 0.93


### Attributes

In [100]:
bag.estimators_samples_    # samples for each estimator

[array([370, 632, 540, 745, 596,   6, 324, 338, 307, 105, 586, 290, 260,
          8, 398, 288, 510, 571, 607, 505, 515, 150, 363, 116, 455, 194,
        361, 437, 589, 184, 336, 754, 753, 725,  64, 768, 378,  59, 615,
        491, 337, 553, 437, 582, 132, 256, 572, 791, 412, 526, 746, 249,
        183, 203, 465, 340, 677, 100, 685, 725,  23, 582,   1, 337, 172,
        294, 353,  52, 521, 338, 327, 109,  73, 447, 217, 196, 499,  14,
        288, 616, 616, 557, 492, 565, 773, 146,  36, 303, 791, 752,  39,
        632, 292, 512, 470, 135, 254, 634, 437, 781, 734, 265,  15, 703,
        774, 333,  48,  27, 582, 476, 316, 476, 404, 685,  49, 131, 268,
        481,  85, 184, 303,  99, 375, 401, 513, 602, 756, 739, 166, 763,
        548, 598, 748, 576,  66, 723,  68, 332, 599, 733, 605, 110,  79,
        464, 577, 500, 208, 313, 711, 587,   5, 710, 200, 358, 270,   1,
        669,  26, 609, 272, 385, 751, 224, 472, 322, 687, 763, 650, 245,
         62, 121, 427,  11, 536,  74, 292, 535, 615

In [96]:
bag.estimators_features_    # features for each estimator

[array([9, 2, 9, 7, 7]),
 array([7, 3, 7, 3, 9]),
 array([6, 0, 7, 7, 9]),
 array([0, 5, 7, 4, 1]),
 array([6, 9, 6, 3, 3]),
 array([5, 2, 1, 2, 7]),
 array([3, 2, 1, 5, 2]),
 array([6, 2, 8, 2, 5]),
 array([8, 4, 4, 3, 2]),
 array([8, 1, 8, 9, 0]),
 array([3, 2, 9, 8, 4]),
 array([4, 7, 8, 6, 7]),
 array([3, 9, 4, 8, 6]),
 array([0, 9, 6, 7, 4]),
 array([0, 8, 2, 0, 2]),
 array([8, 4, 6, 6, 8]),
 array([7, 7, 2, 0, 3]),
 array([8, 4, 2, 8, 1]),
 array([7, 4, 2, 8, 6]),
 array([8, 1, 5, 0, 2]),
 array([5, 8, 8, 3, 0]),
 array([4, 6, 2, 7, 7]),
 array([1, 5, 2, 7, 1]),
 array([8, 8, 6, 9, 9]),
 array([1, 4, 2, 3, 7]),
 array([4, 1, 3, 1, 4]),
 array([8, 8, 0, 0, 8]),
 array([4, 2, 3, 2, 8]),
 array([2, 7, 8, 8, 6]),
 array([8, 5, 2, 7, 4]),
 array([3, 0, 6, 0, 6]),
 array([7, 6, 8, 0, 4]),
 array([3, 9, 7, 5, 8]),
 array([4, 5, 0, 5, 4]),
 array([4, 8, 7, 2, 0]),
 array([9, 2, 1, 0, 0]),
 array([8, 4, 4, 1, 5]),
 array([9, 7, 9, 3, 8]),
 array([8, 8, 9, 6, 3]),
 array([8, 9, 5, 3, 5]),


# OOB Score
#### Note - only applies when :
- Apllies to row sampling only
- Samples must be drawn with replacement

In [84]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,            
    bootstrap=True,            # sample replacement must be True
    max_features=0.5,            
    bootstrap_features=True,     
    oob_score=True,            # to calculate oob score 
)

bag.fit(X_train, y_train)
print(accuracy_score(y_test, bag.predict(X_test)))
print(bag.oob_score_)     # note : oob_score is obtained after training

0.945
0.9425


In [93]:
bag

# Bagging Tips

- Bagging generally gives better results than Pasting
- Good results come around the 50% to 75% row sampling mark
- Random patches and subspaces should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

### Hyperparameter Tuning - Grid Search CV

In [117]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators' : [10,50,100,500],
    'max_samples' : [0.25, 0.5, 0.75, 1],
    'bootstrap' : [True, False],
    'max_features' : [0.25, 0.5, 0.75, 0.1],
    'bootstrap_features' : [True, False],
}

model = GridSearchCV(estimator=BaggingClassifier(estimator=DecisionTreeClassifier(), n_jobs=-1), param_grid=param_grid, verbose=4)   
model.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[CV 1/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=10;, score=0.825 total time=   2.1s
[CV 2/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=10;, score=0.781 total time=   0.0s
[CV 3/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=10;, score=0.812 total time=   0.0s
[CV 4/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=10;, score=0.856 total time=   0.0s
[CV 5/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=10;, score=0.869 total time=   0.0s
[CV 1/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_estimators=50;, score=0.900 total time=   0.1s
[CV 2/5] END bootstrap=True, bootstrap_features=True, max_features=0.25, max_samples=0.25, n_est

In [118]:
model.best_params_

{'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 0.5,
 'max_samples': 0.75,
 'n_estimators': 50}

In [119]:
model.best_score_

0.9512499999999999

# From 91% (single DT) to 95% Damnnn

In [None]:
print('Train R^2 Score : %.3f' %model.best_estimator_.score(X_train, Y_train))    # best_estimator score on train set
print('Test R^2 Score : %.3f'  %model.best_estimator_.score(X_test, Y_test))      # best estimator score on test set
print('Best R^2 Score Through Grid Search : %.3f' %model.best_score_)             # cross validated score