# Bagging Regressor

In [13]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [10]:
X, y = make_regression(n_samples=300, n_features=10, n_informative=6, noise=30, random_state=42) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### With individual model

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

dt = DecisionTreeRegressor()
svr = SVR()

dt.fit(X_train, y_train)
svr.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_svr = svr.predict(X_test)

print(f'''
Accuracy 
Decision Tree : {r2_score(y_test, y_pred_dt)}
SVR : {r2_score(y_test, y_pred_svr)}
''')


Accuracy 
Decision Tree : 0.44272034915225267
SVR : 0.029659329738705553



# Bagging
#### Row sampling with replacement

In [16]:
bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=True,      # sample_replacement = True
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', r2_score(y_test, y_pred))

Accuracy : 0.6640825330516589


In [18]:
# using SVR

bag = BaggingRegressor(
    estimator=SVR(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=True,      # sample_replacement = True
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', r2_score(y_test, y_pred))

Accuracy : -0.006416544663729873


#### Not always better score - Tweak the hyperparametrs and see

# Pasting
#### Row sampling without replacement

In [19]:
bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.25,    # proportion of samples (if int given such as 50 then 50 samples will be taken)
    bootstrap=False,     # sample_replacement = False
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', r2_score(y_test, y_pred))

Accuracy : 0.6820833952805069


# Random Subspaces
#### Column sampling (with or without replacement)

In [20]:
bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=1.0,            # all rows taken
    bootstrap=False,            # wont get same sample twice for a model
    max_features=0.5,           # 50% of all features taken 
    bootstrap_features=True,    # feature replacement = True 
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', r2_score(y_test, y_pred))

Accuracy : 0.569887282420617


# Random Patches
#### Row + Column sampling (with or without replacement)

In [21]:
bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.25,            # 25% of rows taken
    max_features=0.5,            # 50% of all features taken 
    bootstrap_features=True,     # feature replacement = True 
    random_state=42
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy :', r2_score(y_test, y_pred))

Accuracy : 0.5110263498068925


### Attributes

In [22]:
bag.estimators_samples_    # samples for each estimator

[array([216, 147, 114, 120, 189,  28, 233, 220,  84,   6, 141,  68, 145,
         82,  51, 105,  74,  34,  93,   4,  34,   8, 142, 205,  32,  59,
         95,   3, 150, 107,  46, 116, 199, 194, 105, 181, 177,  77, 184,
         80, 213,  64,   0, 122,  59, 103, 235,  81,  41, 181,  91,  70,
        132,   0,  60, 184,  23, 156,  14, 234]),
 array([101, 234,   3, 169,  47, 202, 208,  83, 200, 163, 109,  99, 201,
        222, 104,  72,  18,  71, 139,  90, 117, 219,  51,  58,  75,  87,
        184, 107,  16, 227, 201, 177,  85,  84, 227,  91,   7, 193,  72,
        161,   3, 131, 216, 177,  61,  45,  31, 182, 226, 189, 229, 174,
         84, 171, 239,   2,   9, 178, 144,  57]),
 array([168, 188, 117, 184,  18,  53, 142,  52, 131, 120, 166, 126, 123,
         83, 165, 216,  40,  45,  29,  87, 214,  85, 196,  79,  64, 183,
          2,  33,  52, 144, 169,  18,  51,  80, 138, 164,  70,  11, 201,
        223,  52,  70,  19, 207,  27,  87,  96, 204,  38,  52, 174, 112,
         25, 118, 146,  

In [23]:
bag.estimators_features_    # features for each estimator

[array([9, 2, 9, 7, 7]),
 array([7, 3, 7, 3, 9]),
 array([6, 0, 7, 7, 9]),
 array([0, 5, 7, 4, 1]),
 array([6, 9, 6, 3, 3]),
 array([5, 2, 1, 2, 7]),
 array([3, 2, 1, 5, 2]),
 array([6, 2, 8, 2, 5]),
 array([8, 4, 4, 3, 2]),
 array([8, 1, 8, 9, 0]),
 array([3, 2, 9, 8, 4]),
 array([4, 7, 8, 6, 7]),
 array([3, 9, 4, 8, 6]),
 array([0, 9, 6, 7, 4]),
 array([0, 8, 2, 0, 2]),
 array([8, 4, 6, 6, 8]),
 array([7, 7, 2, 0, 3]),
 array([8, 4, 2, 8, 1]),
 array([7, 4, 2, 8, 6]),
 array([8, 1, 5, 0, 2]),
 array([5, 8, 8, 3, 0]),
 array([4, 6, 2, 7, 7]),
 array([1, 5, 2, 7, 1]),
 array([8, 8, 6, 9, 9]),
 array([1, 4, 2, 3, 7]),
 array([4, 1, 3, 1, 4]),
 array([8, 8, 0, 0, 8]),
 array([4, 2, 3, 2, 8]),
 array([2, 7, 8, 8, 6]),
 array([8, 5, 2, 7, 4]),
 array([3, 0, 6, 0, 6]),
 array([7, 6, 8, 0, 4]),
 array([3, 9, 7, 5, 8]),
 array([4, 5, 0, 5, 4]),
 array([4, 8, 7, 2, 0]),
 array([9, 2, 1, 0, 0]),
 array([8, 4, 4, 1, 5]),
 array([9, 7, 9, 3, 8]),
 array([8, 8, 9, 6, 3]),
 array([8, 9, 5, 3, 5]),


# OOB Score
#### Note - only applies when :
- Apllies to row sampling only
- Samples must be drawn with replacement

In [25]:
bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.25,            
    bootstrap=True,            # sample replacement must be True
    max_features=0.5,            
    bootstrap_features=True,     
    oob_score=True,            # to calculate oob score 
)

bag.fit(X_train, y_train)
print(r2_score(y_test, bag.predict(X_test)))
print(bag.oob_score_)     # note : oob_score is obtained after training

0.501043593367283
0.48639580080297884


In [26]:
bag

# Bagging Tips

- Bagging generally gives better results than Pasting
- Good results come around the 50% to 75% row sampling mark
- Random patches and subspaces should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

### Hyperparameter Tuning - Grid Search CV

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators' : [10,50,100,500],
    'max_samples' : [0.25, 0.5, 0.75, 1],
    'bootstrap' : [True, False],
    'max_features' : [0.25, 0.5, 0.75, 0.1],
    'bootstrap_features' : [True, False],
}

model = GridSearchCV(estimator=BaggingRegressor(estimator=DecisionTreeRegressor(), n_jobs=-1), param_grid=param_grid, verbose=1)   
model.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [28]:
model.best_params_

{'bootstrap': False,
 'bootstrap_features': False,
 'max_features': 0.75,
 'max_samples': 0.75,
 'n_estimators': 500}

In [29]:
model.best_score_

0.6946971626317631

# From 44% to 70% Damnnnnn 

In [None]:
print('Train R^2 Score : %.3f' %model.best_estimator_.score(X_train, Y_train))    # best_estimator score on train set
print('Test R^2 Score : %.3f'  %model.best_estimator_.score(X_test, Y_test))      # best estimator score on test set
print('Best R^2 Score Through Grid Search : %.3f' %model.best_score_)             # cross validated score