# Bagging Tips

1. Bagging generally gives better results than Pasting.
2. Good results are usually obtained when each estimator samples roughly 25% to 50% of the rows.
3. Random patches and random subspaces should be used when dealing with high-dimensional data.
4. To find good hyperparameter values we can use GridSearchCV or RandomizedSearchCV.

In [33]:
from sklearn.datasets import make_classification

from sklearn.metrics import accuracy_score

from sklearn.ensemble import BaggingClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

In [34]:
# Synthetic binary-classification dataset: 10,000 rows, 10 features, 3 of them informative.
# A fixed seed makes the dataset (and therefore every score below) reproducible
# under Restart Kernel -> Run All.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [35]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Baseline: a single decision tree, scored on the held-out split.
# `fit` returns the estimator itself, so construction and training can be chained.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8955

# Bagging

In [37]:
# Bagging: 100 decision trees, each trained on a bootstrap sample (drawn with
# replacement) whose size is 25% of the training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [38]:
# Fit the bagging ensemble on the training split.
bag.fit(X_train,y_train)



In [39]:
# Score the fitted bagging ensemble on the held-out test split.
y_pred = bag.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9245

In [40]:
# Which training-row indices each fitted decision tree in the ensemble was given

bag.estimators_samples_

[array([2523, 3113, 7114, ..., 4291, 4472, 3620]),
 array([4782,  663, 7155, ..., 5963,  495, 1767]),
 array([5462, 6574, 4896, ..., 3979, 7827,   37]),
 array([2848, 2629, 1591, ..., 7723, 1314, 1565]),
 array([3821, 6494, 1606, ..., 5686, 7870, 2558]),
 array([2261, 7922, 3649, ..., 4478, 6286, 6943]),
 array([ 652, 1676, 2291, ..., 2723, 7007, 6344]),
 array([2478, 4107, 1958, ..., 7979, 5695, 7854]),
 array([5800, 3548, 6540, ..., 3899,  831,   55]),
 array([5256, 7181, 3409, ..., 5286, 7535, 1335]),
 array([2675, 2834, 3817, ..., 1726, 2323, 7642]),
 array([3236, 7607, 4600, ...,  445, 7501, 6604]),
 array([4563, 4137, 6298, ..., 6611, 3023, 5529]),
 array([2816, 5343, 5817, ..., 3197, 2917, 5775]),
 array([2448, 2733, 5480, ...,  747, 5842,   69]),
 array([4248, 3828, 4630, ..., 1284, 2542, 3370]),
 array([4815, 1867,  503, ..., 4692, 7952, 4855]),
 array([1688, 4132, 5218, ..., 2491, 1876,  471]),
 array([4167, 2900, 3602, ..., 2974, 5705, 1466]),
 array([2920, 7645, 5452, ..., 

In [41]:
# Which feature (column) indices each fitted decision tree in the ensemble was given

bag.estimators_features_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),


In [42]:
# Number of row indices drawn for the first base estimator (max_samples=0.25 of the training rows).
bag.estimators_samples_[0].shape

(2000,)

In [43]:
# Number of feature indices used by the first base estimator (no feature sampling was configured here).
bag.estimators_features_[0].shape

(10,)

# Bagging using SVM

In [44]:
# Baseline: a single SVC with default hyperparameters, scored on the held-out split.
# `fit` returns the estimator itself, so construction and training can be chained.
svc = SVC().fit(X_train, y_train)
y_pr = svc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pr)

0.906

In [45]:
# Bagging with SVC as the base learner: 100 SVCs, each trained on a bootstrap
# sample sized at 25% of the training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    SVC(),
    n_estimators=100,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [46]:
# Train the SVC ensemble, predict on the held-out split, and report accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)



0.907

# Pasting

In [47]:
# Pasting: same ensemble as bagging, but bootstrap=False draws each tree's
# 25% row sample without replacement.
# verbose=1 prints joblib progress; n_jobs=-1 uses all available cores.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)

In [48]:
# Train the pasting ensemble, predict on the held-out split, and report accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    8.8s remaining:    8.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


0.9275

# Random Subspaces

In [49]:
# Random Subspaces: every tree sees all training rows (max_samples=1.0,
# bootstrap=False); only the features are sampled — half of them, drawn with
# replacement (bootstrap_features=True).
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=1.0,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42,
)

In [50]:
# Train the random-subspaces ensemble, predict on the held-out split, and report accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)



0.916

In [51]:
# Number of row indices given to the first base estimator (max_samples=1.0, so every training row).
bag.estimators_samples_[0].shape

(8000,)

In [52]:
# Number of feature indices drawn for the first base estimator (max_features=0.5 of the 10 features).
bag.estimators_features_[0].shape

(5,)

# Random patches

In [53]:
# Random Patches: each tree gets both a bootstrap sample of the rows (25%)
# and a with-replacement sample of half the features.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42,
)

In [54]:
# Train the random-patches ensemble, predict on the held-out split, and report accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)



0.9165

# OOB score

Out Of Bag Sample

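With bootstrap sampling (drawing with replacement), some rows are drawn several times for a given estimator while others are never drawn at all. When the sample is as large as the training set, each estimator sees about 63% of the distinct rows; the remaining ~37% are its out-of-bag (OOB) rows, which can be used as a built-in validation set.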

In [55]:
# Bagging with out-of-bag scoring: oob_score=True makes the fitted ensemble
# expose `oob_score_`, computed from rows each tree did not see during training.
# Requires bootstrap=True.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)

In [56]:
# Train the OOB-enabled ensemble, predict on the held-out split, and report accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)



0.9245

In [57]:
# Out-of-bag estimate of the ensemble's score on the training data
# (populated because oob_score=True was passed to the constructor).
bag.oob_score_

0.918875

# Applying GridSearchCV

In [64]:
from sklearn.model_selection import GridSearchCV

In [65]:
# Hyperparameter search space for BaggingClassifier:
# 3 * 5 * 2 * 4 = 120 candidate combinations.
parameters = dict(
    n_estimators=[5, 10, 50],
    max_samples=[0.1, 0.25, 0.5, 0.7, 1.0],
    bootstrap=[True, False],
    max_features=[0.2, 0.5, 0.7, 1.0],
)

In [70]:
# Exhaustive grid search over `parameters` with 2-fold cross-validation.
# random_state is fixed on the BaggingClassifier so that best_params_ and
# best_score_ are reproducible across runs.
# Parallelism is applied once, at the grid-search level (n_jobs=-1); the inner
# estimator is left at its default to avoid nested parallel pools.
search = GridSearchCV(
    BaggingClassifier(random_state=42),
    parameters,
    cv=2,
    n_jobs=-1,
)

In [71]:
# Run the grid search on the training split; the test split is not used here.
search.fit(X_train,y_train)

In [72]:
# Parameter combination that achieved the best mean cross-validated score.
search.best_params_

{'bootstrap': True,
 'max_features': 1.0,
 'max_samples': 0.5,
 'n_estimators': 50}

In [73]:
# Mean cross-validated score of the best parameter combination.
search.best_score_

0.9165