In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
import numpy as np

In [2]:
x,y=make_classification(n_samples=10000, n_features=10, n_informative=3)

In [3]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [4]:
dt=DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)

print("Decision Tree Accuracy : ",np.round(accuracy_score(y_test,y_pred),2))

Decision Tree Accuracy :  0.94


## BAGGING:

Bagging Using Decision Tree:


In [5]:
bag= BaggingClassifier(
    base_estimator= DecisionTreeClassifier(),
    n_estimators=500,
    max_samples= 0.3,
    bootstrap= True,
    random_state=42
)

bag.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.3,
                  n_estimators=500, random_state=42)

In [6]:
y_pred= bag.predict(x_test)

print("Bagging Classifier Accuracy : ",np.round(accuracy_score(y_test,y_pred),2))

Bagging Classifier Accuracy :  0.96


Bagging Using SVM:

In [7]:
bag = BaggingClassifier(
    base_estimator= SVC(),
    n_estimators=500,
    max_samples=0.3,
    bootstrap= True,
    random_state= 42,
    n_jobs=-1
)

bag.fit(x_train,y_train)

BaggingClassifier(base_estimator=SVC(), max_samples=0.3, n_estimators=500,
                  n_jobs=-1, random_state=42)

In [8]:
print("Bagging SVM accuracy : ",np.round(accuracy_score(y_test,bag.predict(x_test)),2))

Bagging SVM accuracy :  0.95


## PASTING:


In [9]:
bag= BaggingClassifier(
    base_estimator= DecisionTreeClassifier(),
    n_estimators=500,
    max_samples= 0.3,
    bootstrap= False,
    random_state=42,
    n_jobs=-1,
    verbose= 1
)

bag.fit(x_train,y_train)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    9.6s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    9.6s finished


BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.3, n_estimators=500, n_jobs=-1, random_state=42,
                  verbose=1)

In [10]:
y_pred= bag.predict(x_test)

print("Pasting Accuracy : ",np.round(accuracy_score(y_test,y_pred),2))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Pasting Accuracy :  0.96


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s finished


## RANDOM SUBSPACES

In [11]:
bag= BaggingClassifier(
    base_estimator= DecisionTreeClassifier(),
    n_estimators=500,
    max_samples= 1.0,
    bootstrap= False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

bag.fit(x_train,y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  bootstrap_features=True, max_features=0.5, n_estimators=500,
                  random_state=42)

In [12]:
y_pred= bag.predict(x_test)

print("Random Subspaces Accuracy : ",np.round(accuracy_score(y_test,y_pred),2))

Random Subspaces Accuracy :  0.96


## RANDOM PATCHES

In [13]:
bag= BaggingClassifier(
    base_estimator= DecisionTreeClassifier(),
    n_estimators=500,
    max_samples= 0.3,
    bootstrap= True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

bag.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=0.5, max_samples=0.3,
                  n_estimators=500, random_state=42)

In [14]:
y_pred= bag.predict(x_test)

print("Random Patches Accuracy : ",np.round(accuracy_score(y_test,y_pred),2))

Random Patches Accuracy :  0.96


###  OOB SCORE

In [15]:
bag= BaggingClassifier(
    base_estimator= DecisionTreeClassifier(),
    n_estimators=500,
    max_samples= 0.3,
    bootstrap= True,
    oob_score= True,
    random_state=42
)

bag.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.3,
                  n_estimators=500, oob_score=True, random_state=42)

In [16]:
bag.oob_score_

0.96175

In [17]:
accuracy_score(y_test,bag.predict(x_test))

0.9625


### GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
parameters={
    'n_estimators':[50,100,250,500],
    'max_samples': [0.25,0.3,0.4,0.5,0.7],
    'bootstrap' :[True,False],
    'max_features' : [0.25,0.3,0.5,0.75,1.0]
}

In [22]:
grid=GridSearchCV(BaggingClassifier(n_jobs=-1),parameters,n_jobs=-1,verbose=1,cv=5)

In [23]:
grid.fit(x_train,y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


GridSearchCV(cv=5, estimator=BaggingClassifier(n_jobs=-1), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_features': [0.25, 0.3, 0.5, 0.75, 1.0],
                         'max_samples': [0.25, 0.3, 0.4, 0.5, 0.7],
                         'n_estimators': [50, 100, 250, 500]},
             verbose=1)

In [24]:
grid.best_params_

{'bootstrap': False,
 'max_features': 0.75,
 'max_samples': 0.7,
 'n_estimators': 500}

In [25]:
grid.best_score_

0.9643750000000001

## BAGGING TIPS:






*   Usually, Bagging gives better results than Pasting.


*   Good Results come around the 25% to 50% row sampling mark.


*   Random Patches and Subspaces should be used while dealing with high dimensional data.

*   To find the correct hyperparameter values use GridSearchCV/RandomSearchCV.



