<a href="https://colab.research.google.com/github/SangamSilwal/Machine-learning-Series/blob/main/Bagging_Classifier_day_30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
# Synthetic classification data: 10,000 rows, 10 features, 3 of them informative.
# random_state is fixed so the generated dataset (and therefore every score
# computed from it) is identical on each Restart & Run All.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [4]:
# Baseline: a single decision tree, used as the reference point for the ensembles.
# fit() returns the estimator itself, so construction and training can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

tree_accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy: ", tree_accuracy)

Decision Tree Accuracy:  0.886


# Bagging

In [7]:
# Bagging: 500 decision trees, each trained on a sample sized at 25% of the
# training rows, drawn with replacement (bootstrap=True).
base_tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    estimator=base_tree,
    random_state=42,
    n_estimators=500,
    bootstrap=True,
    max_samples=0.25,
)

In [8]:
# Train the bagged decision trees on the training split.
bag.fit(X_train,y_train)

In [9]:
# Predict test-set labels with the fitted bagging ensemble.
y_pred = bag.predict(X_test)

In [10]:
# Test-set accuracy of the bagged trees (shown as the cell's result).
accuracy_score(y_test,y_pred)

0.938

In [14]:
# Training-row indices drawn for each of the first 10 estimators
# (one index array per estimator).
bag.estimators_samples_[0:10]

[array([2523, 3113, 7114, ..., 4291, 4472, 3620]),
 array([4782,  663, 7155, ..., 5963,  495, 1767]),
 array([5462, 6574, 4896, ..., 3979, 7827,   37]),
 array([2848, 2629, 1591, ..., 7723, 1314, 1565]),
 array([3821, 6494, 1606, ..., 5686, 7870, 2558]),
 array([2261, 7922, 3649, ..., 4478, 6286, 6943]),
 array([ 652, 1676, 2291, ..., 2723, 7007, 6344]),
 array([2478, 4107, 1958, ..., 7979, 5695, 7854]),
 array([5800, 3548, 6540, ..., 3899,  831,   55]),
 array([5256, 7181, 3409, ..., 5286, 7535, 1335])]

In [16]:
# Number of feature indices used by the first estimator. No max_features was
# passed to this ensemble, so all 10 generated features are expected here.
bag.estimators_features_[0].shape

(10,)

# Bagging Using SVM

In [17]:
# Same bagging setup as with the trees, but with an SVC as the base estimator.
# verbose=1 makes fit/predict log their progress and elapsed time.
base_svc = SVC()
bag = BaggingClassifier(
    estimator=base_svc,
    random_state=42,
    n_estimators=500,
    bootstrap=True,
    max_samples=0.25,
    verbose=1,
)

In [18]:
# Train the 500 SVC estimators; the elapsed time is logged because verbose=1.
bag.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.1s finished


In [19]:
# Score the SVC-based ensemble on the held-out test split.
y_pred = bag.predict(X_test)
svc_bag_accuracy = accuracy_score(y_test, y_pred)
print("Bagging with SVC accuracy score: ", svc_bag_accuracy)

Bagging with SVC accuracy score:  0.915


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.7s finished


# Pasting
Sampling without Replacement

In [21]:
# Pasting: identical to bagging except that each 25% sample of rows is drawn
# WITHOUT replacement (bootstrap=False). n_jobs=-1 spreads the work over all
# available cores.
base_tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    estimator=base_tree,
    random_state=42,
    n_estimators=500,
    bootstrap=False,
    max_samples=0.25,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Train the pasting ensemble, then score it on the held-out test split.
# fit() returns the estimator, so predict() can be chained directly.
y_pred = bag.fit(X_train, y_train).predict(X_test)
pasting_accuracy = accuracy_score(y_test, y_pred)
print("Pasting Accuracy: ", pasting_accuracy)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Pasting Accuracy:  0.935


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished


# Random Subspaces
In Random Subspaces we sample only the columns (features); there is no row sampling, so every estimator is trained on all the rows.

In [23]:
# Random Subspaces: every estimator gets all rows (max_samples=1.0, no bootstrap)
# but only a random half of the columns (max_features=0.5), with the feature
# sampling done with replacement (bootstrap_features=True).
base_tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    estimator=base_tree,
    random_state=42,
    n_estimators=500,
    # row sampling: disabled
    max_samples=1.0,
    bootstrap=False,
    # column sampling: enabled
    max_features=0.5,
    bootstrap_features=True,
)

In [24]:
# Train the random-subspaces ensemble, then score it on the held-out test split.
# fit() returns the estimator, so predict() can be chained directly.
y_pred = bag.fit(X_train, y_train).predict(X_test)
subspaces_accuracy = accuracy_score(y_test, y_pred)
print("Random Subspaces Accuracy: ", subspaces_accuracy)


Random Subspaces Accuracy:  0.931


In [26]:
# Rows used by the first estimator: with max_samples=1.0 and bootstrap=False the
# row count is not reduced, so this matches the size of the training split.
bag.estimators_samples_[0].shape

(8000,)

In [27]:
# Features used by the first estimator: max_features=0.5 selects half of the
# 10 columns. Because bootstrap_features=True draws them with replacement, the
# selected indices may contain repeats.
bag.estimators_features_[0].shape

(5,)

# Random Patches

In [28]:
# Random Patches: sample both rows and columns. Each estimator sees 25% of the
# rows and half of the features, both drawn with replacement.
base_tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    estimator=base_tree,
    random_state=42,
    n_estimators=500,
    # row sampling
    max_samples=0.25,
    bootstrap=True,
    # column sampling
    max_features=0.5,
    bootstrap_features=True,
)

In [29]:
# Train the random-patches ensemble on the training split.
bag.fit(X_train,y_train)

In [30]:
# Score the random-patches ensemble on the held-out test split.
y_pred = bag.predict(X_test)
patches_accuracy = accuracy_score(y_test, y_pred)
print("Random Patches Accuracy: ", patches_accuracy)

Random Patches Accuracy:  0.9335


# OOB Score

In [32]:
# Sampling rows with replacement leaves some training rows out of each
# estimator's sample; these are the "out-of-bag" (OOB) rows.
# The familiar ~37% (about e^-1) OOB share applies when the bootstrap sample is
# as large as the training set. With max_samples=0.25, as used here, roughly
# 78% (about e^-0.25) of the rows are out-of-bag for any single estimator.
# oob_score=True uses those unseen rows to evaluate the model's performance
# without needing a separate validation set.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

In [33]:
# Train the ensemble; the out-of-bag score is read from bag.oob_score_ afterwards.
bag.fit(X_train,y_train)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.7s finished


In [34]:
# Out-of-bag score of the fitted ensemble (requested via oob_score=True).
bag.oob_score_

0.921125

Because bootstrap=True samples rows with replacement, each estimator never sees some of the training rows. Setting oob_score=True uses these left-out (out-of-bag) rows to estimate the model's accuracy, without needing a separate validation set.

# Bagging Tips
1. Bagging generally gives better result than Pasting
2. Good results come around the 25% to 50% row sampling mark
3. Random Patches and Random Subspaces should be used when dealing with high-dimensional datasets
4. To find the best hyperparameters we can use GridSearchCV / RandomizedSearchCV