In [13]:
!nvidia-smi

Sat Jan 17 12:48:07 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [14]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [15]:
X,y = make_classification(n_samples=10000, n_features=10,n_informative=3)

In [16]:
X

array([[ 1.65080003, -0.04896478,  0.83326893, ..., -0.64734432,
         0.87229314, -1.55265993],
       [-2.13834211, -1.04145662,  0.51032439, ..., -1.80630374,
         1.44183658, -1.05297771],
       [-0.42664236,  0.95923879,  0.3673115 , ..., -0.13668566,
         0.49780823,  2.36281816],
       ...,
       [-3.7187484 , -1.41321958,  0.05272883, ..., -4.21119601,
         3.82089365, -0.24852942],
       [-1.16742648,  0.50519544,  0.06523379, ..., -1.03598627,
         1.26056264, -0.31605093],
       [-1.78450068,  0.78347175,  0.73801986, ..., -1.61750457,
         1.43216579,  1.13193263]])

In [17]:
y

array([1, 0, 0, ..., 0, 0, 0])

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Single Decision Tree Classifier `Accuracy = 92%`

In [19]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

print("Decision Tree accuracy",accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.909


## Bagging using Decision Trees(Random Forest) `Accuracy = 95%`

In [20]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,# Number of trees
    max_samples=0.5, # Number of rows(1/2) to sample
    bootstrap=True, # With replacement
    random_state=42 # Random state
)

In [21]:
bag.fit(X_train,y_train)

In [22]:
y_pred = bag.predict(X_test)

### Huge change in Accuracy By Using Bagging(D-Trees) with replacement

In [23]:
accuracy_score(y_test,y_pred)

0.9425

In [24]:
bag.estimators_samples_[0].shape

(4000,)

In [25]:
bag.estimators_features_[0].shape

(10,)

## Bagging using SVM `Accuracy = 92%` 

In [26]:
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500, # Number of trees 
    max_samples=0.5, # Number of rows(1/2) to sample
    bootstrap=True, # With replacement
    random_state=42 # Random state
)

In [27]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Bagging using SVM",accuracy_score(y_test,y_pred))

Bagging using SVM 0.921


## Pasting with Decision Trees `Accuracy = 94%`

In [28]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500, # Number of trees
    max_samples=0.5, # Number of rows(1/2) to sample
    bootstrap=False, # Without replacement rows can't be duplicated
    random_state=42, # Random state
    verbose = 1, # Verbose to see the progress
    n_jobs=-1 # Number of cores
)

In [29]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Pasting classifier",accuracy_score(y_test,y_pred))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   26.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Pasting classifier 0.9425


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished


## Random Subspaces `Accuracy = 92%` 

In [30]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500, # Number of trees
    max_samples=1.0, # Number of rows(1) to sample
    bootstrap=False, # Without replacement rows can't be duplicated
    max_features=0.5, # Number of features(1/2) to sample
    bootstrap_features=True, # With replacement
    random_state=42 # Random state
)

In [31]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Subspaces classifier",accuracy_score(y_test,y_pred))

Random Subspaces classifier 0.9265


In [32]:
bag.estimators_samples_[0].shape

(8000,)

In [33]:
bag.estimators_features_[0].shape

(5,)

## Random Patches `Accuracy = 92%`

In [34]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500, # Number of trees
    max_samples=0.25, # Number of rows(1/4) to sample
    bootstrap=True, # With replacement
    max_features=0.5, # Number of features(1/2) to sample
    bootstrap_features=True, # With replacement
    random_state=42 # Random state
)

In [35]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Patches classifier",accuracy_score(y_test,y_pred))

Random Patches classifier 0.927


## OOB Score is `95%`

### It is for those samples `37%` that don't come to models for training and we can use them for model evaluation. OOB score roughly tell the accuracy of the model.

In [36]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500, # Number of trees
    max_samples=0.25, # Number of rows(1/4) to sample
    bootstrap=True, # With replacement
    oob_score=True, # OOB Score
    random_state=42 # Random state
)

In [37]:
bag.fit(X_train,y_train)

In [38]:
bag.oob_score_

0.93875

In [39]:
y_pred = bag.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.9415


## Bagging Tips

- Bagging generally gives better results than Pasting
- Good results come around the 25% to 50% row sampling mark
- Random patches and subspaces should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

## Applying GridSearchCV

In [44]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [45]:
parameters = {
    'n_estimators': [50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0],
    'bootstrap': [True, False]
}


In [46]:
search = GridSearchCV(
    BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        oob_score=False,
        random_state=42
    ),
    parameters,
    cv=5,
    n_jobs=-1
)

In [47]:
search.fit(X_train, y_train)

In [48]:
search.best_params_


{'bootstrap': True,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 50}

In [49]:
search.best_score_

np.float64(0.9403750000000001)

In [50]:
best_model = search.best_estimator_

In [51]:
from sklearn.metrics import accuracy_score

y_test_pred = best_model.predict(X_test)
accuracy_score(y_test, y_test_pred)


0.939