In [4]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m890.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
I

In [5]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


In [10]:
X,y = make_classification(n_samples=10000, n_features=10, n_informative=3)

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

print("Decision Tree accuracy ",accuracy_score(y_test,y_pred))

Decision Tree accuracy  0.841


## Bagging

In [18]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # The base model (a decision tree)
    n_estimators=500,                         # Number of base estimators (trees) in the ensemble
    max_samples=0.5,                          # Fraction of samples to draw for each estimator
    bootstrap=True,                           # Use bootstrap sampling (sampling with replacement)
    random_state=42                           # Seed for reproducibility
)

In [19]:
bag.fit(X_train,y_train)

In [20]:
bag_pred = bag.predict(X_test)

In [21]:
print("Bagging accuracy ",accuracy_score(y_test,bag_pred))

Bagging accuracy  0.8895


## Bagging using SVM

In [23]:
bag = BaggingClassifier(estimator=SVC(),n_estimators=500,max_samples=0.5,bootstrap=True,random_state=42)

In [24]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Bagging SVM accuracy ",accuracy_score(y_test,y_pred))

Bagging SVM accuracy  0.89


## Pasting

In [27]:
bag = BaggingClassifier(estimator=DecisionTreeClassifier(),n_estimators=500,max_samples=0.25,bootstrap=False,random_state=42, verbose=1, n_jobs=-1)

In [28]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Pasting accuracy ",accuracy_score(y_test,y_pred))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    5.2s remaining:   15.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Pasting accuracy  0.89


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.3s finished


## Random Subspaces

In [29]:
bag = BaggingClassifier(estimator=DecisionTreeClassifier(),n_estimators=500,max_samples=1.0,bootstrap=False,max_features=0.5,random_state=42)

In [30]:
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Subspaces accuracy ",accuracy_score(y_test,y_pred))

Random Subspaces accuracy  0.8835


In [31]:
bag.estimators_samples_[0].shape

(8000,)

In [32]:
bag.estimators_features_[0].shape

(5,)

## Random patches

## OOB score

In [33]:
bag = BaggingClassifier(estimator=DecisionTreeClassifier(),n_estimators=500,max_samples=0.25,bootstrap=True,oob_score=True,random_state=42)

In [34]:
oob_bag = bag.fit(X_train,y_train)
print("OOB Score ",oob_bag.oob_score_)
y_pred = oob_bag.predict(X_test)
print("OOB accuracy ",accuracy_score(y_test,y_pred))


OOB Score  0.88675
OOB accuracy  0.889


## Applying GridSearchCV

In [35]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV
# apply grid search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_samples': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

search = GridSearchCV(bag, param_grid, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print("Best parameter ",search.best_params_)


Best parameter  {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 500}
