<a href="https://colab.research.google.com/github/Saifullah785/machine-learning-engineer-roadmap/blob/main/Lecture_64_Bagging_Ensemble_Classifiers_%26_Regression/Lecture_64_part_02_bagging_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
# Generate a synthetic classification dataset
X,y = make_classification(n_samples=10000, n_features=10, n_informative=3) #  n_redundant=0, random_state=0, shuffle=False

In [3]:
# Print the shape of the features array (number of samples, number of features)
X.shape

(10000, 10)

In [4]:
# Display the features array (X)
X

array([[ 0.97080114,  0.50259042,  2.45915803, ...,  0.8271113 ,
        -1.9306025 ,  0.1064598 ],
       [ 0.65794142, -0.07220087,  1.26201956, ...,  0.61584536,
        -1.11105578,  1.19334863],
       [ 0.43645226, -0.51106833,  0.02658565, ..., -1.52573252,
        -0.63095649,  1.55708913],
       ...,
       [-1.41220992,  0.08558096,  0.96109598, ...,  0.83852691,
         0.90825569,  1.0593133 ],
       [ 1.17054394,  0.05070486,  2.51728435, ...,  1.0295025 ,
        -1.78006888,  0.37118526],
       [ 0.57691234, -0.72781487,  0.25289304, ...,  0.37984162,
        -0.3138939 ,  0.04367133]])

In [5]:
# Display the target variable array (y)
y

array([1, 1, 0, ..., 0, 0, 1])

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Initialize a Decision Tree Classifier
dt = DecisionTreeClassifier()

# Train the Decision Tree model on the training data
dt.fit(X_train, y_train)

In [8]:
# Make predictions on the test set using the Decision Tree model
y_pred = dt.predict(X_test)

# Calculate and print the accuracy score of the Decision Tree model
print("Decision Tree accuracy",accuracy_score(y_test, y_pred))

Decision Tree accuracy 0.8915


Bagging

In [9]:
# Initialize a Bagging Classifier with Decision Tree as the base estimator
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [10]:
# Train the Bagging Classifier model on the training data
bag.fit(X_train, y_train)

In [11]:
# Make predictions on the test set using the Bagging Classifier model
y_pred = bag.predict(X_test)

In [12]:
# Calculate and print the accuracy score of the Bagging Classifier model
accuracy_score(y_test, y_pred)

0.932

In [13]:
# Print the shape of the samples used for the first estimator in the Bagging model
bag.estimators_samples_[0].shape

(2000,)

In [14]:
# Print the shape of the features used for the first estimator in the Bagging model
bag.estimators_features_[0].shape

(10,)

# **Bagging using SVM**

In [15]:
# Initialize a Bagging Classifier using Support Vector Machine (SVC) as the base estimator
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [16]:
# Train the Bagging Classifier model with SVC on the training data
bag.fit(X_train, y_train)

In [17]:
# Make predictions on the test set using the Bagging Classifier with SVC
y_pred = bag.predict(X_test)

# Calculate and print the accuracy score of the Bagging Classifier with SVC
print('Bagging using SVM',accuracy_score(y_test, y_pred))

Bagging using SVM 0.8985


# **Pasting**

In [18]:
# Initialize a Bagging Classifier for Pasting (bootstrap=False)
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose = 1,
    n_jobs=-1
)

In [19]:
# Train the Pasting Classifier model on the training data
bag.fit(X_train, y_train)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   13.4s finished


In [20]:
# Make predictions on the test set using the Pasting Classifier
y_pred = bag.predict(X_test)
# Calculate and print the accuracy score of the Pasting Classifier
print('Pasting classifier',accuracy_score(y_test, y_pred))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Pasting classifier 0.931


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished


# **Random Subspances**

In [21]:
# Initialize a Bagging Classifier for Random Subspaces (max_samples=1.0, bootstrap=False, max_features < 1.0, bootstrap_features=True)
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

In [22]:
# Train the Random Subspaces Classifier model on the training data
bag.fit(X_train, y_train)

In [23]:
# Make predictions on the test set using the Random Subspaces Classifier
y_pred = bag.predict(X_test)
# Calculate and print the accuracy score of the Random Subspaces Classifier
print('Random Subspaces classifier',accuracy_score(y_test, y_pred))
#

Random Subspaces classifier 0.9085


In [24]:
# Print the shape of the samples used for the first estimator in the Random Subspaces model
bag.estimators_samples_[0].shape

(8000,)

In [25]:
# Print the shape of the features used for the first estimator in the Random Subspaces model
bag.estimators_features_[0].shape

(5,)

# **Random Patches**

In [26]:
# Initialize a Bagging Classifier for Random Patches (max_samples < 1.0, bootstrap=True, max_features < 1.0, bootstrap_features=True)
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42

)

In [27]:
# Train the Random Patches Classifier model on the training data
bag.fit(X_train, y_train)

In [28]:
# Make predictions on the test set using the Random Patches Classifier
y_pred = bag.predict(X_test)
# Calculate and print the accuracy score of the Random Patches Classifier
print('Random Patches classifier',accuracy_score(y_test, y_pred))

Random Patches classifier 0.9015


# **OOB Score(Out of Bag Sample)**

In [29]:
# Initialize a Bagging Classifier with Out-of-Bag (OOB) scoring enabled
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42
)

In [30]:
# Train the Bagging Classifier model with OOB scoring on the training data
bag.fit(X_train, y_train)

In [31]:
# Print the Out-of-Bag (OOB) score of the Bagging Classifier model
bag.oob_score_

0.924625

In [32]:
# Make predictions on the test set using the Bagging Classifier model (with OOB scoring)
y_pred = bag.predict(X_test)
# Calculate and print the accuracy score on the test set
print('Random Patches classifier',accuracy_score(y_test, y_pred))

Random Patches classifier 0.932


**Bagging Tips**

- Bagging generally gives better results than pasting

- Good results come around the 25% to 50% row sampling mark

- Random patches and subspaces should be used while dealing with high dimensional data

- To find the correct hyperparameter values we can do Grid Search CV / RandomSearchCV

# **Applying GridSearch CV**

In [33]:
# Import GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [34]:
# Define the parameter grid for GridSearchCV
parameter ={
    'n_estimators' : [50,100,500],
    'max_samples' : [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
}

In [35]:
# Initialize GridSearchCV with BaggingClassifier and the defined parameter grid
search = GridSearchCV(BaggingClassifier(), parameter, cv=5)

In [37]:
# Perform the GridSearchCV to find the best hyperparameters
search.fit(X_train, y_train)

In [38]:
# Print the best score obtained from GridSearchCV
search.best_score_

In [39]:
# Print the best hyperparameters found by GridSearchCV
search.best_params_