# Implementing Ensemble Methods in Python
1. Bagging Example (Random Forest)



In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")


Random Forest Accuracy: 1.0


# 2. Boosting Example (AdaBoost)

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base classifier (Decision Tree)
base_classifier = DecisionTreeClassifier(max_depth=1)

# Initialize AdaBoost classifier
adaboost_classifier = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=50, random_state=42)

# Train the model
adaboost_classifier.fit(X_train, y_train)

# Make predictions
y_pred = adaboost_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"AdaBoost Accuracy: {accuracy}")


AdaBoost Accuracy: 1.0




# 3. Stacking

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train base models
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Generate predictions from base models
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)

# Stack predictions for meta-learner
stacked_predictions = np.column_stack((rf_pred, gb_pred))

# Initialize meta-learner (Logistic Regression)
meta_learner = LogisticRegression()

# Train meta-learner on stacked predictions
meta_learner.fit(stacked_predictions, y_test)

# Make final predictions using the stacked model
stacked_pred = meta_learner.predict(stacked_predictions)

# Evaluate accuracy
accuracy = accuracy_score(y_test, stacked_pred)
print(f"Stacked Model Accuracy: {accuracy}")


Stacked Model Accuracy: 1.0


#4. Generalization Bounds and VC Dimension in Ensemble Methods


# Here’s a practical example using cross-validation to estimate generalization performance:

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluate model using cross-validation
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)  # 5-fold cross-validation
average_accuracy = cv_scores.mean()

print(f"Average Cross-Validation Accuracy: {average_accuracy}")


Average Cross-Validation Accuracy: 0.9666666666666668


# VC Dimension
The VC dimension is a measure of the model's capacity to fit various functions and generalize to new data. You can experiment with different ensemble configurations and analyze their impact on model complexity and generalization.

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")

# Calculate VC Dimension
n_train = X_train.shape[0]
vc_dimension = 2 * np.log2(n_train)  # For binary classification, adjust for multi-class
print(f"Estimated VC Dimension: {vc_dimension}")


Random Forest Accuracy: 1.0
Estimated VC Dimension: 13.813781191217037


# 5. Ensuring Diversity in Ensembles
To ensure diversity among models in an ensemble, you can vary algorithms, training data subsets, and feature sets:

Varying Algorithms Example


In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize diverse models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression()

# Train models on different algorithms
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Combine predictions (e.g., voting or averaging)
ensemble_pred = np.mean([rf_pred, gb_pred, lr_pred], axis=0)

# Evaluate ensemble accuracy
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
print(f"Ensemble Accuracy with Diverse Models: {ensemble_accuracy}")


Ensemble Accuracy with Diverse Models: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Varying Training Data Subsets (Bagging Example)


In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base classifier (Decision Tree)
base_classifier = DecisionTreeClassifier(max_depth=3)

# Initialize Bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_classifier, n_estimators=50, max_samples=0.5, max_features=0.5, random_state=42)

# Train the model
bagging_classifier.fit(X_train, y_train)

# Make predictions
y_pred = bagging_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Bagging Classifier Accuracy: {accuracy}")


Bagging Classifier Accuracy: 1.0




# Varying Feature Sets Example

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vary feature sets (for example, using only petal features)
X_train_petal = X_train[:, 2:]  # Use only petal length and width
X_test_petal = X_test[:, 2:]

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on petal features
rf_classifier.fit(X_train_petal, y_train)

# Make predictions using petal features
y_pred_petal = rf_classifier.predict(X_test_petal)

# Evaluate accuracy
accuracy_petal = accuracy_score(y_test, y_pred_petal)
print(f"Random Forest Accuracy with Petal Features: {accuracy_petal}")


Random Forest Accuracy with Petal Features: 1.0
