# Voting Classifiers: combining multiple classifiers with Scikit-Learn's VotingClassifier and evaluating their performance on a synthetic dataset

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Noisy two-moons toy problem, with a held-out test split
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three different model families take part in the vote.
# SVC is built with probability=True so it can also be used once the
# ensemble is switched to soft voting further down.
base_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
]

# First pass: majority ("hard") voting
voting_clf = VotingClassifier(estimators=base_models, voting='hard')
voting_clf.fit(X_train, y_train)

# Test accuracy of each fitted member on its own
for name, clf in voting_clf.named_estimators_.items():
    member_accuracy = clf.score(X_test, y_test)
    print(name, "=", member_accuracy)

# Compare the ensemble's decision on the first test instance with each member's vote
first_instance = X_test[:1]
print("Voting Classifier prediction:", voting_clf.predict(first_instance))
member_votes = []
for fitted_member in voting_clf.estimators_:
    member_votes.append(fitted_member.predict(first_instance))
print("Individual predictions:", member_votes)

# Ensemble accuracy under hard voting
hard_accuracy = voting_clf.score(X_test, y_test)
print("Voting Classifier (hard voting) accuracy:", hard_accuracy)

# Second pass: switch the same ensemble to soft voting and refit it
voting_clf.set_params(voting="soft")
voting_clf.fit(X_train, y_train)

# Ensemble accuracy under soft voting
soft_accuracy = voting_clf.score(X_test, y_test)
print("Voting Classifier (soft voting) accuracy:", soft_accuracy)


lr = 0.864
rf = 0.896
svc = 0.896
Voting Classifier prediction: [1]
Individual predictions: [array([1]), array([1]), array([0])]
Voting Classifier (hard voting) accuracy: 0.912
Voting Classifier (soft voting) accuracy: 0.92


# Bagging and Pasting

In [3]:
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Noisy two-moons data, split into training and test portions
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Settings shared by both ensembles below: 500 trees, all CPU cores, fixed seed
shared_bagging_kwargs = dict(n_estimators=500, n_jobs=-1, random_state=42)

# Bagging ensemble of decision trees with max_samples=100
bag_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    max_samples=100,
    **shared_bagging_kwargs,
).fit(X_train, y_train)

# Second ensemble with out-of-bag (OOB) evaluation switched on
bag_clf_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    oob_score=True,
    **shared_bagging_kwargs,
).fit(X_train, y_train)

# OOB score computed during fitting
oob_score = bag_clf_oob.oob_score_
print("OOB Score:", oob_score)

# Accuracy on the held-out test set, for comparison with the OOB score
y_pred = bag_clf_oob.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

# OOB decision function rows for the first three training instances
oob_decision_function = bag_clf_oob.oob_decision_function_[:3]
print("OOB Decision Function Probabilities for the first 3 instances:\n", oob_decision_function)


OOB Score: 0.896
Test Set Accuracy: 0.92
OOB Decision Function Probabilities for the first 3 instances:
 [[0.32352941 0.67647059]
 [0.3375     0.6625    ]
 [1.         0.        ]]


# Random Forests: training a RandomForestClassifier and a comparable BaggingClassifier

In [4]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Iris as pandas objects so the feature names are available as column labels
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target

# Hold out a test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Random forest: 500 trees, each limited to 16 leaf nodes
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Random forest predictions on the test split
y_pred_rf = rnd_clf.predict(X_test)

# One line per feature: rounded importance followed by the feature name
print("Feature Importances from Random Forest Classifier:")
feature_names = iris.data.columns
for name, score in zip(feature_names, rnd_clf.feature_importances_):
    print(round(score, 2), name)

# Bagging ensemble of trees configured like the forest's trees
# (sqrt feature subsampling, 16 leaf nodes)
tree_template = DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16)
bag_clf = BaggingClassifier(
    estimator=tree_template,
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

# Bagging predictions on the test split
y_pred_bag = bag_clf.predict(X_test)
print("\nPredictions from Bagging Classifier:", y_pred_bag)


Feature Importances from Random Forest Classifier:
0.11 sepal length (cm)
0.03 sepal width (cm)
0.44 petal length (cm)
0.41 petal width (cm)

Predictions from Bagging Classifier: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]


# 1. Boosting (AdaBoost)

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Noisy two-moons data and its train/test split
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# AdaBoost over depth-1 decision trees (decision stumps)
decision_stump = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(
    estimator=decision_stump,
    n_estimators=30,
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)

# Accuracy on the held-out test split
ada_test_accuracy = ada_clf.score(X_test, y_test)
print("AdaBoost Classifier Score:", ada_test_accuracy)



AdaBoost Classifier Score: 0.904




# 2. Gradient Boosting

In [7]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Noisy quadratic data: y = 3x² + Gaussian noise, with x drawn from [-0.5, 0.5)
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
gaussian_noise = 0.05 * np.random.randn(100)
y = 3 * X[:, 0] ** 2 + gaussian_noise

# Manual gradient boosting, stage 1: fit a shallow tree to the targets
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

# Stage 2: fit a new tree to what stage 1 left unexplained
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=43)
tree_reg2.fit(X, y2)

# Stage 3: fit a new tree to what stage 2 left unexplained
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=44)
tree_reg3.fit(X, y3)

# The manual ensemble's prediction is the sum of the three stage predictions
X_new = np.array([[-0.4], [0.], [0.5]])
stage_predictions = [
    stage.predict(X_new) for stage in (tree_reg1, tree_reg2, tree_reg3)
]
predictions = sum(stage_predictions)
print("Predictions:", predictions)

# The same setup expressed with GradientBoostingRegressor: three depth-2 trees
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
    random_state=42,
)
gbrt.fit(X, y)

# Early stopping: allow up to 500 trees, with n_iter_no_change=10
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.05,
    n_estimators=500,
    n_iter_no_change=10,
    random_state=42,
)
gbrt_best.fit(X, y)

# Number of trees actually fitted before training stopped
print("Number of estimators after early stopping:", gbrt_best.n_estimators_)


Predictions: [0.49484029 0.04021166 0.75026781]
Number of estimators after early stopping: 92


# 3. Pipeline with HistGradientBoosting

In [11]:
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder

# Load the California housing dataset once and reuse the same result for both
# the feature frame and the target series.
housing_bunch = fetch_california_housing(as_frame=True)
housing = housing_bunch.data
housing_labels = housing_bunch.target

# scikit-learn's copy of this dataset contains only numeric columns: it has no
# "ocean_proximity" column, so selecting that name would raise as soon as the
# pipeline is fit. Detect categorical (non-numeric) columns from the frame
# instead; the list is simply empty when there are none.
categorical_columns = housing.select_dtypes(exclude="number").columns.tolist()

# The column transformer emits the ordinal-encoded columns first and the
# passthrough remainder after them, so the categorical features occupy the
# leading positions of the matrix handed to the regressor.
categorical_positions = list(range(len(categorical_columns)))

# Create a pipeline with HistGradientBoostingRegressor
hgb_reg = make_pipeline(
    make_column_transformer(
        (OrdinalEncoder(), categorical_columns),
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(
        # None means "no categorical features" when nothing was detected
        categorical_features=categorical_positions or None,
        random_state=42,
    ),
)

# Fit the pipeline so that configuration errors surface in this cell
hgb_reg.fit(housing, housing_labels)




# Stacking Classifier

In [12]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Rebuild the moons split inside this cell instead of relying on
# X_train / X_test / y_train / y_test left over from the AdaBoost cell.
# The cells in between rebind X and y to regression data and load a different
# dataset, so depending on leftover kernel state is fragile. The same seeds
# are used, so the split is identical to the earlier one.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a Stacking Classifier: three base models whose cross-validated
# predictions are fed to a random forest acting as the final estimator
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)

# Fit the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Score the Stacking Classifier on the held-out test split
print("Stacking Classifier Score:", stacking_clf.score(X_test, y_test))


Stacking Classifier Score: 0.928
