# Bagged Decision Trees for Classification
#begging --- dataset
#boosting --prediction
#stacking --results

In [27]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=2020)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)
max_features = 3
kfold = KFold(n_splits=10, shuffle=True, random_state=2020)
decision_tree = DecisionTreeClassifier(max_features=max_features)
num_trees = 100
bagging_model = BaggingClassifier(base_estimator=decision_tree, n_estimators=num_trees,random_state=2020)
results = cross_val_score(bagging_model, X_train, y_train, cv=kfold)
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std()))



Accuracy: 0.96 (+/- 0.01)


# Decesion Tree

In [28]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging Classifier
max_features = 3
kfold = KFold(n_splits=10, shuffle=True, random_state=2020)
# Decision Tree base estimator
dt = DecisionTreeClassifier(max_features=max_features)
num_trees = 100
bagging_model = BaggingClassifier(base_estimator=dt, n_estimators=num_trees, random_state=2020)
# Cross-validation
results = cross_val_score(bagging_model, X_train, y_train, cv=kfold)
print("Bagging Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std()))
# Random Forest Classifier
num_trees_rf = 100
max_features_rf = 3
kfold_rf = KFold(n_splits=10, shuffle=True, random_state=2020)
rf_model = RandomForestClassifier(n_estimators=num_trees_rf, max_features=max_features) 
num_trees=100
# Cross-validation
results_rf = cross_val_score(rf_model, X_train, y_train, cv=kfold_rf)
print("Random Forest Accuracy: %0.2f (+/- %0.2f)" % (results_rf.mean(), results.std()))



Bagging Accuracy: 0.96 (+/- 0.01)
Random Forest Accuracy: 0.96 (+/- 0.01)


# Adaboost Classifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
# Assuming train_x, train_y, test_x, test_y are your training and testing data
# Adjust these names based on your actual data
# Instantiate AdaBoostClassifier with DecisionTree base estimator
clf_boosting = AdaBoostClassifier(
 base_estimator=DecisionTreeClassifier(max_depth=1),
 n_estimators=200
)
# Fit the model
clf_boosting.fit(X_train, y_train)
# Make predictions
predictions = clf_boosting.predict(X_test)
# Calculate and print F1 Score and Accuracy
print("For Boosting: F1 Score {}, Accuracy {}".format(
 round(f1_score(y_test, predictions), 2),
 round(accuracy_score(y_test, predictions), 2)
))




For Boosting: F1 Score 0.93, Accuracy 0.93


# Random Forest as a Bagging classifier

In [30]:
clf_bagging = RandomForestClassifier(n_estimators=200 , max_depth= 1)
clf_bagging.fit(train_x , train_y )
predictions = clf_bagging.predict(test_x)
print("For Bagging : F1 Score {} , Accuracy {} ".format(round( f1_score (test_y,predictions) ,2) ,
                                                         round(accuracy_score (test_y , predictions ) ,2) ))

For Bagging : F1 Score 0.75 , Accuracy 0.74 


In [31]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
# Create a synthetic dataset
X, y = make_classification(
 n_samples=800, # Total number of samples
 n_features=20, # Number of features
 n_informative=10, # Number of informative features
 n_redundant=5, # Number of redundant features
 n_clusters_per_class=2, # Number of clusters per class
 weights=[0.5, 0.5], # Class distribution (balanced)
 random_state=42
)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.2, random_state=42
)
# Model training using RandomForestClassifier as an example
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
predictions = rf_model.predict(X_test)
# Calculate and print F1 Score and Accuracy
print("For Random Forest: F1 Score {}, Accuracy {}".format(
 round(f1_score(y_test, predictions), 2),
 round(accuracy_score(y_test, predictions), 2)
))


For Random Forest: F1 Score 0.88, Accuracy 0.88


# Stacking

In [32]:
# Create a synthetic dataset
X, y = make_classification(
 n_samples=800, # Total number of samples
 n_features=20, # Number of features
 n_informative=10, # Number of informative features
 n_redundant=5, # Number of redundant features
 n_clusters_per_class=2, # Number of clusters per class
 weights=[0.5, 0.5], # Class distribution (balanced)
 random_state=42
)
# Split the dataset into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(
 X, y, test_size=0.2, random_state=42
)



In [49]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
class NumberOfClassifierException(Exception):
    pass
class Stacking():
    def __init__(self, classifiers):
        if len(classifiers) < 2:
            raise NumberOfClassifierException("You must fit your classifier with 2 classifiers at least ");
        else:
            self._classifiers = classifiers
    def fit(self, data_x, data_y):
        stacked_data_x = data_x.copy()
        for classifier in self._classifiers[:-1]:
            classifier.fit(data_x, data_y)
            #stacked_data_x = np.column_stack((stacked_data_x, classifier.predic
            stacked_data_x = np.column_stack((stacked_data_x ,classifier.predict_proba(data_x)))
 
        last_classifier = self._classifiers[-1]
        last_classifier.fit(stacked_data_x, data_y)
    def predict(self, data_x):
        stacked_data_x = data_x.copy()
        for classifier in self._classifiers[:-1]:
            prob_predictions = classifier.predict_proba(data_x)
            #stacked_data_x = np.column_stack((stacked_data_x, prob_predictions)
            stacked_data_x = np.column_stack((stacked_data_x, prob_predictions))
        last_classifier = self._classifiers[-1]
        return last_classifier.predict(stacked_data_x)
# Creating classifiers
boosting_clf_ada_boost = AdaBoostClassifier(
base_estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=3
)
clf_rf = RandomForestClassifier(
n_estimators=200,
max_depth=1,
random_state=2020
)
clf_adaboost = AdaBoostClassifier(
base_estimator=DecisionTreeClassifier(max_depth=1, random_state=2020),
n_estimators=3
)
clf_logistic_reg = LogisticRegression(solver='liblinear', random_state=2020)
# Customizing and Exception message
classifiers_list = [clf_rf, clf_adaboost, clf_logistic_reg]
clf_stacking = Stacking(classifiers_list)
# Fit models
clf_rf.fit(train_x, train_y)
 
boosting_clf_ada_boost.fit(train_x, train_y)
clf_stacking.fit(train_x, train_y)
# Make predictions
predictions_bagging = clf_rf.predict(test_x)
predictions_boosting = boosting_clf_ada_boost.predict(test_x)
predictions_stacking = clf_stacking.predict(test_x)
# Print results
print("For Bagging: F1 Score {}, Accuracy {}".format(
round(f1_score(test_y, predictions_bagging), 2),
round(accuracy_score(test_y, predictions_bagging), 2)
))
print("For Boosting: F1 Score {}, Accuracy {}".format(
round(f1_score(test_y, predictions_boosting), 2),
round(accuracy_score(test_y, predictions_boosting), 2)
))
print("For Stacking: F1 Score {}, Accuracy {}".format(
round(f1_score(test_y, predictions_stacking), 2),
round(accuracy_score(test_y, predictions_stacking), 2)
))



For Bagging: F1 Score 0.73, Accuracy 0.72
For Boosting: F1 Score 0.71, Accuracy 0.71
For Stacking: F1 Score 0.81, Accuracy 0.8




# LabTask
1=Use BaggingClassifier and RandomForestClassifier from sklearn library and implement them
on diabetes data set.

In [52]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import diabetes 
data = load_diabetes()
X_df = data.data
y_train = data.target


max_features = 3
kfold = KFold(n_splits=10, shuffle=True, random_state=2020)
decision_tree = DecisionTreeClassifier(max_features=max_features)
num_trees = 100
bagging_model = BaggingClassifier(base_estimator=decision_tree, n_estimators=num_trees,random_state=2020)
results = cross_val_score(bagging_model, X_train, y_train, cv=kfold)
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std()))

ImportError: cannot import name 'diabetes' from 'sklearn.datasets' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\datasets\__init__.py)