# Implementing Ensemble Learning Using scikit-learn

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import preprocessing

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.head(n=5)

In [None]:
# Setting features (X) and target (y)
# The target is addressed as the last column (-1) rather than by a hard-coded
# position, so X (everything but the last column) and y (the last column)
# always partition the frame consistently.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on y and uses
# a fixed seed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Create a decision tree base model.
# max_depth=None places no limit on the tree depth. random_state is pinned
# because scikit-learn randomly permutes the features at each split, so an
# unseeded tree can produce different scores from run to run.
base_model = DecisionTreeClassifier(criterion='entropy',
                                    max_depth=None,
                                    random_state=42)

In [None]:
# Fit the standalone tree and score it on the training split in one step
# (fit() returns the fitted estimator itself)
basetrainAcc = base_model.fit(X_train, y_train).score(X_train, y_train)
print("Training Accuracy:", basetrainAcc)

# Score the same fitted tree on the held-out split
basetestAcc = base_model.score(X_test, y_test)
print("Testing Accuracy:", basetestAcc)

## BaggingClassifier 

In [None]:
# Initialize the bagging classifier.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
bagging_clf = BaggingClassifier(base_model,
                                n_estimators=32,
                                random_state=42)

In [None]:
# Fit the bagging ensemble on the training split
bagging_clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted bagging ensemble on both splits, then report the results
trainbagAcc = bagging_clf.score(X_train, y_train)
testbagAcc = bagging_clf.score(X_test, y_test)
print("Training Accuracy:", trainbagAcc)
print("Testing Accuracy:", testbagAcc)

## Using OOB 

In [None]:
# Initialize the bagging classifier with out-of-bag (OOB) scoring enabled.
# The base model is passed positionally because the `base_estimator` keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
bagging_clf2 = BaggingClassifier(base_model,
                                 n_estimators=32,
                                 random_state=42,
                                 oob_score=True)
# Train the bagging classifier on the full dataset. The OOB score is computed
# from the samples each estimator did not draw in its bootstrap sample, so no
# separate test split is scored in this cell.
bagging_clf2.fit(X, y)
# Evaluate the performance
trainbagAcc2 = bagging_clf2.score(X, y)
print("Training Accuracy:", trainbagAcc2)
oobaccuracy2 = bagging_clf2.oob_score_
print("OOB Accuracy:", oobaccuracy2)

## Using Pasting 

In [None]:
# Initialize the pasting classifier.
# Pasting draws each estimator's training subset WITHOUT replacement
# (bootstrap=False). With the default max_samples=1.0 every estimator would
# receive the entire training set, so max_samples is set below 1 to make the
# subsets actually differ between estimators.
# The base model is passed positionally because the `base_estimator` keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
pasting_clf = BaggingClassifier(base_model,
                                n_estimators=32,
                                max_samples=0.8,
                                random_state=42,
                                bootstrap=False)

# Train the pasting classifier
pasting_clf.fit(X_train, y_train)

# Evaluate the performance
trainpastingAcc = pasting_clf.score(X_train, y_train)
print("Training Accuracy:", trainpastingAcc)
testpastingAcc = pasting_clf.score(X_test, y_test)
print("Testing Accuracy:", testpastingAcc)

## Bagging - Random Forests 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (Gini criterion, fixed seed)
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=100,
    random_state=42,
)

# Fit on the training split
rf.fit(X_train, y_train)

# Score on both splits, then report the results
trainrfAcc = rf.score(X_train, y_train)
testrfAcc = rf.score(X_test, y_test)
print("Training Accuracy:", trainrfAcc)
print("Testing Accuracy:", testrfAcc)

## Boosting - AdaBoost 

In [None]:
# Create a stump model: a decision tree limited to a single split (max_depth=1).
# random_state is pinned so the standalone stump's scores are reproducible,
# matching the seeded split and ensembles elsewhere in this notebook.
stump = DecisionTreeClassifier(criterion='entropy',
                               max_depth=1,
                               random_state=42)

# Train the stump classifier
stump.fit(X_train, y_train)

# Evaluate the performance
stumptrainAcc = stump.score(X_train, y_train)
print("Training Accuracy:", stumptrainAcc)
stumptestAcc = stump.score(X_test, y_test)
# Labelled "Testing Accuracy" to match the other evaluation cells
print("Testing Accuracy:", stumptestAcc)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize the AdaBoost classifier with the stump as its weak learner.
# The stump is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
ada_clf = AdaBoostClassifier(stump,
                             n_estimators=100,
                             learning_rate=0.15,
                             random_state=42)

# Train the AdaBoost classifier
ada_clf.fit(X_train, y_train)

# Evaluate the performance
adatrainAcc = ada_clf.score(X_train, y_train)
print("Training Accuracy:", adatrainAcc)
adatestAcc = ada_clf.score(X_test, y_test)
# Labelled "Testing Accuracy" to match the other evaluation cells
print("Testing Accuracy:", adatestAcc)

## Stacking 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Level-0 learners for the stack, as (name, estimator) pairs.
# KNN and SVC are wrapped in a pipeline with MinMaxScaler so the scaler is
# fitted together with the model. The two ensemble learners are seeded so
# their results are reproducible, like the seeded models earlier in the notebook.
base_models = [
    ('KNN', make_pipeline(MinMaxScaler(), KNeighborsClassifier())),
    ('SVC', make_pipeline(MinMaxScaler(), SVC())),
    ('Adaboost', AdaBoostClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
]

# Stack the learners: a logistic regression is trained on the base learners'
# cross-validated (10-fold) predictions.
stacked = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=10,
)

In [None]:
# Fit each base learner on its own and report accuracy and weighted F1
for name, model in base_models:
    # fit() returns the fitted estimator, so training accuracy can be chained
    trainacc = model.fit(X_train, y_train).score(X_train, y_train)

    # Held-out metrics
    prediction = model.predict(X_test)
    acc = model.score(X_test, y_test)
    f1 = f1_score(y_test, prediction, average='weighted')

    print("-------{}-------".format(name))
    print("Training Accuracy:", trainacc)
    print("Testing Accuracy:", acc)
    print("F1-score:", f1)
    print("----------------------------------\n")

# Fit the stacked ensemble and predict the held-out split in one step
# (fit() returns the fitted estimator itself)
stacked_prediction = stacked.fit(X_train, y_train).predict(X_test)

# Accuracy on both splits and weighted F1 on the held-out split
stacked_trainacc = stacked.score(X_train, y_train)
stacked_acc = stacked.score(X_test, y_test)
stacked_f1 = f1_score(y_test, stacked_prediction, average='weighted')

print("-------Stacked Ensemble-------")
print("Training Accuracy: {}".format(stacked_trainacc))
print("Testing Accuracy: {}".format(stacked_acc))
print("F1-score: {}".format(stacked_f1))
print("----------------------------------")