# Custom Bagging Application

### *Standard classification and aggregation methods*

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler


In [9]:
#import data 
data = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=0)


In [10]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (455, 30)
X_test shape: (114, 30)


In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
#logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred, average='weighted')}")


    0   1
0  45   2
1   2  65
Accuracy: 0.9649122807017544
f1_score: 0.9649122807017544


In [13]:
# AdaBoost with 7 logistic regression base learner experts and compare results.
ada = AdaBoostClassifier(n_estimators=7, random_state=0, estimator = logreg)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred, average='weighted')}")


    0   1
0  44   3
1   2  65
Accuracy: 0.956140350877193
f1_score: 0.9560669894569158


In [14]:
ada = AdaBoostClassifier(n_estimators=3, random_state=0, estimator = logreg)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred, average='weighted')}")

ada = AdaBoostClassifier(n_estimators=10, random_state=0, estimator = logreg)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred, average='weighted')}")

#10 is better 

    0   1
0  41   6
1   0  67
Accuracy: 0.9473684210526315
f1_score: 0.9467019822282979
    0   1
0  44   3
1   2  65
Accuracy: 0.956140350877193
f1_score: 0.9560669894569158


### *Custom-made Ensemble model*

In [15]:
# import SVC, KNN
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

dataa = pd.DataFrame(data.data)
dataa.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [16]:
dataa.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [17]:
def train_val_test_split(x, y):
    X_train, X_, y_train, y_ = train_test_split(x, y, train_size=0.8)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(data.data, data.target)


print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}\n")

print (f"Sample point: {X_train[0]} → {y_train[0]}")

X_train: (455, 30), y_train: (455,)
X_val: (57, 30), y_val: (57,)
X_test: (57, 30), y_test: (57,)

Sample point: [1.268e+01 2.384e+01 8.269e+01 4.990e+02 1.122e-01 1.262e-01 1.128e-01
 6.873e-02 1.905e-01 6.590e-02 4.255e-01 1.178e+00 2.927e+00 3.646e+01
 7.781e-03 2.648e-02 2.973e-02 1.290e-02 1.635e-02 3.601e-03 1.709e+01
 3.347e+01 1.118e+02 8.883e+02 1.851e-01 4.061e-01 4.024e-01 1.716e-01
 3.383e-01 1.031e-01] → 0


In [18]:
scaled_X_train = StandardScaler().fit_transform(X_train)
scaled_X_val = StandardScaler().fit_transform(X_val)
scaled_X_test = StandardScaler().fit_transform(X_test)

In [19]:
#define a softmax function using just numpy that takes a vector as input and returns a vector of probabilities
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)


In [20]:
class Ensembler():
    def __init__(self): 
        self.clf_gini = DecisionTreeClassifier(random_state=0, criterion='gini')
        self.clf_entropy = DecisionTreeClassifier(random_state=0, criterion='entropy')
        self.clf_rbf = SVC(random_state=0, kernel='rbf')
        self.clf_poly = SVC(random_state=0, kernel='poly', degree=3)
        self.clf_logreg = LogisticRegression(random_state=0)
        self.clf_knn = KNeighborsClassifier(n_neighbors=5)
        self.weights = np.array([0, 0, 0, 0, 0, 0]) #initialise weights


    def fit(self, X_train, X_test, y_train, y_test):
        self.clf_gini.fit(X_train, y_train)
        self.clf_entropy.fit(X_train, y_train)
        self.clf_rbf.fit(X_train, y_train)
        self.clf_poly.fit(X_train, y_train)
        self.clf_logreg.fit(X_train, y_train)
        self.clf_knn.fit(X_train, y_train)

        #make predictions 
        y_pred_gini = self.clf_gini.predict(X_test)
        y_pred_entropy = self.clf_entropy.predict(X_test)
        y_pred_rbf = self.clf_rbf.predict(X_test)
        y_pred_poly = self.clf_poly.predict(X_test)
        y_pred_logreg = self.clf_logreg.predict(X_test)
        y_pred_knn = self.clf_knn.predict(X_test)
        preds = [y_pred_gini, y_pred_entropy, y_pred_rbf, y_pred_poly, y_pred_logreg, y_pred_knn]

        #calculate accuracies of experts
        accs = []
        for clf in (self.clf_gini, self.clf_entropy, self.clf_rbf, self.clf_poly, self.clf_logreg, self.clf_knn):
            accs.append(accuracy_score(y_test, clf.predict(X_test)))
        print(f"Accuracies: {accs}")
        
        #update weights
        self.weights = softmax(accs)
        print(f"Weights: {self.weights}")


    def predict(self, X_test):
        y_pred_gini = self.clf_gini.predict(X_test)
        y_pred_entropy = self.clf_entropy.predict(X_test)
        y_pred_rbf = self.clf_rbf.predict(X_test)
        y_pred_poly = self.clf_poly.predict(X_test)
        y_pred_logreg = self.clf_logreg.predict(X_test)
        y_pred_knn = self.clf_knn.predict(X_test)
        preds = [y_pred_gini, y_pred_entropy, y_pred_rbf, y_pred_poly, y_pred_logreg, y_pred_knn]
        print(f"Individual expert predictions: {preds}\n")

        #get the weighted average of the predictions
        w_preds = np.average(preds, axis=0, weights=self.weights)
        print(f"Weighted predictions: {w_preds}\n")
        print(f"Rounded weighted predictions: {np.round(w_preds)}")
        return w_preds.round()



    

In [21]:
ens = Ensembler()

In [22]:
ens.fit(scaled_X_train, scaled_X_test, y_train, y_test)

Accuracies: [0.8596491228070176, 0.8421052631578947, 0.8771929824561403, 0.9298245614035088, 0.8771929824561403, 0.8771929824561403]
Weights: [0.16370895 0.16086191 0.16660638 0.17561    0.16660638 0.16660638]


In [23]:
final_preds = ens.predict(scaled_X_val)

Individual expert predictions: [array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0]), array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0]), array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0]), array([1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0]), array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0])

In [24]:
final_acc = accuracy_score(y_val, final_preds)

In [25]:
cm = confusion_matrix(y_val, final_preds)
print(pd.DataFrame(cm), "\n")
print(f"Final accuracy: {round(final_acc*100, 2)}%")
print(f"Final f1_score: {round(f1_score(y_val, final_preds, average='weighted')*100, 2)}%")

    0   1
0  23   2
1   0  32 

Final accuracy: 96.49%
Final f1_score: 96.47%
