In [43]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt

In [44]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=1)
y = np.where(y==0,-1,1)  #change our y to be -1 if it is 0, otherwise 1

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
print(X_train.shape)

(350, 20)


In [45]:
class DecisionStump():
    def __init__(self):
        # Determines whether threshold should be evaluated as < or >
        self.polarity = 1
        self.feature_index = None
        self.threshold = None
        # Voting power of the stump
        self.alpha = None

In [1]:
class AdaBoost():
    def __init__(self, S=5, eta=0.5):
        self.S = S
        self.eta = eta
        
    def fit(self, X, y): #<----X_train, y_train
        m, n = X.shape
        
        W = np.full(m, 1/m)
                
        self.clfs = []
        
        for _ in range(self.S):
            clf = DecisionStump()
            
            min_err = np.inf

            for feature in range(n):
                feature_vals = np.sort(np.unique(X[:, feature]))
                thresholds = (feature_vals[:-1] + feature_vals[1:])/2
                for threshold in thresholds:
                    for polarity in [1, -1]:
                        yhat = np.ones(len(y)) #set all to 1
                        yhat[polarity * X[:, feature] < polarity * threshold] = -1  
                        err = W[(yhat != y)].sum()
                                        
                        #save the best stump
                        if err < min_err:
                            clf.polarity = polarity
                            clf.threshold = threshold
                            clf.feature_index = feature
                            min_err = err
#                             print("hello")
        
            #once we know which is the best stump
            #we calculate its alpha, and reweight samples
            eps = 1e-10 #to prevent division by zero
            clf.alpha = self.eta * (np.log ((1 - min_err) / (min_err + eps)))            
            W = W * np.exp(-clf.alpha * y * yhat)
            W = W / sum (W)

            #save clf
            self.clfs.append(clf)
            print(f"alpha for tree{_ + 1} is",clf.alpha)
        
    def predict(self, X):
        m, n = X.shape
        yhat = np.zeros(m)
        for clf in self.clfs:
            pred = np.ones(m) #set all to 1
            pred[clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold] = -1 
            yhat += clf.alpha * pred
#             print((yhat))

        return np.sign(yhat)

In [55]:
model = AdaBoost(S=10)
model.fit(X_train, y_train)
yhat = model.predict(X_test)
print(classification_report(y_test, yhat))

alpha for tree1 is 1.4573816045379397
alpha for tree2 is 1.9577842153322087
alpha for tree3 is 3.8150839497502074
alpha for tree4 is 7.62962075137483
alpha for tree5 is 11.512647073166862
alpha for tree6 is 11.5129254649702
alpha for tree7 is 11.512925464970229
alpha for tree8 is 11.512925464970229
alpha for tree9 is 11.512925464970229
alpha for tree10 is 11.512925464970229
              precision    recall  f1-score   support

          -1       0.72      0.99      0.83        79
           1       0.98      0.56      0.71        71

    accuracy                           0.79       150
   macro avg       0.85      0.78      0.77       150
weighted avg       0.84      0.79      0.78       150

