# **_Boosting Algorithm ([AdaBoost](https://www.youtube.com/watch?v=LsK-xG1cLYA))_**
### __*Learning a set of methods on weighted examples*__

<br>

### __*Import Libraries*__

In [1]:
import DataLoader
import numpy as np
import pandas as pd

<br>

### __*Decision Stump*__

In [2]:
class DecisionStump:
    def __init__(self):
        self.polarity = 1           # It is a positive or negative sample
        self.feature_idx = None     # Which Feature will this Decision Stump use
        self.threshold = None       # The amount by which we choose
        self.alpha = None           # How important this Decision Stump is final prediction

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1

        return predictions

<br>

### __*AdaBoost Model*__

In [3]:
class AdaBoost:
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        self.clfs = []

    def fit(self, X, y):
        """ Train the AdaBoost Model """
        n_samples, n_features = X.shape

        # Initialize weights to 1/N
        w = np.full(n_samples, (1 / n_samples))

        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float("inf")

            # Greedy search to find the best threshold and feature
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)

                for idx in range(len(thresholds) - 1):
                    # Calculate the threshold
                    threshold = (thresholds[idx] + thresholds[idx + 1]) / 2

                    # Predict with polarity 1
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1

                    # Error = sum of misclassified samples weight
                    error = sum(w[y != predictions])

                    # Swap polarity & fix error
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # Save the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error

            # Calculate alpha (How much impact this rule has on final output)
            EPS = 1e-10  # Make sure we never divide by 0
            clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

            # Calculate predictions
            predictions = clf.predict(X)

            # Update weights & normalization to one
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        """Create List of predictions for every classifier, sum them & normalize"""
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_prediction = np.sum(clf_preds, axis=0)
        y_prediction = np.sign(y_prediction)

        return y_prediction

    def show_classification(self, Table, Translate):
        Index = 0
        for clf in self.clfs:
            Index += 1
            if Index < 10:
                print(f'0{Index}. Decision: {Table.columns[clf.feature_idx]} | '
                      f'Threshold: {"Less Than" if clf.polarity == -1 else "Greater Than"} '
                      f'{round(clf.threshold, 3)} is {Translate[1]} | Power of Rule: {round(clf.alpha, 3)}')
            else:
                print(f'{Index}. Decision: {Table.columns[clf.feature_idx]} | '
                      f'Threshold: {"Less Than" if clf.polarity == -1 else "Greater Than"} '
                      f'{round(clf.threshold, 3)} is {Translate[1]} | Power of Rule: {round(clf.alpha, 3)}')

<br>

### __*Final Tests*__

In [4]:
class ModelTesting:
    def __init__(self, X_Train, y_Train, X_Test, y_Test, Model=None, y_Prediction=None):
        self.X_Train = X_Train
        self.y_Train = y_Train
        self.X_Test = X_Test
        self.y_Test = y_Test
        self.Model = Model
        self.y_Prediction = y_Prediction

    def optim(self, Max_Cls=100, Stop=False, Gap=1):
        """
        Find optimal number of classifiers
        :return: Best Number of Classifiers
        """
        Max_Classifier = Max_Cls
        Last_Test = 0
        Last_Number = 0
        Drop = 0

        for num in range(1, Max_Classifier):
            Model = AdaBoost(n_clf=num)
            Model.fit(self.X_Train, self.y_Train)
            y_Prediction = Model.predict(self.X_Test)
            Test = self.accuracy(y_Prediction) + self.precision(y_Prediction)\
                   + self.recall(y_Prediction) + self.f1score(y_Prediction)

            if Stop and Test < Last_Test:
                Drop += 1
                if Drop > Gap:
                    return Last_Number
            elif Test > Last_Test:
                self.Model = Model
                self.y_Prediction = y_Prediction
                Last_Test, Last_Number = Test, num
            else:
                Drop = 0

        return Last_Number

    def accuracy(self, y_Prediction=None):
        """ Test the accuracy of model """
        y_Prediction = self.y_Prediction if y_Prediction is None else y_Prediction
        return np.sum(self.y_Test == y_Prediction) / len(self.y_Test)

    def precision(self, y_Prediction=None):
        """ Test the precision of model """
        y_Prediction = self.y_Prediction if y_Prediction is None else y_Prediction
        TP = np.sum(self.y_Test[self.y_Test == y_Prediction] == 1)
        # print(f'{y_Prediction}\n{self.y_Test}\nTP:{TP} P:{len(self.y_Test[self.y_Test == 1])}')
        return TP / len(self.y_Test[self.y_Test == 1])

    def recall(self, y_Prediction=None):
        """ Test the recall of model """
        y_Prediction = self.y_Prediction if y_Prediction is None else y_Prediction
        TP = np.sum(self.y_Test[self.y_Test == y_Prediction] == 1)
        FN = np.sum(self.y_Test[self.y_Test != y_Prediction] == -1)
        # print(f'{y_Prediction}\n{self.y_Test}\nTP:{TP} FN:{FN} TP + FN:{TP + FN}')
        return TP / (TP + FN)

    def f1score(self, y_Prediction=None):
        """ Test the f1 score of model """
        y_Prediction = self.y_Prediction if y_Prediction is None else y_Prediction
        Precision = self.precision(y_Prediction)
        Recall = self.recall(y_Prediction)
        return 2*((Precision * Recall) / (Precision + Recall))

<br>

### __*[Heart Attack Dataset](https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset)*__

In [5]:
# Show structure of dataset
heart_raw = pd.read_csv('data/heart.csv')
heart_raw.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# Prepare the dataset
dataset = DataLoader.load_dataset('data/heart.csv')
train_data, test_data, translate = DataLoader.tt_split_dataset(dataset, train=0.8, shuffle=False, data='heart')
X_train, y_train = DataLoader.xy_split_dataset(train_data)
X_test, y_test = DataLoader.xy_split_dataset(test_data)

# Show structure of new dataset
print(f'{train_data[0]}\n{train_data[-1]}\nTranslation: {translate}')

[63.0, 1.0, 3.0, 145.0, 233.0, 1.0, 0.0, 150.0, 0.0, 2.3, 0.0, 0.0, 1.0, 1]
[47.0, 1.0, 0.0, 110.0, 275.0, 0.0, 0.0, 118.0, 1.0, 1.0, 1.0, 1.0, 2.0, -1]
Translation: {1: 'Positive', -1: 'Negative'}


In [7]:
# Test the AdaBoost Model
model = ModelTesting(X_train, y_train, X_test, y_test)
n_cls = model.optim(Max_Cls=20, Stop=False, Gap=5)
acc = model.accuracy()
pre = model.precision()
rec = model.recall()
f1s = model.f1score()

# Print the Results
print(f'Nuber of classifiers: {n_cls}\n'
      f'Accuracy: {round(acc*100, 2)}% | Precision: {round(pre*100, 2)}% | '
      f'Recall: {round(rec*100, 2)}% | F1 Score: {round(f1s*100, 2)}%')

Nuber of classifiers: 5
Accuracy: 81.97% | Precision: 84.85% | Recall: 82.35% | F1 Score: 83.58%


In [8]:
# Show the decision-making process
model.Model.show_classification(heart_raw, translate)

01. Decision: cp | Threshold: Greater Than 0.5 is Positive | Power of Rule: 0.589
02. Decision: caa | Threshold: Less Than 0.5 is Positive | Power of Rule: 0.528
03. Decision: slp | Threshold: Greater Than 1.5 is Positive | Power of Rule: 0.487
04. Decision: sex | Threshold: Less Than 0.5 is Positive | Power of Rule: 0.408
05. Decision: thall | Threshold: Less Than 2.5 is Positive | Power of Rule: 0.37


<br>

### __*[Cancer Dataset](https://www.kaggle.com/datasets/erdemtaha/cancer-data)*__

In [9]:
# Show structure of dataset
cancer_raw = pd.read_csv('data/cancer.csv')
cancer_raw.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [10]:
# Prepare the dataset
dataset = DataLoader.load_dataset('data/cancer.csv')
train_data, test_data, translate = DataLoader.tt_split_dataset(dataset, train=0.8, shuffle=False, data='cancer')
X_train, y_train = DataLoader.xy_split_dataset(train_data)
X_test, y_test = DataLoader.xy_split_dataset(test_data)

# Show structure of new dataset
print(f'{train_data[0]}\n{train_data[-1]}\nTranslation: {translate}')

[17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871, 1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193, 25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189, 1]
[12.27, 29.97, 77.42, 465.4, 0.07699, 0.03398, 0.0, 0.0, 0.1701, 0.0596, 0.4455, 3.647, 2.884, 35.13, 0.007339, 0.008243, 0.0, 0.0, 0.03141, 0.003136, 13.45, 38.05, 85.08, 558.9, 0.09422, 0.05213, 0.0, 0.0, 0.2409, 0.06743, -1]
Translation: {1: 'M (Positive)', -1: 'B (Negative)'}


In [11]:
# Test the AdaBoost Model
model = ModelTesting(X_train, y_train, X_test, y_test)
n_cls = model.optim(Max_Cls=20, Stop=True, Gap=1)
acc = model.accuracy()
pre = model.precision()
rec = model.recall()
f1s = model.f1score()

# Print the Results
print(f'Nuber of classifiers: {n_cls}\n'
      f'Accuracy: {round(acc*100, 2)}% | Precision: {round(pre*100, 2)}% | '
      f'Recall: {round(rec*100, 2)}% | F1 Score: {round(f1s*100, 2)}%')

Nuber of classifiers: 9
Accuracy: 96.52% | Precision: 93.02% | Recall: 97.56% | F1 Score: 95.24%


In [12]:
# Show the decision-making process
model.Model.show_classification(cancer_raw, translate)

01. Decision: radius_worst | Threshold: Greater Than 106.05 is M (Positive) | Power of Rule: 1.241
02. Decision: compactness_worst | Threshold: Greater Than 0.142 is M (Positive) | Power of Rule: 0.92
03. Decision: diagnosis | Threshold: Greater Than 20.235 is M (Positive) | Power of Rule: 0.647
04. Decision: perimeter_mean | Threshold: Greater Than 0.09 is M (Positive) | Power of Rule: 0.473
05. Decision: fractal_dimension_se | Threshold: Greater Than 23.35 is M (Positive) | Power of Rule: 0.501
06. Decision: texture_se | Threshold: Greater Than 35.185 is M (Positive) | Power of Rule: 0.572
07. Decision: smoothness_worst | Threshold: Greater Than 0.208 is M (Positive) | Power of Rule: 0.503
08. Decision: area_se | Threshold: Less Than 0.012 is M (Positive) | Power of Rule: 0.52
09. Decision: perimeter_worst | Threshold: Greater Than 0.137 is M (Positive) | Power of Rule: 0.517
