In [1]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [3]:
import numpy as np

data = pd.DataFrame({
    "Gender": ["Female", "Male", "Male", "Female", "Male"],
    "Age": [33, 57, 41, 49, 36],
    "Income": [480000, 320000, 900000, 540000, 450000],
    "BMI": [22.1, 29.5, 24.0, 31.2, 27.8],
    "Illness": [-1, +1, -1, +1, -1]   # No = -1, Yes = +1
})

# Initial weights
weights = np.array([0.2] * 5)

data

Unnamed: 0,Gender,Age,Income,BMI,Illness
0,Female,33,480000,22.1,-1
1,Male,57,320000,29.5,1
2,Male,41,900000,24.0,-1
3,Female,49,540000,31.2,1
4,Male,36,450000,27.8,-1


In [4]:
def bmi_stump(x):
    return 1 if x >= 25 else -1

In [5]:
def age_stump(x):
    return 1 if x >= 45 else -1


In [6]:
def weighted_error(predictions, y, w):
    return np.sum(w[predictions != y])


In [7]:
def weighted_error(predictions, y, w):
    return np.sum(w[predictions != y])

In [8]:
y = data["Illness"].values
bmi_preds = data["BMI"].apply(bmi_stump).values

epsilon1 = weighted_error(bmi_preds, y, weights)
epsilon1


np.float64(0.2)

In [9]:
alpha1 = 0.5 * np.log((1 - epsilon1) / epsilon1)
alpha1

np.float64(0.6931471805599453)

In [10]:
weights = weights * np.exp(-alpha1 * y * bmi_preds)
weights = weights / np.sum(weights)   # normalize
weights


array([0.125, 0.125, 0.125, 0.125, 0.5  ])

In [11]:
age_preds = data["Age"].apply(age_stump).values

epsilon2 = weighted_error(age_preds, y, weights)
epsilon2

np.float64(0.0)

In [12]:
alpha2 = 0.5 * np.log((1 - epsilon2) / epsilon2)
alpha2


np.float64(inf)

In [13]:
weights = weights * np.exp(-alpha2 * y * age_preds)
weights = weights / np.sum(weights)
weights

array([nan, nan, nan, nan, nan])

In [14]:
final_score = alpha1 * bmi_preds + alpha2 * age_preds
final_prediction = np.sign(final_score)

final_prediction

array([-1.,  1., -1.,  1., -1.])

In [15]:
pd.DataFrame({
    "True y": y,
    "Predicted y": final_prediction
})

Unnamed: 0,True y,Predicted y
0,-1,-1.0
1,1,1.0
2,-1,-1.0
3,1,1.0
4,-1,-1.0


In [17]:
import math

def adaboost_round(predictions, y, weights):
    error = np.sum(weights * (predictions != y))
    error = max(error, 1e-10)
    alpha = 0.5 * math.log((1 - error) / error)


    new_weights = weights * np.exp(-alpha * y * predictions)
    new_weights = new_weights / np.sum(new_weights)

    return alpha, new_weights


y = data["Illness"].values

In [21]:
print("\nROUND 2")

# Initialize alphas and stumps with results from Round 1
alphas = [alpha1]
stumps = [("bmi", bmi_preds)]

# Re-initialize weights to the state after the first round's correct update
# (from cell mq5bP9Xb-tWD before it was overwritten by nan values)
weights = np.array([0.125, 0.125, 0.125, 0.125, 0.5])

# Corrected stump_age to age_stump and df to data
pred2 = data["Age"].apply(age_stump).values
alpha2, weights = adaboost_round(pred2, y, weights)

alphas.append(alpha2)
stumps.append(("age", pred2))

print("Alpha2:", alpha2)
print("Weights after Round2:", weights)


ROUND 2
Alpha2: 11.512925464920228
Weights after Round2: [0.125 0.125 0.125 0.125 0.5  ]


In [24]:
# Define stump_income function
def stump_income(x):
    # Using a threshold for Income, for example, 500000
    return 1 if x >= 500000 else -1

print("\nROUND 3")

# Using 'Income' column as a substitute for 'Smoking'
pred3 = data["Income"].apply(stump_income).values
alpha3, weights = adaboost_round(pred3, y, weights)

alphas.append(alpha3)
stumps.append(("income", pred3))

print("Alpha3:", alpha3)
print("Weights after Round3:", weights)


ROUND 3
Alpha3: 0.5493061443340549
Weights after Round3: [0.08333333 0.25       0.25       0.08333333 0.33333333]


In [25]:
print("\nFINAL PREDICTIONS")

# Initialize final_scores with zeros, matching the shape of y
final_scores = np.zeros_like(y, dtype=float)

# Iterate through alphas and stumps to sum up contributions
for alpha, (_, pred_array) in zip(alphas, stumps):
    final_scores += alpha * pred_array

final_pred = np.sign(final_scores)

results = pd.DataFrame({
    "True": y,
    "Score": final_scores,
    "Predicted": final_pred
})

print(results)


FINAL PREDICTIONS
   True      Score  Predicted
0    -1 -12.755379       -1.0
1     1  11.656767        1.0
2    -1 -11.656767       -1.0
3     1  12.755379        1.0
4    -1 -11.369084       -1.0


In [26]:
accuracy = np.mean(final_pred == y)
print("\nFinal Accuracy:", accuracy * 100, "%")


Final Accuracy: 100.0 %
