
# Module 5 — In-Class Activity (Solution)
**Topic:** Ensemble Learning in Practice

This notebook contains the **completed version** of the in-class ensemble learning activity.


In [None]:

# ============================================
# Step 1. Imports and setup
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

import warnings
# Suppress every warning category so library notices do not clutter cell outputs.
warnings.filterwarnings(action="ignore")

# Seed NumPy's global random generator for reproducibility.
np.random.seed(seed=7)



## Step 2. Generate a simple dataset


In [None]:

# Number of synthetic students (rows) to generate.
n = 240

# Draw from a dedicated generator rather than NumPy's global state.
# RandomState(7) produces the same stream as the global generator immediately
# after np.random.seed(7), so a fresh "Run All" yields the same dataset as
# before, while re-running this cell on its own now regenerates identical data
# instead of silently producing a new sample.
rng = np.random.RandomState(7)

# Integer habit counts; randint's upper bound is exclusive (e.g. 0..5 drafts).
DraftsSubmitted    = rng.randint(0, 6,  n)
PeerReviewsGiven   = rng.randint(0, 11, n)
MeetingsWithTA     = rng.randint(0, 7,  n)
OnTimeSubmissions  = rng.randint(0, 11, n)
WeekendCodingHours = rng.randint(0, 16, n)

df = pd.DataFrame({
    "DraftsSubmitted": DraftsSubmitted,
    "PeerReviewsGiven": PeerReviewsGiven,
    "MeetingsWithTA": MeetingsWithTA,
    "OnTimeSubmissions": OnTimeSubmissions,
    "WeekendCodingHours": WeekendCodingHours
})

# Label generation: a weighted linear combination of the habit features ...
score = (
    1.0 * DraftsSubmitted
  + 0.6 * (PeerReviewsGiven / 2.0)
  + 0.9 * MeetingsWithTA
  + 0.8 * (OnTimeSubmissions / 2.0)
  + 0.3 * (WeekendCodingHours / 3.0)
)
# ... squashed through a logistic curve centred at score = 6.5 ...
prob = 1.0 / (1.0 + np.exp(-0.9 * (score - 6.5)))
# ... and thresholded. No sampling happens here: HighGrade is 1 exactly when
# prob > 0.55, so the label is a deterministic function of the features.
df["HighGrade"] = (prob > 0.55).astype(int)

# Print the class balance first, then let the preview render as the cell's
# rich output (a bare last expression needs no explicit display() call).
print("HighGrade rate:", df["HighGrade"].mean().round(3))
df.head()



## Step 3. Split and scale the data


In [None]:

# Target vector and predictor matrix (every column except the label).
y = df["HighGrade"]
X = df.drop("HighGrade", axis=1)

# Hold out 25% of the rows for testing, stratifying on the label.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=7,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr_sc = scaler.transform(X_tr)
X_te_sc = scaler.transform(X_te)



## Step 4. Train baseline models


In [None]:

# Fit the three baseline classifiers. Logistic Regression is trained on the
# standardized features; the tree-based models use the raw features.
lr = LogisticRegression(random_state=7, max_iter=500).fit(X_tr_sc, y_tr)
tree = DecisionTreeClassifier(random_state=7, max_depth=5).fit(X_tr, y_tr)
rf = RandomForestClassifier(random_state=7, n_estimators=250).fit(X_tr, y_tr)

# Test-set predictions, each model seeing the same feature representation it
# was trained on.
pred_lr = lr.predict(X_te_sc)
pred_tree = tree.predict(X_te)
pred_rf = rf.predict(X_te)

# Collect accuracy and F1 per model, keyed by display name.
results = {
    label: {
        "Accuracy": accuracy_score(y_te, y_hat),
        "F1": f1_score(y_te, y_hat),
    }
    for label, y_hat in [
        ("Logistic Regression", pred_lr),
        ("Decision Tree", pred_tree),
        ("Random Forest", pred_rf),
    ]
}

# One row per model, one column per metric.
pd.DataFrame(results).T



## Step 5. Train ensemble models


In [None]:

# Bagging ensemble of 200 depth-5 decision trees.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=7, max_depth=5),
    random_state=7,
    n_estimators=200,
).fit(X_tr, y_tr)

# AdaBoost over 300 shallow (depth-2) trees with a learning rate of 0.5.
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(random_state=7, max_depth=2),
    random_state=7,
    learning_rate=0.5,
    n_estimators=300,
).fit(X_tr, y_tr)

pred_bag = bag.predict(X_te)
pred_ada = ada.predict(X_te)

# Append both ensembles to the shared results table started by the baselines.
results.update({
    label: {
        "Accuracy": accuracy_score(y_te, y_hat),
        "F1": f1_score(y_te, y_hat),
    }
    for label, y_hat in (
        ("Bagging (Tree)", pred_bag),
        ("AdaBoost", pred_ada),
    )
})

# All models so far, best F1 first.
pd.DataFrame(results).T.sort_values(by="F1", ascending=False)



## Step 6. Visualize comparison


In [None]:

# One row per model, ranked by F1 (best first).
comparison = pd.DataFrame(results).T.sort_values("F1", ascending=False)

# Use the explicit Figure/Axes interface rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(comparison.index, comparison["F1"], alpha=0.85, color="teal")
# barh places the first row at the bottom of the axis; flip it so the bars
# read top-to-bottom in the same best-first order as the `comparison` table.
ax.invert_yaxis()
ax.set_xlabel("F1 Score")
ax.set_title("Model Comparison: Project Habits Dataset")
ax.grid(axis="x", alpha=0.35)
plt.show()

comparison



### Interpretation

- **Random Forest** typically performs best because it combines many decorrelated trees, reducing variance.  
- **Bagging** improves Decision Tree stability but not as strongly as Random Forest.  
- **AdaBoost** performs well when weak learners (shallow trees) complement each other sequentially by focusing on prior mistakes.  
- Logistic Regression may underperform if nonlinear relationships exist.  
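- Comparing F1 scores highlights how the ensemble techniques compare with the single models on this synthetic, human-like dataset; ensembles generally outperform a single decision tree. Note that the labels in Step 2 come from a deterministic threshold on a linear score (no random noise is added), so check the results table before assuming Logistic Regression underperforms here.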
