# Ensemble Methods & Boosting — Student Lab

Week 4 introduces sklearn models, but you must still explain *why* they work (bias/variance).

In [9]:
import numpy as np
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def check(name: str, cond: bool):
    if not cond:
        raise AssertionError(f'Failed: {name}')
    print(f'OK: {name}')

rng = np.random.default_rng(0)

## Section 0 — Dataset (synthetic default, real optional)

### Task 0.1: Choose dataset
Use synthetic by default. Optionally switch to breast cancer dataset.

# TODO: set `use_real = False` or True

In [10]:
use_real = False  # TODO

if use_real:
    data = load_breast_cancer()
    X = data.data
    y = data.target
else:
    X, y = make_classification(
        n_samples=2000,
        n_features=20,
        n_informative=8,
        n_redundant=4,
        class_sep=1.0,
        flip_y=0.03,
        random_state=0,
    )

Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
check('shapes', Xtr.shape[0]==ytr.shape[0] and Xva.shape[0]==yva.shape[0])
Xtr.shape

OK: shapes


(1400, 20)

## Section 1 — Baseline vs Trees vs Random Forest

### Task 1.1: Train baseline decision tree vs random forest

# TODO: Train:
- DecisionTreeClassifier(max_depth=?)
- RandomForestClassifier(n_estimators=?, max_depth=?, oob_score=True, bootstrap=True)

Compute accuracy + ROC-AUC on validation.

**Checkpoint:** Why does bagging reduce variance?

In [25]:
# TODO

def eval_model(clf, X, y):
    pred = clf.predict(X)
    acc = accuracy_score(y, pred)
    # many sklearn classifiers have predict_proba; handle if not
    if hasattr(clf, 'predict_proba'):
        proba = clf.predict_proba(X)[:, 1]
        auc = roc_auc_score(y, proba)
    else:
        auc = float('nan')
    return acc, auc

max_depths = [2,10,20,50]
num_trees = [1,25,100,500]

for max_depth in max_depths:
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    for n_trees in num_trees:
        rf = RandomForestClassifier(
            n_estimators=n_trees, max_depth=max_depth,
            oob_score=True, bootstrap=True, random_state=0
        )
        tree.fit(Xtr, ytr)
        rf.fit(Xtr, ytr)
        print(f"Depth {max_depth}, n_trees: {n_trees}")
        print('tree', eval_model(tree, Xva, yva))
        print('rf  ', eval_model(rf, Xva, yva))
        if hasattr(rf, 'oob_score_'):
            print('rf oob_score', rf.oob_score_)

  warn(


Depth 2, n_trees: 1
tree (0.78, np.float64(0.8372999999999999))
rf   (0.79, np.float64(0.8346333333333332))
rf oob_score 0.5971428571428572
Depth 2, n_trees: 25
tree (0.78, np.float64(0.8372999999999999))
rf   (0.8233333333333334, np.float64(0.9038444444444445))
rf oob_score 0.8414285714285714
Depth 2, n_trees: 100
tree (0.78, np.float64(0.8372999999999999))
rf   (0.8116666666666666, np.float64(0.8922888888888889))
rf oob_score 0.845
Depth 2, n_trees: 500
tree (0.78, np.float64(0.8372999999999999))
rf   (0.8216666666666667, np.float64(0.8971222222222223))
rf oob_score 0.8571428571428571
Depth 10, n_trees: 1
tree (0.85, np.float64(0.8507499999999999))
rf   (0.795, np.float64(0.7982277777777778))
rf oob_score 0.6264285714285714


  warn(


Depth 10, n_trees: 25
tree (0.85, np.float64(0.8507499999999999))
rf   (0.875, np.float64(0.9435611111111111))
rf oob_score 0.905
Depth 10, n_trees: 100
tree (0.85, np.float64(0.8507499999999999))
rf   (0.885, np.float64(0.9474666666666666))
rf oob_score 0.9107142857142857
Depth 10, n_trees: 500
tree (0.85, np.float64(0.8507499999999999))
rf   (0.8833333333333333, np.float64(0.9502222222222222))
rf oob_score 0.9092857142857143


  warn(


Depth 20, n_trees: 1
tree (0.85, np.float64(0.85))
rf   (0.8066666666666666, np.float64(0.8066666666666668))
rf oob_score 0.6192857142857143
Depth 20, n_trees: 25
tree (0.85, np.float64(0.85))
rf   (0.8883333333333333, np.float64(0.9501555555555555))
rf oob_score 0.8892857142857142
Depth 20, n_trees: 100
tree (0.85, np.float64(0.85))
rf   (0.8833333333333333, np.float64(0.9495444444444444))
rf oob_score 0.9092857142857143
Depth 20, n_trees: 500
tree (0.85, np.float64(0.85))
rf   (0.88, np.float64(0.9524555555555555))
rf oob_score 0.9107142857142857
Depth 50, n_trees: 1


  warn(


tree (0.85, np.float64(0.85))
rf   (0.8066666666666666, np.float64(0.8066666666666668))
rf oob_score 0.6192857142857143
Depth 50, n_trees: 25
tree (0.85, np.float64(0.85))
rf   (0.8883333333333333, np.float64(0.9501555555555555))
rf oob_score 0.8892857142857142
Depth 50, n_trees: 100
tree (0.85, np.float64(0.85))
rf   (0.8833333333333333, np.float64(0.9495444444444444))
rf oob_score 0.9092857142857143
Depth 50, n_trees: 500
tree (0.85, np.float64(0.85))
rf   (0.88, np.float64(0.9524444444444444))
rf oob_score 0.91


### Task 1.2: Feature importance gotcha

Inspect `feature_importances_` and explain why correlated features can distort importances.

# TODO: print top 10 features by importance.

In [22]:
# TODO
imp = rf.feature_importances_
top = np.argsort(-imp)[:10]
print('top idx', top)
print('top importances', imp[top])

top idx [ 3 12 15 17  7 11  4 18  1 13]
top importances [0.13177789 0.12478299 0.11547313 0.11104177 0.09557517 0.09329674
 0.05479175 0.03673158 0.03283524 0.03067371]


## Section 2 — Gradient Boosting

### Task 2.1: Train GradientBoostingClassifier

# TODO: Train GB with different n_estimators and learning_rate and compare.

**Checkpoint:** Why can boosting overfit with too many estimators?

In [23]:
settings = [
    {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 2},
    {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 2},
    {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 2},
]

for s in settings:
    gb = GradientBoostingClassifier(random_state=0, **s)
    gb.fit(Xtr, ytr)
    print('gb', s, eval_model(gb, Xva, yva))

gb {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 2} (0.8616666666666667, np.float64(0.9393722222222223))
gb {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 2} (0.8883333333333333, np.float64(0.9497666666666666))
gb {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 2} (0.8833333333333333, np.float64(0.9481888888888889))


## Section 3 — XGBoost-style knobs (conceptual)

### Task 3.1: Explain what each knob does
Write 2-3 bullets each:
- subsample
- colsample
- learning rate
- max_depth

- **subsample:** Fraction of datapoints/training rows used for trianing. this helps avoid some outliers affecting all the trees
- **colsample:** : Fraction of features used for each tree. This helps avoid using useless features/co-related features for easier generalizability
- **learning_rate:** Rate of change of weights -> controlling how fast the system can learn. If too low, might take too long to learn. If too high, the leanring might not converge or get sensitive to outliers
- **max_depth:** : Maximum depth of tree - if too high, then tree can heavily overfit and if it's too low, then the tree might not learn enough.

---
## Submission Checklist
- All TODOs completed
- Baseline vs RF vs GB compared
- OOB score discussed (if available)
- Feature importance gotcha explained