## Run below code cell to load dataset to colab

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Scale features for better performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


## Task 1.1
### Train an AdaBoostClassifier with n_estimators=1 (just one tree).

## Task 1.2
### Train an AdaBoostClassifier with n_estimators=50 (multiple trees).

## Task 1.3
### Compare both of them , which one is better.

In [6]:
# AdaBoost with n_estimators=1 (essentially a single weak learner - decision stump by default)
ada_single = AdaBoostClassifier(n_estimators=1, random_state=42)
ada_single.fit(X_train, y_train)
y_pred_single = ada_single.predict(X_test)
acc_single = accuracy_score(y_test, y_pred_single)
print(f"AdaBoost (1 estimator) Test Accuracy: {acc_single:.4f}")

# AdaBoost with n_estimators=50
ada_multi = AdaBoostClassifier(n_estimators=50, random_state=42)
ada_multi.fit(X_train, y_train)
y_pred_multi = ada_multi.predict(X_test)
acc_multi = accuracy_score(y_test, y_pred_multi)
print(f"AdaBoost (50 estimators) Test Accuracy: {acc_multi:.4f}")

AdaBoost (1 estimator) Test Accuracy: 0.8947
AdaBoost (50 estimators) Test Accuracy: 0.9649


## Task 2.1
### Train a model with Gradient Boost with both low and high learning rate.

## Task 2.2
### Compare  "Fast Learner" (high learning rate) vs. a "Slow Learner" (low learning rate).

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# "Fast Learner" - high learning rate
gb_fast = GradientBoostingClassifier(learning_rate=1.0, n_estimators=100, random_state=42)
gb_fast.fit(X_train, y_train)
y_pred_fast = gb_fast.predict(X_test)
acc_fast = accuracy_score(y_test, y_pred_fast)
print(f"GB Fast Learner (lr=1.0) Test Accuracy: {acc_fast:.4f}")

# "Slow Learner" - low learning rate
gb_slow = GradientBoostingClassifier(learning_rate=0.01, n_estimators=100, random_state=42)
gb_slow.fit(X_train, y_train)
y_pred_slow = gb_slow.predict(X_test)
acc_slow = accuracy_score(y_test, y_pred_slow)
print(f"GB Slow Learner (lr=0.01) Test Accuracy: {acc_slow:.4f}")

GB Fast Learner (lr=1.0) Test Accuracy: 0.9649
GB Slow Learner (lr=0.01) Test Accuracy: 0.9561


## Task 3.2
### Train a model with XGBoost with both shallow and deep tree

## Task 3.2
### Compare a "Shallow" tree (depth=2) vs. a "Deep" tree (depth=15).

In [8]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Shallow trees
xgb_shallow = xgb.XGBClassifier(max_depth=2, n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_shallow.fit(X_train, y_train)
y_pred_shallow = xgb_shallow.predict(X_test)
acc_shallow = accuracy_score(y_test, y_pred_shallow)
print(f"XGBoost Shallow (depth=2) Test Accuracy: {acc_shallow:.4f}")

# Deep trees
xgb_deep = xgb.XGBClassifier(max_depth=15, n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_deep.fit(X_train, y_train)
y_pred_deep = xgb_deep.predict(X_test)
acc_deep = accuracy_score(y_test, y_pred_deep)
print(f"XGBoost Deep (depth=15) Test Accuracy: {acc_deep:.4f}")

XGBoost Shallow (depth=2) Test Accuracy: 0.9649
XGBoost Deep (depth=15) Test Accuracy: 0.9561


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Reduced grid for faster search (still powerful - 432 combinations â†’ ~5-15 minutes depending on hardware)
param_grid = {
    'max_depth': [3, 5, 7],              # Common good values
    'learning_rate': [0.01, 0.1, 0.3],   # Low, medium, high
    'n_estimators': [100, 200, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 5],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1.0]
}

# Note: removed use_label_encoder=False (deprecated and unnecessary)
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',   # Still good to silence warnings
    objective='binary:logistic'
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1  # Optional: shows progress
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate on test set
best_xgb = grid_search.best_estimator_
y_pred_best = best_xgb.predict(X_test)
acc_best = accuracy_score(y_test, y_pred_best)
print(f"Best XGBoost Test Accuracy: {acc_best:.4f}")

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 