In [3]:

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset bundled with scikit-learn and pull out the
# feature matrix, the target vector and the feature names.
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

# Hold out 20% of the samples for testing. The split is seeded for
# reproducibility and stratified on the target.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Decision tree with no depth limit set.
dt_full = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_train_dt_full = dt_full.predict(X_train)
y_test_dt_full = dt_full.predict(X_test)

# Same classifier, but with its depth capped at 3 levels.
dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
y_train_dt_pruned = dt_pruned.predict(X_train)
y_test_dt_pruned = dt_pruned.predict(X_test)

# Compare train vs. test accuracy for the two trees.
dt_full_train_acc = accuracy_score(y_train, y_train_dt_full)
dt_full_test_acc = accuracy_score(y_test, y_test_dt_full)
dt_pruned_train_acc = accuracy_score(y_train, y_train_dt_pruned)
dt_pruned_test_acc = accuracy_score(y_test, y_test_dt_pruned)

print(f"Full Train: {dt_full_train_acc:.3f}, Test: {dt_full_test_acc:.3f}")
print(f"Pruned Train: {dt_pruned_train_acc:.3f}, Test: {dt_pruned_test_acc:.3f}")


# Random forest of 100 trees, seeded, fitted with n_jobs=-1.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)
y_train_rf = rf.predict(X_train)
y_test_rf = rf.predict(X_test)

rf_train_acc = accuracy_score(y_train, y_train_rf)
rf_test_acc = accuracy_score(y_test, y_test_rf)
print(f"Random Train: {rf_train_acc:.3f}, Random Test: {rf_test_acc:.3f}")

# Rank the forest's feature importances, largest first, labelled by feature name.
rf_importance = pd.Series(data=rf.feature_importances_, index=feature_names)
rf_importance = rf_importance.sort_values(ascending=False)
rf_top5 = rf_importance.head(5)
print("Top 5 :", rf_top5)


# Baseline gradient boosting model: only the seed is set, every other
# hyperparameter is left at the library default.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_train_gb = gb.predict(X_train)
y_test_gb = gb.predict(X_test)

print(f"Gradient Boosting Accuracy Train: {accuracy_score(y_train, y_train_gb):.3f}, Test: {accuracy_score(y_test, y_test_gb):.3f}")

# Hyperparameter sweep: every learning rate is combined with every ensemble size.
learning_rates = [0.01, 0.1]
n_estimators_list = [50, 100, 200]

# Fitted sweep models, keyed by (learning_rate, n_estimators).
# The sweep uses its own variable instead of rebinding `gb`, so `gb` keeps
# referring to the baseline model that produced y_train_gb / y_test_gb.
gb_sweep_models = {}

print(" Gradient Boosting Hyperparameter Sweep ")
print("lr | n_estimators | train_acc | test_acc")
for lr in learning_rates:
    for n_est in n_estimators_list:
        gb_candidate = GradientBoostingClassifier(learning_rate=lr, n_estimators=n_est, random_state=42)
        gb_candidate.fit(X_train, y_train)
        gb_sweep_models[(lr, n_est)] = gb_candidate
        train_acc = accuracy_score(y_train, gb_candidate.predict(X_train))
        test_acc = accuracy_score(y_test, gb_candidate.predict(X_test))
        print(f"{lr:<4} | {n_est:<12} | {train_acc:.3f}    | {test_acc:.3f}")

# NOTE(review): the sweep reports accuracy on the held-out test set; choosing
# hyperparameters from these numbers would leak test information. Prefer
# cross-validation on the training split for actual model selection.

# Feature importances are taken from an explicitly chosen sweep model (the last
# learning rate with the last ensemble size) rather than from whatever model
# the loop happened to leave behind.
gb_importance_model = gb_sweep_models[(learning_rates[-1], n_estimators_list[-1])]
importance = pd.Series(gb_importance_model.feature_importances_, index=feature_names).sort_values(ascending=False)
print("Top 5 ", importance.head(5))


Full Train: 1.000, Test: 0.912
Pruned Train: 0.976, Test: 0.939
Random Train: 1.000, Random Test: 0.956
Top 5 : worst area              0.140016
worst concave points    0.129530
worst radius            0.097696
mean concave points     0.090885
worst perimeter         0.072226
dtype: float64
Gradient Boosting Accuracy Train: 1.000, Test: 0.956
 Gradient Boosting Hyperparameter Sweep 
lr | n_estimators | train_acc | test_acc
0.01 | 50           | 0.976    | 0.939
0.01 | 100          | 0.987    | 0.921
0.01 | 200          | 0.993    | 0.930
0.1  | 50           | 1.000    | 0.947
0.1  | 100          | 1.000    | 0.956
0.1  | 200          | 1.000    | 0.956
Top 5  worst radius            0.435471
worst perimeter         0.271465
worst concave points    0.106543
worst texture           0.052636
mean concave points     0.030458
dtype: float64
