In [None]:
import pandas as pd

# Column holding the regression target and the columns used as input features.
target_name = "Body Mass (g)"
feature_names = [
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
]

# Read the CSV, keep only the columns of interest, drop every row with a
# missing value in any of them, then shuffle all rows with a fixed seed and
# rebuild a fresh 0..n-1 index.
dataset = (
    pd.read_csv("../datasets/penguins.csv")[feature_names + [target_name]]
    .dropna(axis="rows", how="any")
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Split the cleaned frame into the feature matrix and the target column.
data = dataset[feature_names]
target = dataset[target_name]

In [None]:

from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor

# Cross-validation setting reused by the later cells of this notebook.
cv = 10

# Baseline: a single decision tree with default hyperparameters and a fixed seed.
tree = DecisionTreeRegressor(random_state=0)
cv_results_tree = cross_validate(
    tree,
    data,
    target,
    cv=cv,
    return_train_score=True,
)

# Mean and standard deviation of the test scores, shown as the cell output.
(
    cv_results_tree["test_score"].mean(),
    cv_results_tree["test_score"].std(),
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Evaluate a random forest (default hyperparameters, fixed seed) with the same
# `cv` setting that was used for the single decision tree.
rf = RandomForestRegressor(random_state=0)
cv_results_rf = cross_validate(rf, data, target, cv=cv, return_train_score=True)

# Print the test scores explicitly: a bare expression is only displayed when it
# is the last statement of a cell, so here it would be silently discarded.
print(
    "Random forest test score: "
    f"{cv_results_rf['test_score'].mean():.3f} +/- "
    f"{cv_results_rf['test_score'].std():.3f}"
)

# Element-wise comparison of the per-iteration test scores. The total number of
# iterations is read from the results instead of being hard-coded, so the
# message stays correct if `cv` is changed.
print(
    "Random forest is better than a single decision tree for "
    f"{sum(cv_results_rf['test_score'] > cv_results_tree['test_score'])} "
    f"CV iterations out of {len(cv_results_rf['test_score'])}"
)

In [None]:
# Two forests that differ only in their number of trees (same seed).
rf_5_trees = RandomForestRegressor(n_estimators=5, random_state=0)
rf_100_trees = RandomForestRegressor(n_estimators=100, random_state=0)

# Evaluate both with the same `cv` setting so that their per-iteration scores
# can be compared element-wise below.
cv_results_rf_5_trees = cross_validate(
    rf_5_trees, data, target, cv=cv, return_train_score=True
)
cv_results_rf_100_trees = cross_validate(
    rf_100_trees, data, target, cv=cv, return_train_score=True
)

# The total number of iterations is read from the results instead of being
# hard-coded, so the message stays correct if `cv` is changed.
print(
    "Random forest with 100 trees is better than a random forest with 5 trees for "
    f"{sum(cv_results_rf_100_trees['test_score'] > cv_results_rf_5_trees['test_score'])} "
    f"CV iterations out of {len(cv_results_rf_100_trees['test_score'])}"
)

In [None]:
def _format_train_test_scores(cv_results):
    """Format the train and test scores of a cross-validation run.

    Parameters
    ----------
    cv_results : mapping
        Must contain "train_score" and "test_score" entries that expose
        ``mean()`` and ``std()`` (as the arrays stored by the
        ``cross_validate(..., return_train_score=True)`` calls above do).

    Returns
    -------
    str
        ``"train: <mean> +/- <std>, test: <mean> +/- <std>"`` with every
        number rounded to three decimals.
    """
    train_scores = cv_results["train_score"]
    test_scores = cv_results["test_score"]
    return (
        f"train: {train_scores.mean():.3f} +/- {train_scores.std():.3f}, "
        f"test: {test_scores.mean():.3f} +/- {test_scores.std():.3f}"
    )


# Same summary line for both forests; only the label and the results differ.
print(
    "Scores for random forest with 5 trees: "
    + _format_train_test_scores(cv_results_rf_5_trees)
)
print(
    "Scores for random forest with 100 trees: "
    + _format_train_test_scores(cv_results_rf_100_trees)
)

In [None]:
import numpy as np
from sklearn.model_selection import ValidationCurveDisplay

# Forest sizes to evaluate, from a single tree up to one thousand trees.
n_estimators = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000])

# Draw the train/test score of `rf` for every candidate value of
# `n_estimators`, using the shared `cv` setting and two parallel jobs.
disp = ValidationCurveDisplay.from_estimator(
    rf,
    data,
    target,
    cv=cv,
    n_jobs=2,
    param_name="n_estimators",
    param_range=n_estimators,
    scoring="r2",  # this is already the default for regression
    score_name="R2 score",
    std_display_style="errorbar",
)

# Assigning to `_` keeps the return value of `set` out of the cell output.
_ = disp.ax_.set(
    title="Validation curve for Random Forest",
    xlabel="Number of trees",
)

In [None]:
# A random forest restricted to a single tree (bootstrap left at its default).
rf_1_tree = RandomForestRegressor(n_estimators=1, random_state=0)

# Store the results under their own name: writing them into `cv_results_tree`
# would overwrite the decision tree results that an earlier cell compares
# against, making that cell give a different answer when re-run. The shared
# `cv` setting is used instead of a hard-coded fold count.
cv_results_rf_1_tree = cross_validate(
    rf_1_tree, data, target, cv=cv, return_train_score=True
)

# Per-iteration train scores, shown as the cell output.
cv_results_rf_1_tree["train_score"]

In [None]:
# Plain decision tree, evaluated with the shared `cv` setting (rather than a
# hard-coded fold count) so that this cell stays in sync with the other
# experiments if `cv` is changed.
tree = DecisionTreeRegressor(random_state=0)
cv_results_tree = cross_validate(
    tree, data, target, cv=cv, return_train_score=True
)

# Per-iteration train scores, shown as the cell output.
cv_results_tree["train_score"]

In [None]:
# Single-tree forest again, this time with bootstrap resampling switched off.
rf_1_tree = RandomForestRegressor(
    n_estimators=1,
    bootstrap=False,
    random_state=0,
)

# Only the per-iteration train scores are displayed; the full results
# dictionary is not kept.
cross_validate(
    rf_1_tree,
    data,
    target,
    cv=cv,
    return_train_score=True,
)["train_score"]

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Values of `max_iter` to evaluate for the histogram gradient boosting model.
max_iters = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500])
hgbdt = HistGradientBoostingRegressor(random_state=0)

# Draw the train/test score of `hgbdt` for every candidate value of
# `max_iter`, using the shared `cv` setting and two parallel jobs.
disp = ValidationCurveDisplay.from_estimator(
    hgbdt,
    data,
    target,
    cv=cv,
    n_jobs=2,
    param_name="max_iter",
    param_range=max_iters,
    scoring="r2",  # note: this is already the default for regression
    score_name="R2 score",
    std_display_style="errorbar",
)

# Assigning to `_` keeps the return value of `set` out of the cell output.
_ = disp.ax_.set(
    title="Validation curve for Histogram GBDT",
    xlabel="(Maximum) number of trees",
)

In [None]:
# Gradient boosting with early stopping enabled. The fitted estimators are
# kept (`return_estimator=True`) so that they can be inspected afterwards.
hgbdt = HistGradientBoostingRegressor(early_stopping=True, random_state=0)
cv_results_hgbdt = cross_validate(
    hgbdt,
    data,
    target,
    cv=cv,
    return_estimator=True,
    return_train_score=True,
)

# Mean and standard deviation of the train scores, shown as the cell output.
(
    cv_results_hgbdt["train_score"].mean(),
    cv_results_hgbdt["train_score"].std(),
)

In [None]:
# Mean and standard deviation of the test scores, shown as the cell output.
(
    cv_results_hgbdt["test_score"].mean(),
    cv_results_hgbdt["test_score"].std(),
)

In [None]:
# Report `n_iter_` of each fitted estimator, numbering the CV iterations from 1.
for iteration, fitted_model in enumerate(cv_results_hgbdt["estimator"], start=1):
    print(f"For CV iteration {iteration}, {fitted_model.n_iter_} trees were built")