In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Bagging and Random Forest Regressor on California Housing Dataset

## Imports

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.metrics import mean_absolute_error, confusion_matrix,\
ConfusionMatrixDisplay, classification_report

from sklearn.model_selection import train_test_split,\
cross_validate, cross_val_score, ShuffleSplit, \
RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor

In [3]:
np.random.seed(306) 

Let's use `ShuffleSplit` as cv with 10 splits and 20% e.g. set aside as test examples.

In [4]:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

Let's download the data and split it into training and test sets.

In [5]:
# fetch dataset
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
labels *=100

# train-test split
com_train_features, test_features, com_train_labels, test_labels = \
    train_test_split(features, labels, random_state=42)

# train --> train + dev split
train_features, dev_features, train_labels, dev_labels = \
    train_test_split(com_train_features, com_train_labels, random_state=42)


## Training different regressors

Let's train different regressors:

In [6]:
def train_regressor(estimator, X_train, y_train, cv, name):
    cv_results = cross_validate(estimator,
                               X_train,
                               y_train,
                               cv=cv,
                               scoring="neg_mean_absolute_error",
                               return_train_score=True,
                               return_estimator=True)
    
    cv_train_error = -1 * cv_results['train_score']
    cv_test_error = -1 * cv_results['test_score']
    
    print(f"On an average, {name} makes an error of "
          f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")
    print(f"On an average, {name} makes an error of "
          f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.")

In [7]:
#title Decision Tree Regressor
train_regressor(
    DecisionTreeRegressor(), com_train_features,
    com_train_labels, cv, 'decision tree regressor')

On an average, decision tree regressor makes an error of 0.000k +/- 0.000k on the training set.
On an average, decision tree regressor makes an error of 47.259k +/- 1.142k on the test set.


In [8]:
#title Bagging Regressor
train_regressor(
    BaggingRegressor(), com_train_features,
    com_train_labels, cv, 'bagging regressor')

On an average, bagging regressor makes an error of 14.377k +/- 0.196k on the training set.
On an average, bagging regressor makes an error of 35.217k +/- 0.608k on the test set.


### RandomForest regressor

In [9]:
train_regressor(
    RandomForestRegressor(), com_train_features,\
    com_train_labels, cv, 'random forest regressor')

On an average, random forest regressor makes an error of 12.642k +/- 0.071k on the training set.
On an average, random forest regressor makes an error of 33.198k +/- 0.717k on the test set.


## Parameter search for random forest regressor

In [10]:
param_distributions = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2), param_distributions=param_distributions,
    scoring="neg_mean_absolute_error", n_iter=10, random_state=0, n_jobs=2,)

search_cv.fit(com_train_features, com_train_labels)

columns = [f"param_{name}" for name in param_distributions.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,mean_test_error,std_test_error
0,500,100,40.641923,0.733708
2,10,100,41.081103,0.92107
7,100,50,43.872041,0.802726
8,1,100,45.717665,1.180473
6,50,20,49.465011,1.167198
1,100,20,49.480914,1.021785
9,10,20,50.056112,1.445609
3,500,10,55.022199,1.076063
4,5,5,61.822161,1.052154
5,5,2,73.288226,1.257658


In [11]:
error = -search_cv.score(test_features, test_labels)
print(f"On average, our random forest regressor makes an error of {error:.2f} k$")

On average, our random forest regressor makes an error of 40.46 k$


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Define the hyperparameter grid
param_grid = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the GridSearchCV object
grid_cv = GridSearchCV(
    RandomForestRegressor(n_jobs=2),
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

# Fit the model with the training data
grid_cv.fit(com_train_features, com_train_labels)

# Extract and display the results
columns = [f"param_{name}" for name in param_grid.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(grid_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(grid_cv.best_params_)
print("\nBest Mean Test Error:", -grid_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'max_leaf_nodes': 100, 'n_estimators': 100}

Best Mean Test Error: 40.55061229196913

Results Table:
   param_n_estimators param_max_leaf_nodes  mean_test_error  std_test_error
51                100                  100        40.550612        0.756845
53                500                  100        40.610522        0.693956
52                200                  100        40.613473        0.802481
50                 50                  100        40.722904        0.728099
49                 20                  100        41.035598        0.882256
48                 10                  100        41.446081        0.773629
47                  5                  100        41.921071        0.773260
46                  2                  100        43.729194        0.683593
44                500                   50        43.773841        0.832582
41                 50                   50        43.796824        0.819423
42                100                   50   

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the hyperparameter distribution for RandomizedSearchCV
param_dist = {
    "regressor__n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "regressor__max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the RandomForestRegressor
rf_reg = RandomForestRegressor(n_jobs=2)

# Create a pipeline with feature selection and regression
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(rf_reg)),
    ('scaler', StandardScaler()),  # You can add other preprocessing steps here
    ('regressor', rf_reg)
])

# Instantiate the RandomizedSearchCV object
randomized_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    scoring="neg_mean_absolute_error",
    n_iter=10,  # Number of parameter settings sampled
    n_jobs=2,
)

# Fit the model with the training data
randomized_cv.fit(com_train_features, com_train_labels)

# Extract and display the results
columns = [f"param_{name}" for name in param_dist.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(randomized_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(randomized_cv.best_params_)
print("\nBest Mean Test Error:", -randomized_cv.best_score_)
print("\nResults Table:")
print(result_table)



Best Parameters:
{'regressor__n_estimators': 50, 'regressor__max_leaf_nodes': 100}

Best Mean Test Error: 54.1448868729796

Results Table:
  param_regressor__n_estimators param_regressor__max_leaf_nodes  \
4                            50                             100   
8                           200                              20   
6                             2                              50   
2                             2                              20   
0                           500                              10   
7                            50                              10   
3                            10                              10   
9                            50                               5   
5                            50                               2   
1                             5                               2   

   mean_test_error  std_test_error  
4        54.144887        1.103991  
8        54.705657        1.145223  
6        54.

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the hyperparameter distribution for RandomizedSearchCV
param_dist = {
    "regressor__n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "regressor__max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the RandomForestRegressor
rf_reg = RandomForestRegressor(n_jobs=2)

# Create a pipeline with feature selection and regression
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(rf_reg)),
    ('scaler', StandardScaler()),  # You can add other preprocessing steps here
    ('regressor', rf_reg)
])

# Instantiate the RandomizedSearchCV object
randomized_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    scoring="neg_mean_absolute_error",
    n_iter=10,  # Number of parameter settings sampled
    n_jobs=2,
)

# Fit the model with the training data
randomized_cv.fit(com_train_features, com_train_labels)

# Extract and display the results
columns = [f"param_{name}" for name in param_dist.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(randomized_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(randomized_cv.best_params_)
print("\nBest Mean Test Error:", -randomized_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'regressor__n_estimators': 10, 'regressor__max_leaf_nodes': 100}

Best Mean Test Error: 54.28954231618407

Results Table:
  param_regressor__n_estimators param_regressor__max_leaf_nodes  \
6                            10                             100   
1                           100                              10   
7                            20                              10   
3                             1                              10   
9                            50                               5   
8                             2                               5   
4                            10                               2   
2                            50                               2   
5                           500                               2   
0                             5                               2   

   mean_test_error  std_test_error  
6        54.289542        1.127776  
1        56.894332        1.054914  
7        57