In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.metrics import mean_absolute_error, confusion_matrix,\
ConfusionMatrixDisplay, classification_report

from sklearn.model_selection import train_test_split,\
cross_validate, cross_val_score, ShuffleSplit, \
RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor

In [None]:
import numpy as np
# Seed NumPy's global random state so NumPy-based randomness starts from a fixed point.
np.random.seed(seed=306)

In [None]:
# Ten random shuffle splits with 20% held out each time and a fixed seed;
# this splitter is passed to the train_regressor calls below.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the California housing data as a feature frame and a target series.
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)
# Scale the target by 100 in place; the reporting cells print errors with a "k" suffix.
# NOTE(review): presumably this converts units of $100,000 into thousands of
# dollars - confirm against the dataset documentation.
labels *= 100

# First split: hold out a final test set (default split size, fixed seed).
(com_train_features, test_features,
 com_train_labels, test_labels) = train_test_split(
    features, labels, random_state=42)

# Second split: carve a development set out of the combined training data.
(train_features, dev_features,
 train_labels, dev_labels) = train_test_split(
    com_train_features, com_train_labels, random_state=42)


In [None]:
def train_regressor(estimator, X_train, y_train, cv, name):
    """Cross-validate ``estimator`` and print its mean absolute error.

    The estimator is evaluated with ``cross_validate`` using the
    ``neg_mean_absolute_error`` scorer. The negated scores are flipped back
    into positive errors and summarised as mean +/- standard deviation over
    the splits.

    Parameters
    ----------
    estimator : estimator object
        Regressor handed to ``cross_validate``.
    X_train, y_train
        Features and targets that ``cross_validate`` splits according to ``cv``.
    cv
        Cross-validation strategy, forwarded unchanged to ``cross_validate``.
    name : str
        Label used for the model in the printed messages.

    Returns
    -------
    None
        The summary is only printed.

    Notes
    -----
    The "test set" in the second message is the held-out part of each
    cross-validation split of ``X_train``; it is not a separate test set.
    """
    # The fitted per-split estimators are never used here, so they are not
    # requested (no ``return_estimator=True``); this avoids keeping one fitted
    # model per split in memory.
    cv_results = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_mean_absolute_error",
        return_train_score=True,
    )

    # The scorer reports negated MAE; flip the sign to get positive errors.
    cv_train_error = -1 * cv_results['train_score']
    cv_test_error = -1 * cv_results['test_score']

    print(f"On an average, {name} makes an error of "
          f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")
    print(f"On an average, {name} makes an error of "
          f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.")

In [None]:
#title Decision Tree Regressor
# Cross-validated MAE of a single decision tree with default hyperparameters.
train_regressor(
    estimator=DecisionTreeRegressor(),
    X_train=com_train_features,
    y_train=com_train_labels,
    cv=cv,
    name='decision tree regressor',
)

On an average, decision tree regressor makes an error of 0.000k +/- 0.000k on the training set.
On an average, decision tree regressor makes an error of 47.259k +/- 1.142k on the test set.


In [None]:
#title Bagging Regressor
# Cross-validated MAE of a bagging ensemble with default hyperparameters.
train_regressor(
    estimator=BaggingRegressor(),
    X_train=com_train_features,
    y_train=com_train_labels,
    cv=cv,
    name='bagging regressor',
)

On an average, bagging regressor makes an error of 14.377k +/- 0.196k on the training set.
On an average, bagging regressor makes an error of 35.217k +/- 0.608k on the test set.


In [None]:
# Cross-validated MAE of a random forest with default hyperparameters.
train_regressor(
    estimator=RandomForestRegressor(),
    X_train=com_train_features,
    y_train=com_train_labels,
    cv=cv,
    name='random forest regressor',
)

On an average, random forest regressor makes an error of 12.642k +/- 0.071k on the training set.
On an average, random forest regressor makes an error of 33.198k +/- 0.718k on the test set.


In [None]:
# Candidate values the randomized search samples from.
param_distributions = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Seed the forest as well as the search. With random_state=0 only on the
# search, the sampled candidates are fixed but each forest is still grown from
# an unseeded random state, so scores drift from run to run (the candidates
# are fitted with n_jobs=2, so the notebook-level np.random.seed is not a
# reliable source of determinism here).
search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2, random_state=0),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=10,
    random_state=0,
    n_jobs=2,
)

search_cv.fit(com_train_features, com_train_labels)

# Build a compact table: sampled parameters plus the cross-validated error.
columns = [f"param_{name}" for name in param_distributions.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
# The scorer is negated MAE, so flip the sign to report a positive error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,mean_test_error,std_test_error
0,500,100,40.62118,0.742372
2,10,100,41.480141,0.795252
7,100,50,43.873942,0.776364
8,1,100,46.39309,1.1133
1,100,20,49.478504,1.074316
6,50,20,49.525263,1.081072
9,10,20,49.548978,1.454742
3,500,10,55.001188,1.060603
4,5,5,61.52684,1.119192
5,5,2,73.257739,1.203809


In [None]:
# The search was configured with the neg_mean_absolute_error scorer, so the
# score on the held-out test split is negated to obtain a positive error.
error = -1 * search_cv.score(test_features, test_labels)
print(
    "On average, our random forest regressor makes an error of "
    f"{error:.2f} k$"
)

On average, our random forest regressor makes an error of 40.46 k$


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Define the hyperparameter grid (every combination below is evaluated).
param_grid = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the GridSearchCV object.
# The forest is given a fixed random_state: the grid itself is deterministic,
# but an unseeded forest makes the scores of each combination vary between
# runs (the candidates are fitted with n_jobs=2, so the notebook-level
# np.random.seed is not a reliable source of determinism here).
grid_cv = GridSearchCV(
    RandomForestRegressor(n_jobs=2, random_state=0),
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

# Fit the model with the training data
grid_cv.fit(com_train_features, com_train_labels)

# Extract the results: parameter columns plus the cross-validated error.
columns = [f"param_{name}" for name in param_grid.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(grid_cv.cv_results_)
# The scorer is negated MAE, so flip the sign to report a positive error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(grid_cv.best_params_)
print("\nBest Mean Test Error:", -grid_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'max_leaf_nodes': 100, 'n_estimators': 500}

Best Mean Test Error: 40.621791692773094

Results Table:
   param_n_estimators param_max_leaf_nodes  mean_test_error  std_test_error
53                500                  100        40.621792        0.741882
52                200                  100        40.634130        0.758320
51                100                  100        40.640434        0.791751
50                 50                  100        40.753253        0.679084
49                 20                  100        40.963110        0.646039
48                 10                  100        41.164154        0.688871
47                  5                  100        41.904222        0.614989
46                  2                  100        43.665310        1.874430
44                500                   50        43.826681        0.834229
40                 20                   50        43.830968        0.683715
42                100                   50  

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the hyperparameter distribution for RandomizedSearchCV.
# Only the final 'regressor' step is tuned; the selector's forest keeps its defaults.
param_dist = {
    "regressor__n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "regressor__max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the RandomForestRegressor used as the final pipeline step.
# A fixed random_state makes the fitted forests repeatable between runs.
rf_reg = RandomForestRegressor(n_jobs=2, random_state=0)

# Create a pipeline with feature selection and regression.
# The selector gets its own forest instance instead of sharing `rf_reg` with
# the final step, so the two steps are visibly independent objects.
pipeline = Pipeline([
    ('feature_selection',
     SelectFromModel(RandomForestRegressor(n_jobs=2, random_state=0))),
    ('scaler', StandardScaler()),  # You can add other preprocessing steps here
    ('regressor', rf_reg)
])

# Instantiate the RandomizedSearchCV object.
# random_state=0 fixes which parameter settings are sampled, matching the
# seeded randomized search on the plain forest earlier in the notebook;
# without it every run evaluates a different set of candidates.
randomized_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    scoring="neg_mean_absolute_error",
    n_iter=10,  # Number of parameter settings sampled
    random_state=0,
    n_jobs=2,
)

# Fit the model with the training data
randomized_cv.fit(com_train_features, com_train_labels)

# Extract the results: parameter columns plus the cross-validated error.
columns = [f"param_{name}" for name in param_dist.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(randomized_cv.cv_results_)
# The scorer is negated MAE, so flip the sign to report a positive error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(randomized_cv.best_params_)
print("\nBest Mean Test Error:", -randomized_cv.best_score_)
print("\nResults Table:")
print(result_table)



Best Parameters:
{'regressor__n_estimators': 20, 'regressor__max_leaf_nodes': 50}

Best Mean Test Error: 54.16977131724252

Results Table:
  param_regressor__n_estimators param_regressor__max_leaf_nodes  \
7                            20                              50   
8                           100                              20   
4                             2                              20   
2                           200                              10   
6                            20                              10   
1                             1                              10   
9                           200                               5   
5                            20                               5   
3                             2                               5   
0                             1                               2   

   mean_test_error  std_test_error  
7        54.169771        1.048682  
8        54.722009        1.139000  
4        55.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd

# NOTE(review): this cell repeats the preceding pipeline search cell verbatim
# (same pipeline, same parameter lists); consider keeping only one of them.

# Define the hyperparameter distribution for RandomizedSearchCV.
# Only the final 'regressor' step is tuned; the selector's forest keeps its defaults.
param_dist = {
    "regressor__n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "regressor__max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the RandomForestRegressor used as the final pipeline step.
# A fixed random_state makes the fitted forests repeatable between runs.
rf_reg = RandomForestRegressor(n_jobs=2, random_state=0)

# Create a pipeline with feature selection and regression.
# The selector gets its own forest instance instead of sharing `rf_reg` with
# the final step, so the two steps are visibly independent objects.
pipeline = Pipeline([
    ('feature_selection',
     SelectFromModel(RandomForestRegressor(n_jobs=2, random_state=0))),
    ('scaler', StandardScaler()),  # You can add other preprocessing steps here
    ('regressor', rf_reg)
])

# Instantiate the RandomizedSearchCV object.
# random_state=0 fixes which parameter settings are sampled, matching the
# seeded randomized search on the plain forest earlier in the notebook;
# without it every run evaluates a different set of candidates.
randomized_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    scoring="neg_mean_absolute_error",
    n_iter=10,  # Number of parameter settings sampled
    random_state=0,
    n_jobs=2,
)

# Fit the model with the training data
randomized_cv.fit(com_train_features, com_train_labels)

# Extract the results: parameter columns plus the cross-validated error.
columns = [f"param_{name}" for name in param_dist.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(randomized_cv.cv_results_)
# The scorer is negated MAE, so flip the sign to report a positive error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(randomized_cv.best_params_)
print("\nBest Mean Test Error:", -randomized_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'regressor__n_estimators': 100, 'regressor__max_leaf_nodes': 50}

Best Mean Test Error: 54.11154715562445

Results Table:
  param_regressor__n_estimators param_regressor__max_leaf_nodes  \
9                           100                              50   
1                            50                              50   
2                             1                              20   
0                           100                              10   
8                            10                              10   
7                             1                              10   
6                           100                               5   
3                            20                               2   
5                            50                               2   
4                           200                               2   

   mean_test_error  std_test_error  
9        54.111547        1.132657  
1        54.153498        1.108226  
2        56