In [9]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [10]:
#loading dataset
df=pd.read_csv("boston.csv").dropna()
df.head()
#split dataset
X=df.iloc[:,:-1].values
y=df.MEDV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=88)
X.shape

(394, 13)

In [11]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0)

In [26]:
from sklearn.model_selection import RandomizedSearchCV
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}


In [27]:
random_search = RandomizedSearchCV(
    rf,     # Your RandomForestRegressor
    param_distributions=param_grid,
    n_iter=100,       # Number of random combinations to try
#    scoring='neg_mean_squared_error',  # Specify the appropriate scoring metric
    cv=3,             # Number of cross-validation folds
    n_jobs=-1,        # Use all available CPU cores for parallel processing
    verbose=1,        # Set verbosity level
    random_state=88   # Set a random seed for reproducibility
)

In [28]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [30]:
best_params = random_search.best_params_
best_params

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 30,
 'bootstrap': True}

In [32]:
# Create Bagging Regressor with Best params and fit with training data
best_rf = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    bootstrap=best_params['bootstrap'],
    oob_score=True
)

best_rf.fit(X_train,y_train)
# Predict on Train and Test Data
y_pred_train = best_rf.predict(X_train)
y_pred_test = best_rf.predict(X_test)
# Compute Train and Test and OOS Error
train_MSE = mean_squared_error(y_train, y_pred_train)
test_MSE = mean_squared_error(y_test, y_pred_test)
OOS_error = best_rf.oob_score_
print("Training error is :",train_MSE)
print("Validation error is :",test_MSE)
print("Out of bag error:",OOS_error)

Training error is : 1.5828631238095223
Validation error is : 17.713704746835447
Out of sample error: 0.8563939562406176


The OOB score is typically a value between 0 and 1, and it indicates the proportion of correctly predicted instances in the out-of-bag samples. Higher OOB scores, closer to 1, suggest that your model is doing a good job of generalizing from the training data to unseen data. Conversely, lower OOB scores indicate that the model is not generalizing as well.