# Random Forest

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import export_graphviz
from sklearn.datasets import load_breast_cancer,load_iris,load_wine
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pydot
%matplotlib inline

In [2]:
# Read the Ames housing table from the CSV file in the working directory.
df = pd.read_csv('AmesHousing.csv')

# SalePrice is the value to predict; every other column is kept as a feature.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [3]:
# Hold out 33% of the rows as a test set. The fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Grid Search with CV
Defining the parameter grid for GridSearchCV and setting up the RandomForestRegressor pipeline.

In [None]:
# Partition the training columns by dtype so each group gets its own transformer.
# NOTE(review): ColumnTransformer's default remainder setting drops any column
# that matches neither selector (e.g. bool or datetime) - confirm none exist.
numerical_cols = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

print(f"Categorical features: {categorical_cols}")
print(f"Numerical features: {numerical_cols}")

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Categories that were not seen during fit are encoded as
# all-zero rows (handle_unknown='ignore') instead of raising.
# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so the scaler is probably optional here - confirm before removing.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ]
)

# Hyper-parameter candidates for the forest:
#   max_depth    -> 1, 4, 7, 10
#   n_estimators -> the eight tree counts listed below
#   max_features -> 5, 10
# That is 4 * 8 * 2 = 64 combinations, each fitted once per CV fold.
param_grid = {
    'max_depth': np.arange(1, 11, 3),
    'n_estimators': [5, 15, 20, 30, 40, 50, 75, 100],
    'max_features': np.arange(5, 15, 5),
}

# Bootstrapped random forest that also records an out-of-bag score on every fit.
# NOTE(review): OOB is computed for each CV fit as well; it is only needed on
# the final refit model if it is used at all.
rf = RandomForestRegressor(random_state=42, oob_score=True, bootstrap=True)

# Chain preprocessing and the model so the transformers are re-fitted on the
# training portion of each CV fold.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf)])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
pipeline_param_grid = dict(
    (f'regressor__{name}', candidates) for name, candidates in param_grid.items()
)

# Exhaustive search with 4-fold CV on all available cores. No `scoring` is
# passed, so the estimator's own `score` method ranks the candidates (R^2 for
# scikit-learn regressors). refit=True retrains the best candidate on all of
# X_train once the search finishes.
grid_search = GridSearchCV(
    rf_pipeline,
    pipeline_param_grid,
    cv=4,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# The raw frame goes in as-is; encoding and scaling happen inside the pipeline.
print("Starting grid search with preprocessing...")
grid_search.fit(X_train, y_train)

print('\n' + '=' * 60)
print('GRID SEARCH RESULTS')
print('=' * 60)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameter setting in the grid:

In [12]:
# Show the hyper-parameter combination that ranked best in the grid search.
best_params = grid_search.best_params_
print(best_params)

{'regressor__max_depth': np.int64(10), 'regressor__max_features': np.int64(10), 'regressor__n_estimators': 75}
