# AML Random Forest

In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV

# Load the CSV files into dataframes
X_train = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_X_train.csv')
y_train = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_y_train.csv')
X_val = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_X_val.csv')
y_val = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_y_val.csv')
X_test = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_X_test.csv')
y_test = pd.read_csv('/Users/aliciayee/Library/Mobile Documents/com~apple~CloudDocs/CS610 Applied Machine Learning/Group Project/sg_used_cars_y_test.csv')

# Ensure that y values are in the correct shape
y_train = y_train.values.ravel()
y_val = y_val.values.ravel()
y_test = y_test.values.ravel()

# Parameter grid for RandomForestRegressor
rf_param_grid = {
    'n_estimators': [200, 250, 300],
    'max_features': [0.5, 0.6, 0.8],
    'max_depth': [15, 20],
    'min_samples_split': [1, 2],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False],
    'random_state': [2024]
}

# GridSearchCV for RandomForestRegressor
rf_grid_search = GridSearchCV(estimator=RandomForestRegressor(),
                              param_grid=rf_param_grid,
                              scoring='neg_root_mean_squared_error',
                              cv=3,
                              verbose=1,
                              n_jobs=-1)

# Fit GridSearchCV
rf_grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
best_rf_params = rf_grid_search.best_params_
print(f"Best parameters for RandomForestRegressor: {best_rf_params}")

# Function to train and evaluate the model
def train_and_evaluate(params, X_train, y_train, X_val, y_val, X_test, y_test):
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    
    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=2024)
    cv_scores = []
    for train_index, val_index in kf.split(X_train):
        X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
        
        model_cv = RandomForestRegressor(**params)
        model_cv.fit(X_train_cv, y_train_cv)
        
        y_val_pred = model_cv.predict(X_val_cv)
        cv_scores.append(r2_score(y_val_cv, y_val_pred))
    
    print(f'Cross-Validation R^2: {np.mean(cv_scores)}')
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate R^2, RMSE, and MAE
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    # Calculate Adjusted R^2
    n = len(y_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - (1-r2) * (n-1) / (n-p-1)
    
    print(f'R^2: {r2}')
    print(f'Adjusted R^2: {adjusted_r2}')
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')

# Train and evaluate with the best parameters from GridSearchCV
print("Training and evaluating with best parameters from GridSearchCV:")
train_and_evaluate(best_rf_params, X_train, y_train, X_val, y_val, X_test, y_test)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


216 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
216 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/aliciayee/Library/Python/3.12/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/aliciayee/Library/Python/3.12/lib/python/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/aliciayee/Library/Python/3.12/lib/python/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/aliciayee/Library/Python/3.12/lib/python/site-packages/sklearn/utils/_param_valida

Best parameters for RandomForestRegressor: {'bootstrap': False, 'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250, 'random_state': 2024}
Training and evaluating with best parameters from GridSearchCV:
Cross-Validation R^2: 0.9651283794103417
R^2: 0.9591649476866027
Adjusted R^2: 0.9585973014337665
RMSE: 39011.27921272259
MAE: 11189.203737361644
