In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the dataset
df = pd.read_csv('../data/clean/combined_dataset.csv')

In [4]:
# Define features (X) and target (y)
X = df[['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'AveOccup']]
y = df['Price']

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

## Use GridSearchCV for the Gradient Boosting Regressor hyperparameter tuning

In [8]:
# Initialize GridSearchCV
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

#Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and evaluate the model
best_params = grid_search.best_params_
print(f"Best Parameters for Gradient Boosting: {best_params}")

Best Parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150}


## Train and Evaluate the Final Model with Best Parameters

In [10]:
# Initialize the Gradient Boosting model with the best parameters
final_model = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=200,
    random_state=42
)

# Train the model using the entire training set
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_final = final_model.predict(X_test)

# Evaluate the final model
mae_final = mean_absolute_error(y_test, y_pred_final)
rmse_final = np.sqrt(mean_squared_error(y_test, y_pred_final))
r2_final = r2_score(y_test, y_pred_final)

# Print out the evaluation metrics
print(f"Final Model Evaluation - MAE: {mae_final}, RMSE: {rmse_final}, R2: {r2_final}")


Final Model Evaluation - MAE: 47545.3139190703, RMSE: 82990.24361340384, R2: 0.6104348665027471
