### 3. Model Building and Training

#### Task 3: Model Training

**Loading processed dataset and selecting top features**

In [2]:
# Load processed data
import joblib
data = joblib.load("../data/processed_data.pkl")

X_train, X_test = data['X_train'], data['X_test']
y_train, y_test = data['y_train'], data['y_test']

# Select top features from EDA wit two categorical feature (RM, LSTAT most correlated with MEDV)
selected_features = ['RM', 'LSTAT', 'PTRATIO', 'INDUS', 'TAX', 'RAD_4.0', 'CHAS']
X_train = X_train[selected_features]
X_test = X_test[selected_features]

print("Selected Features:", selected_features)

Selected Features: ['RM', 'LSTAT', 'PTRATIO', 'INDUS', 'TAX', 'RAD_4.0', 'CHAS']


**Training a Linear Regression Model**

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score

# Initialize and train
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate
y_pred = lr.predict(X_test)
print("Linear Regression Results:")
print(f"- RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"- R²: {r2_score(y_test, y_pred):.2f}")

Linear Regression Results:
- RMSE: 3.72
- R²: 0.72


**Hyperparameter Tuning (Ridge Regression)**

In [12]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error
import numpy as np

# Define the parameter grid for hyperparameter tuning
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Initialize the Ridge regression model
ridge_model = Ridge()

# Set up GridSearchCV with cross-validation
ridge_grid_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_root_mean_squared_error',  # Scoring metric
    return_train_score=True  # Optionally return training scores
)

# Fit the model with cross-validation
ridge_grid_search.fit(X_train, y_train)

# Retrieve the best model and its parameters
best_ridge_model = ridge_grid_search.best_estimator_
best_alpha = ridge_grid_search.best_params_['alpha']

# Predict on the test set using the best model
y_pred_ridge = best_ridge_model.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_test, y_pred_ridge)

# Print the results
print("\nRidge Regression Results:")
print(f"- Best alpha: {best_alpha}")
print(f"- RMSE: {rmse:.2f}")


Ridge Regression Results:
- Best alpha: 10
- RMSE: 3.71


**Hyperparameter Tuning (RandomForestRegressor)**

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]  # Minimum number of samples required at each leaf node
}

# Initialize the RandomForestRegressor model
rf_model = RandomForestRegressor(random_state=42)

# Set up GridSearchCV with cross-validation
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_root_mean_squared_error',  # Scoring metric
)

# Fit the model with cross-validation
rf_grid_search.fit(X_train, y_train)

# Retrieve the best model and its parameters
best_rf_model = rf_grid_search.best_estimator_
best_params = rf_grid_search.best_params_

# Predict on the test set using the best model
y_pred_rf = best_rf_model.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_test, y_pred_rf)

# Print the results
print("\nRandom Forest Regression Results:")
print(f"- Best parameters: {best_params}")
print(f"- RMSE: {rmse:.2f}")


Random Forest Regression Results:
- Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
- RMSE: 3.18


**Hyperparameter Tuning (Gradiant Boosting)**

In [9]:
from xgboost import XGBRegressor

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth' : [3, 6],
    'gamma': [0, 0.1] # Added to control overfitting
}

# Initialize the RandomForestRegressor model
xgb_model = XGBRegressor(random_state=42, 
                         enable_categorical=True,
                         early_stopping_rounds = 10,
                         eval_metric = 'rmse')

# Set up GridSearchCV with cross-validation
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_root_mean_squared_error',  # Scoring metric
    n_jobs = -1 # Parallelize computation
)

# Fit the model with cross-validation
xgb_grid_search.fit(X_train, y_train,
                    eval_set = [(X_test, y_test)], # Valiation data for early stopping
                    verbose = False
                    )

# Retrieve the best model and its parameters
best_xgb_model = xgb_grid_search.best_estimator_
best_params = xgb_grid_search.best_params_

# Predict on the test set using the best model
y_pred_xgb = best_xgb_model.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_test, y_pred_xgb)

# Print the results
print("\nRandom Forest Regression Results:")
print(f"- Best parameters: {best_params}")
print(f"- RMSE: {rmse:.2f}")


Random Forest Regression Results:
- Best parameters: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
- RMSE: 3.03


**Comparing Hyperparameter tuning models together**

In [10]:
import pandas as pd

models = {
    'Ridge': ridge_grid_search,
    'RandomForest': rf_grid_search,
    'XGBoost': xgb_grid_search
}

results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    results.append({
        'Model': name,
        'RMSE': root_mean_squared_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred),
        'Best Params': model.best_params_
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values('RMSE'))

          Model      RMSE        R²  \
2       XGBoost  3.032855  0.811994   
1  RandomForest  3.180640  0.793225   
0         Ridge  3.711583  0.718429   

                                         Best Params  
2  {'gamma': 0, 'learning_rate': 0.1, 'max_depth'...  
1  {'max_depth': 10, 'min_samples_leaf': 2, 'min_...  
0                                      {'alpha': 10}  


**Saving the best model**

In [11]:
# Save the best performing model
joblib.dump(xgb_grid_search, "../models/best_model.pkl")
print("\nSaved best model to ../models/best_model.pkl")


Saved best model to ../models/best_model.pkl
