### 3. Model Building and Training
#### Task 3: Model Training

Notebook: notebooks/Model_Training.ipynb
Steps:
- Choose appropriate features for the model.
- Train a linear regression model.
- Perform hyperparameter tuning (if applicable).

- Script: scripts/train_model.py


In [33]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [34]:
###Load the Preprocessed Dataset
def read_file(filename):
    filepath = '../Data/'+str(filename)
    return pd.read_csv(filepath)

X_test = read_file('X_test.csv')
y_test = read_file('y_test.csv')
X_train = read_file('X_train.csv')
y_train = read_file('y_train.csv')

print(len(X_test), len(X_train), len(y_test), len(y_train))

51 201 51 201


### Train a Linear Regression Model

In [37]:
# Initialize the model
model = LinearRegression()

# Train (fit) the model
model.fit(X_train, y_train)

# Print model coefficients
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)


Model Coefficients: [[ 0.17061797  0.17041086 -0.00658663 -0.23182495  2.47266374 -1.13655857
  -1.0625756   0.46516574 -0.74433858 -0.93069212  0.13205168 -0.67872902
   0.47862257]]
Model Intercept: [22.67517414]


### Make predictions

In [38]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Print first five predictions
print("Predicted Prices:", y_pred[:5])


Predicted Prices: [[22.45519388]
 [19.3892715 ]
 [15.39708857]
 [32.18473759]
 [19.01763406]]


### Evaluate Model Performance

In [39]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R²) Score
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


Mean Squared Error: 5.044789881350664
R-squared Score: 0.7648975829968503


Hyperparameter Tuning (Random Forest Rgressor)

In [40]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Define hyperparameter grid
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of trees
    "max_depth": [None, 10, 20, 30],  # Maximum depth of trees
    "min_samples_split": [2, 5, 10],  # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4]  # Minimum samples at leaf node
}

# Initialize GridSearchCV
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",
    n_jobs=-1,  # Use all CPU cores
    verbose=2
)

# Fit GridSearchCV
rf_grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", rf_grid_search.best_params_)
print("Best Score:", np.sqrt(-rf_grid_search.best_score_))  # Convert to RMSE

# Evaluate on test set
best_rf_model = rf_grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Test Set RMSE:", rmse)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 2.5609500786250043
Test Set RMSE: 2.1818048668701993


### Hyperparameter Tuning (gradient boosting)

In [42]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define the Gradient Boosting model
gb = XGBRegressor(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    "n_estimators": [100, 200, 300],  # Number of trees
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    "max_depth": [3, 4, 5, 6],  # Maximum depth of trees
    "min_samples_split": [2, 5, 10],  # Minimum samples required to split a node
    "min_samples_leaf": [1, 3, 5],  # Minimum samples in a leaf node
    "subsample": [0.7, 0.8, 0.9, 1.0]  # Fraction of samples used per tree
}

# Using GridSearchCV for exhaustive search
gb_grid_search = GridSearchCV(gb, param_grid, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

# Get the best model and hyperparameters
best_gb = gb_grid_search.best_estimator_
best_params = gb_grid_search.best_params_

# Evaluate the best model
y_pred_gb = best_gb.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Print Results
print(f"Best Hyperparameters: {best_params}")
print(f"MSE (Gradient Boosting): {mse_gb}")
print(f"R² Score (Gradient Boosting): {r2_gb}")


Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.9}
MSE (Gradient Boosting): 5.1362818132835315
R² Score (Gradient Boosting): 0.7606337666511536


Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### Hyperparameter Tuning (Ridge Rigresson)

In [46]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the Ridge Regression model
ridge = Ridge()

# Define hyperparameter grid for tuning (alpha values)
param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Using GridSearchCV for exhaustive search
ridge_grid_search = GridSearchCV(ridge, param_grid, scoring="neg_mean_squared_error", cv=5)
ridge_grid_search.fit(X_train, y_train)

# Best parameters and model
best_ridge = ridge_grid_search.best_estimator_
best_alpha = ridge_grid_search.best_params_["alpha"]

# Evaluate the best model
y_pred_ridge = best_ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print Results
print(f"Best Alpha: {best_alpha}")
print(f"MSE (Ridge): {mse_ridge}")
print(f"R² Score (Ridge): {r2_ridge}")


Best Alpha: 10
MSE (Ridge): 5.124147378088594
R² Score (Ridge): 0.7611992844097519


###Hyperparameter Tuning(Gradient Boosting Regressor)

In [49]:

models = {
    'Ridge': ridge_grid_search,
    'RandomForest': rf_grid_search,
    'GradientBoost': gb_grid_search
}

results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    results.append({
        'Model': name,
        'MSE': mean_squared_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred),
        'Best Params': model.best_params_
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values('MSE'))

           Model       MSE        R²  \
1   RandomForest  4.760272  0.778157   
0          Ridge  5.124147  0.761199   
2  GradientBoost  5.136282  0.760634   

                                         Best Params  
1  {'max_depth': None, 'min_samples_leaf': 2, 'mi...  
0                                      {'alpha': 10}  
2  {'learning_rate': 0.2, 'max_depth': 3, 'min_sa...  


In [55]:
import joblib 
# Save the best performing model
joblib.dump(rf_grid_search, "../data/best_model.pkl")
print("\nSaved best model to ../data/best_model.pkl")



Saved best model to ../data/best_model.pkl
