### 3. Model Building and Training

#### Task 3: Model Training

Notebook: notebooks/Model_Training.ipynb
Steps:

- Choose appropriate features for the model.
- Train a linear regression model.
- Perform hyperparameter tuning (if applicable).

- Script: scripts/train_model.py


In [2]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Step 2: Load Preprocessed Data
file_path = "../data/processed_boston_housing.csv"  # Path to preprocessed dataset
data = pd.read_csv(file_path)

# Define target column and features
target_column = 'medv'  # Target variable
features = [col for col in data.columns if col != target_column]

# Step 3: Train-Test Split
X = data[features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train-Test Split Complete")
print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")

# Step 4: Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate model performance on training and testing sets
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
r2 = r2_score(y_test, test_predictions)

print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)
print(f"Training RMSE: {train_rmse:.2f}")
print(f"Testing RMSE: {test_rmse:.2f}")
print(f"R-squared: {r2:.2f}")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print first five predictions
print("Predicted Prices:", y_pred[:5])

# Step 5: Hyperparameter Tuning (if applicable)
# Linear regression has no major hyperparameters, but if using Ridge or Lasso regression:
from sklearn.linear_model import Ridge
ridge_model = Ridge()
param_grid = {'alpha': [0.1, 1.0, 10.0]}  # Example parameter grid
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

# Save the model
joblib.dump(grid_search.best_estimator_, "../models/linear_model.pkl")
print("Model Saved Successfully")


Train-Test Split Complete
Training Set Size: (170, 13)
Testing Set Size: (43, 13)
Model Coefficients: [ 1.53057136e-01  3.06168521e-01 -1.06016681e-01  8.88178420e-16
 -3.64374162e-01  1.94145844e+00 -1.30570677e+00 -1.60895754e+00
  7.48661330e-01 -8.66184053e-01 -9.63797424e-01 -8.11177953e-02
 -3.89674639e-01]
Model Intercept: 22.110035725827643
Training RMSE: 2.12
Testing RMSE: 2.22
R-squared: 0.68
Predicted Prices: [18.83185683 17.24406412 22.99501443 19.53202966 25.40250153]
Best Hyperparameters: {'alpha': 10.0}
Model Saved Successfully
