In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load Preprocessed data from .pkl files

In [None]:
# Load the preprocessed train/test splits from the pickles in ../data. @sabinvankathmandu
# The files are read in the order X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/X_train.pkl",
        "../data/X_test.pkl",
        "../data/y_train.pkl",
        "../data/y_test.pkl",
    )
)

# Report the dimensions of both feature sets as a quick sanity check.
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Train the linear regression model

In [None]:
# Create the baseline linear regression and fit it on the training split. @sabinvankathmandu
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)
model  # last expression: shows the fitted estimator as the cell output

In [None]:
# Show the learned parameters of the fitted linear model. @sabinvankathmandu
for label, value in (
    ("Model Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
):
    print(label, value)

# Evaluate the Model

In [None]:
# Predict on both splits with the fitted model. @sabinvankathmandu
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Mean squared error and R² for each split (training first, then test). @sabinvankathmandu
mse_train, r2_train = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
mse_test, r2_test = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print(f"Training MSE: {mse_train:.2f}, Training R²: {r2_train:.2f}")
print(f"Test MSE: {mse_test:.2f}, Test R²: {r2_test:.2f}")


# Actual vs Predicted Prices Visualization

In [None]:
# Scatter of actual vs predicted test-set values with a y = x reference line. @sabinvankathmandu
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_test_pred, alpha=0.7, ax=ax)

# Identity line spanning the observed range of y_test; a perfect prediction
# would fall exactly on it.
low, high = y_test.min(), y_test.max()
ax.plot([low, high], [low, high], "--r", lw=2)

ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted House Prices")
plt.show()


# Save the trained model

In [None]:
# Persist the fitted baseline linear regression model to ../model.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; without this, a fresh checkout that lacks
# ../model fails here with FileNotFoundError.
os.makedirs("../model", exist_ok=True)
joblib.dump(model, "../model/linear_regression_model.pkl")
print("Model saved successfully!")

# Hyperparameter Tuning

In [None]:
# Code assisted by ChatGPT
# Candidate regularisation strengths: 50 log-spaced values from 1e-4 to 1e2.
alpha_values = np.logspace(-4, 2, num=50)

# Default-configured estimators, each paired with its own search grid over alpha.
ridge, ridge_params = Ridge(), {"alpha": alpha_values}
lasso, lasso_params = Lasso(), {"alpha": alpha_values}


In [None]:
# Perform a grid search over alpha for both regularised models on the training split.
# Settings shared by both searches: 5-fold cross-validation, R² as the
# selection metric, and n_jobs=-1 to run the fits in parallel.
search_settings = {"cv": 5, "scoring": "r2", "n_jobs": -1}

# Ridge first, then Lasso; fit() returns the search object itself.
ridge_grid = GridSearchCV(ridge, ridge_params, **search_settings).fit(X_train, y_train)
lasso_grid = GridSearchCV(lasso, lasso_params, **search_settings).fit(X_train, y_train)

# Report the alpha selected by each search.
print(f"Best Alpha for Ridge: {ridge_grid.best_params_['alpha']}")
print(f"Best Alpha for Lasso: {lasso_grid.best_params_['alpha']}")



In [None]:
# Take the tuned Ridge and Lasso models straight from the grid searches.
# With refit=True (the GridSearchCV default, which ridge_grid and lasso_grid
# use), the search refits the best parameter combination on the full data
# passed to fit(), so best_estimator_ is already trained on X_train/y_train
# with the best alpha. Fitting fresh Ridge/Lasso instances with that alpha
# would only repeat the same work.
ridge_best = ridge_grid.best_estimator_
lasso_best = lasso_grid.best_estimator_


In [None]:
# Compare the tuned Ridge and Lasso models on the test split.
# Predictions from each tuned model.
ridge_pred, lasso_pred = ridge_best.predict(X_test), lasso_best.predict(X_test)

# R² of each model's test predictions.
ridge_r2, lasso_r2 = r2_score(y_test, ridge_pred), r2_score(y_test, lasso_pred)

print(f"Ridge R²: {ridge_r2:.4f}")
print(f"Lasso R²: {lasso_r2:.4f}")

In [None]:
# Persist the tuned Ridge and Lasso models to ../model.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; without this, a fresh checkout that lacks
# ../model fails here with FileNotFoundError.
os.makedirs("../model", exist_ok=True)
joblib.dump(ridge_best, "../model/ridge_model.pkl")
joblib.dump(lasso_best, "../model/lasso_model.pkl")

print("Optimized Ridge & Lasso models saved!")
