In [21]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


In [22]:
# Read the preprocessed train/test splits from CSV (features first, then targets).
# NOTE(review): the y_* files are loaded as DataFrames, not 1-D Series, so
# estimators fitted on them receive a 2-D target - confirm this is intended.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train.csv', '../data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

# Report the feature-matrix shapes as a quick sanity check on the load.
print("Preprocessed data loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Preprocessed data loaded successfully!
Training set shape: (404, 13)
Testing set shape: (102, 13)


In [23]:
# Feature selection: every column of the training frame is kept for now.
# The same column list is applied to both splits so they stay aligned.
selected_features = X_train.columns
X_train_selected, X_test_selected = (
    split[selected_features] for split in (X_train, X_test)
)
print("\nSelected Features:", [*selected_features])



Selected Features: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [24]:
# Fit an ordinary least-squares model on the selected training features.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train_selected, y_train)

# In-sample predictions, kept for the training-set metrics computed next.
y_train_pred = lr_model.predict(X_train_selected)

In [25]:
# Training-set fit quality of the linear model: MSE and R^2 between the
# training targets and the in-sample predictions.
train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
print(
    "\nLinear Regression - Training Performance:\n"
    f"Mean Squared Error (MSE): {train_mse:.2f}\n"
    f"R^2 Score: {train_r2:.2f}"
)


Linear Regression - Training Performance:
Mean Squared Error (MSE): 21.64
R^2 Score: 0.75


In [26]:
# 5-fold cross-validated R^2 on the training data, as a generalization check.
cv_scores = cross_val_score(
    estimator=lr_model,
    X=X_train_selected,
    y=y_train,
    scoring='r2',
    cv=5,
)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # reported as +/- two standard deviations across folds
print("\nCross-Validation R^2 Scores:", cv_scores)
print(f"Average CV R^2 Score: {cv_mean:.2f} (+/- {cv_spread:.2f})")


Cross-Validation R^2 Scores: [0.77501982 0.62424945 0.7594282  0.78766681 0.67581438]
Average CV R^2 Score: 0.72 (+/- 0.13)


In [27]:
# Hyperparameter tuning with Ridge Regression
#
# Each candidate regularization strength is scored by its mean 5-fold
# cross-validated R^2 on the training data. ridge_scores[i] is the score for
# alphas[i]; the two lists are index-aligned.

alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
ridge_scores = []

for alpha in alphas:
    # No explicit fit on the full training set here: cross_val_score clones the
    # estimator and fits the clones on each fold, so a fit at this point would
    # be discarded work repeated once per candidate.
    ridge_model = Ridge(alpha=alpha)
    cv_score = cross_val_score(ridge_model, X_train_selected, y_train, cv=5, scoring='r2').mean()
    ridge_scores.append(cv_score)
    print(f"Ridge (alpha={alpha}): CV R^2 Score = {cv_score:.2f}")

Ridge (alpha=0.01): CV R^2 Score = 0.72
Ridge (alpha=0.1): CV R^2 Score = 0.72
Ridge (alpha=1.0): CV R^2 Score = 0.72
Ridge (alpha=10.0): CV R^2 Score = 0.72
Ridge (alpha=100.0): CV R^2 Score = 0.70


In [28]:
# Choose the alpha with the highest mean CV R^2 (argmax returns the first
# index on ties), then refit Ridge with it on the full training set.
best_idx = int(np.argmax(ridge_scores))
best_alpha = alphas[best_idx]
print(f"\nBest alpha for Ridge: {best_alpha}")
ridge_model = Ridge(alpha=best_alpha).fit(X_train_selected, y_train)
ridge_model


Best alpha for Ridge: 1.0


In [29]:
# Save the trained models (optional)
#
# joblib.dump opens the target path for writing and fails with
# FileNotFoundError if the parent directory is missing, so create the output
# directory first (no-op when it already exists).
os.makedirs('../models', exist_ok=True)
joblib.dump(lr_model, '../models/linear_regression_model.pkl')
joblib.dump(ridge_model, '../models/ridge_model.pkl')
print("Trained models saved to '../models/' directory.")

Trained models saved to '../models/' directory.
