# Task 1.1: Load and Prepare the Data

In [176]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [177]:
# Fetch the California housing dataset and split it into the feature
# matrix (X) and the target vector (y).
data = fetch_california_housing()
X, y = data.data, data.target

In [178]:
# Summarise the dataset: column names, target name and array shapes.
print(
    f"Feature Names: {data.feature_names}",
    f"Target Variable: {data.target_names}",
    f"Shape of X: {X.shape}, Shape of y: {y.shape}",
    sep="\n",
)

Feature Names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Target Variable: ['MedHouseVal']
Shape of X: (20640, 8), Shape of y: (20640,)


In [179]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [180]:
# Learn the scaling statistics from the training split only, then apply
# the same fitted scaler to both splits so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [181]:
# Sanity check: scaling must not change the shape of either split.
print(
    f"Scaled Train Shape: {X_train_scaled.shape}",
    f"Scaled Test Shape: {X_test_scaled.shape}",
    sep="\n",
)

Scaled Train Shape: (16512, 8)
Scaled Test Shape: (4128, 8)


# Task 1.2: Train the Simple Linear Regression Model

In [182]:
from sklearn.linear_model import LinearRegression

In [183]:
# Create the LinearRegression estimator with its default constructor
# arguments; it is fitted in a later cell.
lr_model = LinearRegression()

In [184]:
# Fit the linear model on the scaled training features.
lr_model.fit(X=X_train_scaled, y=y_train)

In [185]:
# Predict on the scaled held-out features.
y_pred_lr = lr_model.predict(X=X_test_scaled)

In [186]:
# Peek at the first five predictions only, rather than dumping the array.
print("Predictions (first 5):", y_pred_lr[:5])

Predictions (first 5): [0.71912284 1.76401657 2.70965883 2.83892593 2.60465725]


# Task 1.3: Evaluate the Simple Linear Regression Model

In [187]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [188]:
# Mean absolute error of the linear model on the test split.
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)

In [189]:
# Mean squared error of the linear model on the test split.
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)

In [190]:
# RMSE: square root of the linear model's MSE.
rmse_lr = np.sqrt(mse_lr)

In [191]:
# R-squared of the linear model on the test split.
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)

In [192]:
# Report the four test-set metrics for the plain linear model.
print(
    "Simple Linear Regression Evaluation:",
    f"MAE: {mae_lr}",
    f"MSE: {mse_lr}",
    f"RMSE: {rmse_lr}",
    f"R-squared: {r2_lr}",
    sep="\n",
)

Simple Linear Regression Evaluation:
MAE: 0.5332001304956566
MSE: 0.5558915986952442
RMSE: 0.7455813830127763
R-squared: 0.575787706032451


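R-squared represents the proportion of variance in the target variable (house price) explained by the model. An R-squared value closer to 1 means the model explains most of the variance; here, a value of about 0.576 means the linear model explains roughly 58% of the variance in house prices on the test set.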

# Task 2.1: Implement Ridge Regression

In [193]:
from sklearn.linear_model import Ridge

In [194]:
# Ridge (L2-penalised) regression with penalty strength alpha=1.0,
# fitted on the scaled training features.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X=X_train_scaled, y=y_train)


In [195]:
# Ridge predictions for the scaled held-out features.
y_pred_ridge = ridge_model.predict(X=X_test_scaled)

In [196]:
# Mean absolute error of the Ridge model on the test split.
mae_ridge = mean_absolute_error(y_true=y_test, y_pred=y_pred_ridge)

In [197]:
# Mean squared error of the Ridge model on the test split.
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)

In [198]:
# RMSE: square root of the Ridge model's MSE.
rmse_ridge = np.sqrt(mse_ridge)

In [199]:
# R-squared of the Ridge model on the test split.
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)

In [200]:
# Report the four test-set metrics for the Ridge model.
print(
    "Ridge Regression Evaluation:",
    f"MAE: {mae_ridge}",
    f"MSE: {mse_ridge}",
    f"RMSE: {rmse_ridge}",
    f"R-squared: {r2_ridge}",
    sep="\n",
)

Ridge Regression Evaluation:
MAE: 0.5331931195789733
MSE: 0.5558548589435974
RMSE: 0.7455567442814782
R-squared: 0.575815742891368


In [201]:
# Put the two R-squared scores next to each other for a direct comparison.
print(
    "\nR-Squared Comparison (Linear Regression vs Ridge):",
    f"Simple Linear Regression R-squared: {r2_lr}",
    f"Ridge Regression R-squared: {r2_ridge}",
    sep="\n",
)


R-Squared Comparison (Linear Regression vs Ridge):
Simple Linear Regression R-squared: 0.575787706032451
Ridge Regression R-squared: 0.575815742891368


# Task 2.2: Implement Lasso Regression

In [202]:
from sklearn.linear_model import Lasso

In [203]:
# Lasso estimator with alpha=0.01. Note the Ridge model in this notebook
# uses alpha=1.0, so the two penalties are not set to the same value.
lasso_model = Lasso(alpha=0.01)

In [204]:
# Fit the Lasso model on the scaled training features.
lasso_model.fit(X=X_train_scaled, y=y_train)

In [205]:
# Lasso predictions for the scaled held-out features.
y_pred_lasso = lasso_model.predict(X=X_test_scaled)

In [206]:
# Test-set metrics for the Lasso model; RMSE is derived from the MSE.
mae_lasso = mean_absolute_error(y_true=y_test, y_pred=y_pred_lasso)
mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)
r2_lasso = r2_score(y_true=y_test, y_pred=y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)


In [207]:
# Report the four test-set metrics for the Lasso model.
print(
    "Lasso Regression Evaluation:",
    f"MAE: {mae_lasso}",
    f"MSE: {mse_lasso}",
    f"RMSE: {rmse_lasso}",
    f"R-squared: {r2_lasso}",
    sep="\n",
)

Lasso Regression Evaluation:
MAE: 0.5353261423609051
MSE: 0.5482548967938964
RMSE: 0.7404423656125414
R-squared: 0.5816154300698727


In [208]:
# Show the fitted Lasso coefficients, one per input feature.
print("\nLasso Coefficients:", lasso_model.coef_, sep="\n")


Lasso Coefficients:
[ 0.80095744  0.12708701 -0.16275931  0.20620745 -0.         -0.03060176
 -0.79011254 -0.75567379]


In [209]:
# Qualitative summary of how the two penalties treat coefficients.
print(
    "\nComparison of Ridge and Lasso Coefficients:",
    "Ridge: The coefficients are not shrunk to zero.",
    "Lasso: Some coefficients may be shrunk to zero, effectively performing feature selection.",
    sep="\n",
)


Comparison of Ridge and Lasso Coefficients:
Ridge: The coefficients are not shrunk to zero.
Lasso: Some coefficients may be shrunk to zero, effectively performing feature selection.


# Task 3: Final Comparison and Conclusion

In [210]:
# One row per model, one column per metric, assembled from the scores
# computed in the evaluation cells.
comparison_table = pd.DataFrame(
    [
        ('Simple Linear Regression', r2_lr, rmse_lr, mae_lr),
        ('Ridge Regression', r2_ridge, rmse_ridge, mae_ridge),
        ('Lasso Regression', r2_lasso, rmse_lasso, mae_lasso),
    ],
    columns=['Model', 'R2 Score', 'RMSE', 'MAE'],
)

In [211]:
# Print the summary heading followed by the plain-text table.
print("Model Comparison Summary:", comparison_table, sep="\n")

Model Comparison Summary:
                      Model  R2 Score      RMSE       MAE
0  Simple Linear Regression  0.575788  0.745581  0.533200
1          Ridge Regression  0.575816  0.745557  0.533193
2          Lasso Regression  0.581615  0.740442  0.535326


In [212]:
# Pick the best model by test-set R-squared.
#
# The scores are collected in a name -> score mapping and compared against
# the maximum. A chain of strict ">" comparisons ending in a bare "else"
# would report Lasso whenever two other models tie for first place; this
# version reports every model that shares the top score instead.
r2_by_model = {
    "Simple Linear Regression": r2_lr,
    "Ridge Regression": r2_ridge,
    "Lasso Regression": r2_lasso,
}
top_r2 = max(r2_by_model.values())
best_models = [name for name, score in r2_by_model.items() if score == top_r2]

print("\nBest Model Analysis:")
if len(best_models) == 1:
    print(f"{best_models[0]} performs the best based on R-squared value.")
else:
    # Two or more models share the highest score: name all of them rather
    # than arbitrarily crowning one.
    print(f"{', '.join(best_models)} tie for the best R-squared value.")


Best Model Analysis:
Lasso Regression performs the best based on R-squared value.


In [213]:
# Closing explanation of why the penalised models are used.
print(
    "\nRole of Regularization (Ridge and Lasso) in Preventing Overfitting:",
    "Both Ridge and Lasso regularization techniques help prevent overfitting by adding a penalty to the model coefficients.",
    "Ridge uses L2 regularization to shrink the coefficients, while Lasso uses L1 regularization, which can shrink some coefficients to zero, performing feature selection.",
    sep="\n",
)


Role of Regularization (Ridge and Lasso) in Preventing Overfitting:
Both Ridge and Lasso regularization techniques help prevent overfitting by adding a penalty to the model coefficients.
Ridge uses L2 regularization to shrink the coefficients, while Lasso uses L1 regularization, which can shrink some coefficients to zero, performing feature selection.
