Use a dataset suitable for Linear Regression (e.g., house prices). Before training, apply
feature scaling or standardization to the independent variables, first with NumPy and/or
Pandas and then with Scikit-learn's preprocessing tools. Implement the Linear Regression
model and evaluate the models using R², RMSE, and MAE. Discuss overfitting and apply
regularization techniques if needed.

1. Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fetch the California housing data; the returned object exposes
# .data, .target and .feature_names, which are used below.
housing = fetch_california_housing()

# Wrap the raw arrays in labelled Pandas objects:
#   y -> target Series named 'MedHouseVal'
#   X -> feature DataFrame with one named column per feature
y = pd.Series(housing.target, name='MedHouseVal')
X = pd.DataFrame(housing.data, columns=housing.feature_names)

# Quick inspection: first rows of the features, then dtypes / non-null counts
print("--- Initial Data (X) ---")
print(X.head())
print("\n--- Data Info ---")
X.info()

# Train/test split: 80% of rows for fitting, 20% held out for evaluation.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

--- Initial Data (X) ---
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
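 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB

Training set shape: (16512, 8)
Testing set shape: (4128, 8)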

2. Preprocessing & Model 1: Manual Standardization (NumPy/Pandas)

In [2]:
# --- Manual Standardization (Method 1) ---

# Step 1: per-feature statistics, computed on the TRAINING split only so that
# no information from the test set leaks into preprocessing.
# NOTE: pandas' .std() defaults to the sample standard deviation (ddof=1),
# so the scaled values can differ in the last few decimals from a scaler that
# divides by the population standard deviation (ddof=0).
train_std = X_train.std()
train_mean = X_train.mean()

print("\n--- Training Set Mean ---")
print(train_mean)

# Step 2: z-score both splits with the training statistics.
# .sub()/.div() align the Series on column labels, exactly like `-` and `/`.
X_train_manual_scaled = X_train.sub(train_mean).div(train_std)
X_test_manual_scaled = X_test.sub(train_mean).div(train_std)

print("\n--- Manually Scaled Training Data (Head) ---")
print(X_train_manual_scaled.head())

# --- Model 1: ordinary least squares on the manually scaled features ---
print("\n--- Training Model 1 (Manual Scaling) ---")
model_manual = LinearRegression()
model_manual.fit(X_train_manual_scaled, y_train)

# --- Model 1: score the held-out predictions ---
y_pred_manual = model_manual.predict(X_test_manual_scaled)

print("\n--- Model 1 Evaluation (Manual Scaling) ---")
mae_manual = mean_absolute_error(y_test, y_pred_manual)
# RMSE is the square root of the mean squared error
rmse_manual = np.sqrt(mean_squared_error(y_test, y_pred_manual))
r2_manual = r2_score(y_test, y_pred_manual)

print(f"R-squared (R²): {r2_manual:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_manual:.4f}")
print(f"Mean Absolute Error (MAE): {mae_manual:.4f}")


--- Training Set Mean ---
MedInc           3.880754
HouseAge        28.608285
AveRooms         5.435235
AveBedrms        1.096685
Population    1426.453004
AveOccup         3.096961
Latitude        35.643149
Longitude     -119.582290
dtype: float64

--- Manually Scaled Training Data (Head) ---
         MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  \
14196 -0.326186  0.348480 -0.174911  -0.208359    0.768253  0.051375   
8267  -0.035842  1.618069 -0.402823  -0.128526   -0.098898 -0.117359   
17445  0.144697 -1.952651  0.088213  -0.257530   -0.449804 -0.032279   
14265 -1.017834  0.586528 -0.599997  -0.145152   -0.007434  0.077505   
2271  -0.171483  1.141973  0.348997   0.086622   -0.485862 -0.068830   

       Latitude  Longitude  
14196 -1.372770   1.272548  
8267  -0.876669   0.709141  
17445 -0.460133  -0.447590  
14265 -1.382130   1.232661  
2271   0.532068  -0.108548  

--- Training Model 1 (Manual Scaling) ---

--- Model 1 Evaluation (Manual Scaling) ---
R-squared (R²): 0.5758
Root Mean Squared Error (RMSE): 0.7456
Mean Absolute Error (MAE): 0.5332

3. Preprocessing & Model 2: Scikit-learn StandardScaler

In [3]:
# --- Scikit-learn Standardization (Method 2) ---

# Create the scaler, learn its statistics from the TRAINING data only, and
# scale the training data in one step (fit_transform == fit, then transform).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# The test data is transformed with the statistics learned from training data;
# the scaler is never re-fitted on it.
X_test_scaled = scaler.transform(X_test)

# The scaler hands back plain NumPy arrays (not DataFrames), hence the slicing
print("\n--- Scikit-learn Scaled Training Data (First 5 Rows) ---")
print(X_train_scaled[:5])

# --- Model 2: ordinary least squares on the StandardScaler output ---
print("\n--- Training Model 2 (StandardScaler) ---")
model_sklearn = LinearRegression()
model_sklearn.fit(X_train_scaled, y_train)

# --- Model 2: score the held-out predictions ---
y_pred_sklearn = model_sklearn.predict(X_test_scaled)

print("\n--- Model 2 Evaluation (StandardScaler) ---")
mae_sklearn = mean_absolute_error(y_test, y_pred_sklearn)
# RMSE is the square root of the mean squared error
rmse_sklearn = np.sqrt(mean_squared_error(y_test, y_pred_sklearn))
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print(f"R-squared (R²): {r2_sklearn:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_sklearn:.4f}")
print(f"Mean Absolute Error (MAE): {mae_sklearn:.4f}")


--- Scikit-learn Scaled Training Data (First 5 Rows) ---
[[-0.326196    0.34849025 -0.17491646 -0.20836543  0.76827628  0.05137609
  -1.3728112   1.27258656]
 [-0.03584338  1.61811813 -0.40283542 -0.12853018 -0.09890135 -0.11736222
  -0.87669601  0.70916212]
 [ 0.14470145 -1.95271028  0.08821601 -0.25753771 -0.44981806 -0.03227969
  -0.46014647 -0.44760309]
 [-1.01786438  0.58654547 -0.60001532 -0.14515634 -0.00743434  0.07750687
  -1.38217186  1.23269811]
 [-0.17148831  1.14200767  0.3490073   0.08662432 -0.48587717 -0.06883176
   0.5320839  -0.10855122]]

--- Training Model 2 (StandardScaler) ---

--- Model 2 Evaluation (StandardScaler) ---
R-squared (R²): 0.5758
Root Mean Squared Error (RMSE): 0.7456
Mean Absolute Error (MAE): 0.5332


4. Overfitting and Regularization

In [4]:
# Overfitting check: compare R² on the data the model was fitted on with R² on
# the held-out data. A training score far above the testing score would
# indicate that the model does not generalize.
train_score = r2_score(y_train, model_sklearn.predict(X_train_scaled))  # R² on training data
test_score = r2_score(y_test, model_sklearn.predict(X_test_scaled))     # R² on testing data

print("\n--- Overfitting Check ---")
print(f"Training R²: {train_score:.4f}")
print(f"Testing R²:  {test_score:.4f}")


--- Overfitting Check ---
Training R²: 0.6126
Testing R²:  0.5758


Model 3: Ridge Regression (L2)

In [5]:
print("\n--- Training Model 3 (Ridge Regression, L2) ---")
# Ridge adds an L2 penalty on the coefficients; alpha sets its strength
# (the larger alpha is, the stronger the regularization).
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)

# --- Model 3: score the held-out predictions ---
y_pred_ridge = ridge_model.predict(X_test_scaled)

print("\n--- Model 3 Evaluation (Ridge) ---")
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
# RMSE is the square root of the mean squared error
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"R-squared (R²): {r2_ridge:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ridge:.4f}")


--- Training Model 3 (Ridge Regression, L2) ---

--- Model 3 Evaluation (Ridge) ---
R-squared (R²): 0.5758
Root Mean Squared Error (RMSE): 0.7456
Mean Absolute Error (MAE): 0.5332


Model 4: Lasso Regression (L1)

In [6]:
print("\n--- Training Model 4 (Lasso Regression, L1) ---")
# Lasso adds an L1 penalty on the coefficients; a smaller alpha than the one
# used for Ridge is chosen here.
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train_scaled, y_train)

# --- Model 4: score the held-out predictions ---
y_pred_lasso = lasso_model.predict(X_test_scaled)

print("\n--- Model 4 Evaluation (Lasso) ---")
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
# RMSE is the square root of the mean squared error
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"R-squared (R²): {r2_lasso:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lasso:.4f}")
print(f"Mean Absolute Error (MAE): {mae_lasso:.4f}")


--- Training Model 4 (Lasso Regression, L1) ---

--- Model 4 Evaluation (Lasso) ---
R-squared (R²): 0.5816
Root Mean Squared Error (RMSE): 0.7404
Mean Absolute Error (MAE): 0.5353


5. Final Comparison

In [7]:
# --- Final Model Comparison ---

# Side-by-side table of the fitted coefficients: one row per feature,
# one column per model (plain linear, Ridge, Lasso).
print("\n--- Coefficient Comparison ---")
coef_df = pd.DataFrame({'Feature': housing.feature_names})
coef_df['Linear'] = model_sklearn.coef_
coef_df['Ridge'] = ridge_model.coef_
coef_df['Lasso'] = lasso_model.coef_
print(coef_df)

# Test-set metrics for the same three models, indexed by model label
print("\n--- Metrics Comparison ---")
metrics = {
    'Model': ['Linear', 'Ridge (alpha=1)', 'Lasso (alpha=0.01)'],
    'R²': [r2_sklearn, r2_ridge, r2_lasso],
    'RMSE': [rmse_sklearn, rmse_ridge, rmse_lasso],
    'MAE': [mae_sklearn, mae_ridge, mae_lasso],
}
metrics_df = pd.DataFrame(metrics)
metrics_df = metrics_df.set_index('Model')
# Round only for display; metrics_df itself keeps full precision
print(metrics_df.round(4))


--- Coefficient Comparison ---
      Feature    Linear     Ridge     Lasso
0      MedInc  0.854383  0.854327  0.800957
1    HouseAge  0.122546  0.122624  0.127087
2    AveRooms -0.294410 -0.294210 -0.162759
3   AveBedrms  0.339259  0.339008  0.206207
4  Population -0.002308 -0.002282 -0.000000
5    AveOccup -0.040829 -0.040833 -0.030602
6    Latitude -0.896929 -0.896168 -0.790113
7   Longitude -0.869842 -0.869071 -0.755674

--- Metrics Comparison ---
                        R²    RMSE     MAE
Model                                     
Linear              0.5758  0.7456  0.5332
Ridge (alpha=1)     0.5758  0.7456  0.5332
Lasso (alpha=0.01)  0.5816  0.7404  0.5353
