In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


In [None]:
# Fetch the California housing dataset through scikit-learn and separate it
# into a labelled feature table (X) and the regression target array (y).
data = fetch_california_housing()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [None]:
# Preview the feature table (the notebook display elides the middle rows)
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [None]:
# Size of the raw feature table as (rows, columns)
X.shape

(20640, 8)

In [None]:
# Feature engineering: expand the 8 raw columns with every degree-2 term
# (squares and pairwise products). The constant bias column is left out,
# so the expanded matrix ends up with 44 columns.
poly_transformer = PolynomialFeatures(
    degree=2,
    include_bias=False,
)
X_poly = poly_transformer.fit(X).transform(X)


In [None]:
# Size of the feature matrix after the polynomial expansion as (rows, columns)
X_poly.shape

(20640, 44)

In [None]:
# Inspect the expanded features; this is a plain NumPy array, so its columns are unlabelled
X_poly

array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00, ...,
         1.43489440e+03, -4.63007240e+03,  1.49401729e+04],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00, ...,
         1.43337960e+03, -4.62724920e+03,  1.49377284e+04],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00, ...,
         1.43262250e+03, -4.62678400e+03,  1.49426176e+04],
       ...,
       [ 1.70000000e+00,  1.70000000e+01,  5.20554273e+00, ...,
         1.55472490e+03, -4.77970460e+03,  1.46942884e+04],
       [ 1.86720000e+00,  1.80000000e+01,  5.32951289e+00, ...,
         1.55472490e+03, -4.78364760e+03,  1.47185424e+04],
       [ 2.38860000e+00,  1.60000000e+01,  5.25471698e+00, ...,
         1.54999690e+03, -4.77321880e+03,  1.46991376e+04]])

In [None]:
# Wrap the expanded matrix in a DataFrame so it can be inspected as a table.
# Each column is labelled with the term it holds (e.g. "MedInc", "MedInc^2",
# "MedInc HouseAge") rather than the anonymous 0..43 integer labels a bare
# array would receive, so the engineered features stay identifiable.
X_poly_df = pd.DataFrame(
    X_poly,
    columns=poly_transformer.get_feature_names_out(X.columns),
)

In [None]:
# Display the expanded features as a table
X_poly_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,69.308955,341.3332,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,68.913242,174.3294,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,52.669855,377.3848,...,246016.0,1389.920904,18773.60,-60631.04,7.852660,106.065537,-342.548249,1432.6225,-4626.7840,14942.6176
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,293.4412,...,311364.0,1421.753425,21120.30,-68215.50,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,200.0024,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,2.434536,39.0075,...,714025.0,2163.712121,33360.60,-102321.05,6.556703,101.092727,-310.063788,1558.6704,-4780.6332,14662.7881
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,6.537226,46.0224,...,126736.0,1111.719298,14058.44,-43150.76,9.751924,123.319649,-378.515439,1559.4601,-4786.5829,14691.8641
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,2.890000,28.9000,...,1014049.0,2341.914550,39706.01,-122068.54,5.408579,91.699792,-281.913487,1554.7249,-4779.7046,14694.2884
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,3.486436,33.6096,...,549081.0,1573.297994,29217.63,-89898.12,4.508017,83.718138,-257.587736,1554.7249,-4783.6476,14718.5424


In [None]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# train/test partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardize the features. The scaler learns its statistics from the
# training rows only and then applies that same transform to both splits, so
# no test-set information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Instantiate the ridge regression model with default settings (its alpha is
# chosen by the grid search that follows)
model = Ridge()

In [None]:
# Hyperparameter tuning: 5-fold cross-validated search over 20 alpha values
# spaced logarithmically from 1e-4 to 1e4, scored by negated MSE
# (higher is better).
# NOTE(review): X_train_scaled was standardised using all training rows before
# the folds are formed, so every validation fold contributed to the scaler
# statistics. Consider a Pipeline(StandardScaler, Ridge) — TODO confirm whether
# this matters here.
param_grid = {'alpha': np.logspace(start=-4, stop=4, num=20)}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Report the hyperparameter setting selected by the grid search
tuned_params = grid_search.best_params_
print("Best parameters:", tuned_params)

Best parameters: {'alpha': 545.5594781168514}


In [None]:
# Use the best model: the estimator GridSearchCV kept for the winning alpha
# (with the default refit=True it has been refitted on the full training set)
best_model = grid_search.best_estimator_

In [None]:
# Make predictions on the testing set (scaled with the training-set scaler,
# matching the inputs the model was fitted on)
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Score the held-out predictions: mean squared error and R^2
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Report each test-set metric on its own line
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("R-squared:", r_squared),
):
    print(label, value)

Mean Squared Error (MSE): 0.48881623498299953
R-squared: 0.6275799634061007
