<a href="https://colab.research.google.com/github/Sans-codes/2462364_SanskritiAryal/blob/main/Worksheet7_SanskritiAryal_5CS037_ConceptsAndTechnologiesOfAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [None]:
import pandas as pd
import numpy as np
import os
import kagglehub

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

Load Dataset using kagglehub

In [None]:
# Resolve the local directory of the Kaggle dataset via kagglehub
path = kagglehub.dataset_download("camnugent/california-housing-prices")
print("Dataset path:", path)

# Build the CSV location inside that directory and read it into a DataFrame
housing_data_path = os.path.join(path, "housing.csv")
df = pd.read_csv(filepath_or_buffer=housing_data_path)

# Show the first five rows as the cell's rich output
df.head()

Using Colab cache for faster access to the 'california-housing-prices' dataset.
Dataset path: /kaggle/input/california-housing-prices


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


ONE-HOT ENCODE CATEGORICAL COLUMN

In [None]:
# Expand the string-valued ocean_proximity column into one indicator
# column per category so the linear models receive only numeric input.
df_encoded = pd.get_dummies(data=df, columns=["ocean_proximity"])

# Preview the encoded frame
df_encoded.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False


Features and Target

In [None]:
# Target: the column the regressors predict
y = df_encoded["median_house_value"]

# Features: every remaining column of the encoded frame
X = df_encoded.drop(columns="median_house_value")

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (20640, 13)
y shape: (20640,)


Train–Test Split (80% / 20%)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scikit-learn's LinearRegression / Ridge / Lasso raise ValueError on NaN input,
# and the Kaggle housing.csv is known to contain missing values (total_bedrooms).
# Impute with medians learned from the TRAINING split only, so no information
# from the test rows leaks into the fitted models. Columns without missing
# values are left untouched by fillna.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fail loudly here rather than deep inside a later .fit() call.
assert not X_train.isna().any().any(), "NaN values remain in X_train"
assert not X_test.isna().any().any(), "NaN values remain in X_test"

print("Training size:", X_train.shape)
print("Test size:", X_test.shape)

Training size: (16512, 13)
Test size: (4128, 13)


Part 1: Regression Task

Baseline Linear Regression (No Regularization)

In [None]:
# Ordinary least squares with no penalty term; serves as the reference model.
linear_model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test error can be compared
y_train_pred = linear_model.predict(X_train)
y_test_pred = linear_model.predict(X_test)

baseline_train_mse = mean_squared_error(y_train, y_train_pred)
baseline_test_mse = mean_squared_error(y_test, y_test_pred)

print("Baseline Linear Regression")
print("Training MSE:", baseline_train_mse)
print("Test MSE:", baseline_test_mse)

In [None]:
# Heading and coefficient array on separate lines (one coefficient per feature column)
print("Linear Regression Coefficients:", linear_model.coef_, sep="\n")

RIDGE REGRESSION (L2) WITH GRIDSEARCHCV

In [None]:
# Candidate L2 penalty strengths, spanning several orders of magnitude
ridge_params = {"alpha": [0.01, 0.1, 1, 10, 100]}

ridge = Ridge()

# 5-fold cross-validated search; scikit-learn maximizes the score,
# hence the negated MSE.
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring="neg_mean_squared_error",
    cv=5,
)
ridge_grid.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_grid.best_params_)

In [None]:
# Estimator selected by the grid search (already fitted by GridSearchCV)
ridge_best = ridge_grid.best_estimator_

# Score the tuned Ridge model on both splits
y_train_pred = ridge_best.predict(X_train)
y_test_pred = ridge_best.predict(X_test)

ridge_train_mse = mean_squared_error(y_train, y_train_pred)
ridge_test_mse = mean_squared_error(y_test, y_test_pred)

print("Ridge Regression")
print("Training MSE:", ridge_train_mse)
print("Test MSE:", ridge_test_mse)

LASSO REGRESSION (L1) WITH GRIDSEARCHCV

In [None]:
# Candidate L1 penalty strengths
lasso_params = {"alpha": [0.001, 0.01, 0.1, 1, 10]}

# max_iter is set explicitly to 5000 for the solver
lasso = Lasso(max_iter=5000)

# 5-fold cross-validated search scored by negated MSE (higher is better)
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    scoring="neg_mean_squared_error",
    cv=5,
)
lasso_grid.fit(X_train, y_train)

print("Best Lasso alpha:", lasso_grid.best_params_)

In [None]:
# Estimator selected by the grid search (already fitted by GridSearchCV)
lasso_best = lasso_grid.best_estimator_

# Score the tuned Lasso model on both splits
y_train_pred = lasso_best.predict(X_train)
y_test_pred = lasso_best.predict(X_test)

lasso_train_mse = mean_squared_error(y_train, y_train_pred)
lasso_test_mse = mean_squared_error(y_test, y_test_pred)

print("Lasso Regression")
print("Training MSE:", lasso_train_mse)
print("Test MSE:", lasso_test_mse)

L1 vs L2 COEFFICIENT COMPARISON

In [None]:
# Print each tuned model's coefficient vector under its own heading so the
# L2 and L1 results can be compared side by side.
for heading, estimator in (
    ("Ridge Coefficients (L2):", ridge_best),
    ("\nLasso Coefficients (L1 – sparse):", lasso_best),
):
    print(heading)
    print(estimator.coef_)

Conclusion:   
Linear Regression serves as a baseline without regularization.

Ridge (L2) reduces overfitting by shrinking coefficients but keeps all features.

Lasso (L1) can perform feature selection: with a sufficiently large alpha it drives some coefficients exactly to zero.

Regularization reduces variance at the cost of some bias, which improves generalization when the unregularized model overfits.

Excessive regularization can increase bias and cause underfitting.