<a href="https://colab.research.google.com/github/Rayers-Ranjitkar/CI_CD_Demo/blob/main/Worksheet_7_Regularization_Rayers_Ranjitkar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [117]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Dataset/housing.csv')
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


# **Task 1**

In [118]:
#Splitting the model

X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# X.head()
X = pd.get_dummies(X, drop_first=True) #one-hot-encoding to make ocean_promixity column values a number

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Baseline Model:

In [119]:
# Baseline Linear Regression (no regularization)

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

baseline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),  # imputer fills NaNs with column mean as LinearRegression cannot handle NaNs
    ("model", LinearRegression())  # Trains a linear regression model on the data AFTER NaNs are filled
])

baseline.fit(X_train, y_train) # Fits the model using the baseline that has the pipeline

train_pred = baseline.predict(X_train)
test_pred  = baseline.predict(X_test)

train_mse = mean_squared_error(y_train, train_pred)
test_mse  = mean_squared_error(y_test, test_pred)

print("Baseline Linear Regression (with imputer)")
print("Train MSE:", train_mse)
print("Test  MSE:", test_mse)

# coefficients come from the model inside the pipeline
print("Coefficients:", baseline.named_steps["model"].coef_)


# Interpretation of MSE:
# The training MSE is slightly lower than the test MSE.
# This indicates that the model performs better on the data it was trained on
# compared to unseen data suggesting that the model may be overfitting.
# Moreover, The relatively large coefficients suggests that the model is sensitive to certain features and higher test MSE indicate that the model may overfit the training data.

Baseline Linear Regression (with imputer)
Train MSE: 4683203783.504253
Test  MSE: 4904409297.414918
Coefficients: [-2.68382734e+04 -2.54683520e+04  1.10218508e+03 -6.02150567e+00
  1.02789395e+02 -3.81729064e+01  4.82527528e+01  3.94739752e+04
 -3.97866562e+04  1.36125073e+05 -5.13664222e+03  3.43114007e+03]


Hyper-Paramter Tuning and Regularization:


In [120]:
# Step 2: Hyperparameter Tuning

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Defining a grid of alpha values
# alpha controls how strong regularization is:
alphas = np.logspace(-4, 2, 20)  # 10^-4 to 10^2(20 values)

# RIDGE (L2 regularization)

ridge_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),   # fills NaNs
    ("scaler", StandardScaler()),                  # scales features for fair regularization
    ("model", Ridge())
])

ridge_grid = GridSearchCV(
    ridge_pipe,
    param_grid={"model__alpha": alphas}, # - tries all alpha values and selects alpha that gives the best average CV performance
    cv=5, #5-folds cross validation
    scoring="neg_mean_squared_error"  # the best model is the one with the highest (least negative) neg-MSE, which corresponds to the lowest MSE.
)

ridge_grid.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_grid.best_params_["model__alpha"])
print("Best Ridge CV neg-MSE:", ridge_grid.best_score_)

# Evaluate best Ridge model on test set
best_ridge = ridge_grid.best_estimator_

ridge_train_mse = mean_squared_error(y_train, best_ridge.predict(X_train))
ridge_test_mse  = mean_squared_error(y_test,  best_ridge.predict(X_test))

print("\nRidge Results")
print("Train MSE:", ridge_train_mse)
print("Test  MSE:", ridge_test_mse)

ridge_coef = best_ridge.named_steps["model"].coef_
print("First 10 Ridge coefficients:", ridge_coef[:10])
# L2 has shrinked coefficients but rarely makes them exactly 0. It does not perform feature selectio and All features remain in the model.
# Positive coefficient → feature increases prediction

Best Ridge alpha: 11.288378916846883
Best Ridge CV neg-MSE: -4710741160.328094

Ridge Results
Train MSE: 4683326738.187337
Test  MSE: 4900076762.299355
First 10 Ridge coefficients: [-52583.18569496 -53124.74647825  13896.27181038 -12700.4299197
  42091.02958094 -43247.89499348  18837.18843812  75054.78928764
 -18865.50886112   2127.27334362]


In [121]:
# LASSO (L1 regularization)


lasso_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("model", Lasso(max_iter=50000))
])

lasso_grid = GridSearchCV(
    lasso_pipe,
    param_grid={"model__alpha": alphas},
    cv=5,
    scoring="neg_mean_squared_error"
)

lasso_grid.fit(X_train, y_train)

print("\nBest Lasso alpha:", lasso_grid.best_params_["model__alpha"])
print("Best Lasso CV neg-MSE:", lasso_grid.best_score_)

# Evaluate best Lasso model on test set
best_lasso = lasso_grid.best_estimator_

lasso_train_mse = mean_squared_error(y_train, best_lasso.predict(X_train))
lasso_test_mse  = mean_squared_error(y_test,  best_lasso.predict(X_test))

print("\nLasso Results")
print("Train MSE:", lasso_train_mse)
print("Test  MSE:", lasso_test_mse)

# Observe coefficients (L1 can set some coefficients exactly to 0 -> feature selection)
lasso_coef = best_lasso.named_steps["model"].coef_
print("First 10 Lasso coefficients:", lasso_coef[:20])


Best Lasso alpha: 23.357214690901213
Best Lasso CV neg-MSE: -4711053969.781619

Lasso Results
Train MSE: 4683266248.712713
Test  MSE: 4902411232.053286
First 10 Lasso coefficients: [-53151.16767293 -53737.25044649  13874.29111623 -12604.10922936
  42537.7135272  -43232.02067988  18272.97738518  75052.44351367
 -18706.9794461    2099.95250623  -1545.01658692   1164.6138847 ]


In [122]:
print("Test MSE Comparison (lower is better)")
print("Ridge Test MSE:", ridge_test_mse)
print("Lasso Test MSE:", lasso_test_mse)

Test MSE Comparison (lower is better)
Ridge Test MSE: 4900076762.299355
Lasso Test MSE: 4902411232.053286


In [123]:
# MSE comparison:
# Both Ridge and Lasso models show similar training and test MSE values.
# Compared to the baseline model, the gap between training and test MSE
# is reduced, indicating improved generalization.
# This suggests that regularization helps control overfitting.


In [124]:
# (a) How L1 / L2 reduces variance and prevents overfitting

# By shrinking coefficient values, the model becomes less sensitive
# to noise in the training data, which helps prevent overfitting.
# Ridge reduces variance by shrinking all coefficients, while Lasso
# reduces variance by shrinking and sometimes removing features entirely.


In [125]:
# (b) How excessive regularization may increase bias and underfit

# If the regularization strength (alpha) is too large, the model
# becomes overly simple. Also, Important features may be overly penalized or removed, leading to higher bias and underfitting.
# This results in higher errors on both training and test datasets.

# **Task 2**

In [126]:
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


Baseline Model

In [127]:
# Baseline Logistic Regression (no explicit regularization tuning)

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

baseline_logreg = Pipeline([
    ("scaler", StandardScaler()), # 1) StandardScaler -> scales features (important for Logistic Regression)
    ("model", LogisticRegression(max_iter=10000))
])

# Training the model on the training set
baseline_logreg.fit(X_train, y_train)

# Observing coefficients of the logistic regression model
coefficients = baseline_logreg.named_steps["model"].coef_

print("Coefficient shape:", coefficients.shape)
print("First 10 coefficients:", coefficients[0][:30])

Coefficient shape: (1, 30)
First 10 coefficients: [-0.43190368 -0.38732553 -0.39343248 -0.46521006 -0.07166728  0.54016395
 -0.8014581  -1.11980408  0.23611852  0.07592093 -1.26817815  0.18887738
 -0.61058302 -0.9071857  -0.31330675  0.68249145  0.17527452 -0.3112999
  0.50042502  0.61622993 -0.87984024 -1.35060559 -0.58945273 -0.84184594
 -0.54416967  0.01611019 -0.94305313 -0.77821726 -1.20820031 -0.15741387]


In [128]:
# Observation of coefficients:
# The coefficients indicate the influence of each feature on the predicted class.
# A positive coefficient increases the probability of the positive class,
# while a negative coefficient decreases it.
# Large coefficient values indicate features with stronger influence.


In [129]:
# Predictions
train_pred = baseline_logreg.predict(X_train)
test_pred  = baseline_logreg.predict(X_test)

# Accuracy scores
train_acc = accuracy_score(y_train, train_pred)
test_acc  = accuracy_score(y_test, test_pred)

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

Training Accuracy: 0.9868131868131869
Test Accuracy: 0.9736842105263158


In [130]:
# Training accuracy is much higher than test accuracy, indicating overfitting.

Hyperparamter Tuning

In [108]:
# Hyperparameter Tuning using GridSearchCV (Logistic Regression)

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Defining grid of hyperparameters
C_values = np.logspace(-4, 4, 20)  # 10^-4 to 10^2(20 values) # - small C  -> strong regularization # - large C  -> weak regularization

param_grid = {
    "model__C": C_values, # Regularization strength values to try during grid search
    "model__penalty": ["l1", "l2"] # Type of regularizations
}

logreg_pipe = Pipeline([
    ("scaler", StandardScaler()), # Standardizes features of each column by calculating each mean and sd deviation [(x-mean)/std ]
    ("model", LogisticRegression(
        solver="liblinear",  # algorithmn that supports both l1 and l2
        max_iter=10000
    ))
])

# 2) GridSearchCV with cross-validation on training set
logreg_grid = GridSearchCV(
    logreg_pipe,
    param_grid=param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring="accuracy"
)

logreg_grid.fit(X_train, y_train)

print("Best hyperparameters:", logreg_grid.best_params_) #?
print("Best CV accuracy:", logreg_grid.best_score_)

# 3) Evaluating best model on test set
best_logreg = logreg_grid.best_estimator_

test_pred = best_logreg.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("Test Accuracy:", test_acc)


Best hyperparameters: {'model__C': np.float64(0.08858667904100823), 'model__penalty': 'l2'}
Best CV accuracy: 0.9780219780219781
Test Accuracy: 0.9912280701754386


In [109]:
# After hyperparameter tuning using GridSearchCV, the test accuracy improved
# from the baseline model i.e. from 97% to 99%. The optimized logistic regression model achieved
# higher accuracy on the test set, indicating better generalization to unseen data.

Regularization

In [110]:
# Regularization Experiments (L1 vs L2)

#Train L2 (Ridge-like)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np

# Using optimal C from GridSearchCV
best_C = 0.08858667904100823

# L2 Regularization (Ridge-like)
l2_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        C=best_C,
        penalty="l2",
        solver="liblinear",
        max_iter=10000
    ))
])

# Train L2 model
l2_model.fit(X_train, y_train)

# Predictions
l2_train_pred = l2_model.predict(X_train)
l2_test_pred  = l2_model.predict(X_test)

# Accuracy
l2_train_acc = accuracy_score(y_train, l2_train_pred)
l2_test_acc  = accuracy_score(y_test, l2_test_pred)

print("L2 Train Accuracy:", l2_train_acc)
print("L2 Test Accuracy:", l2_test_acc)

# Coefficients
l2_coef = l2_model.named_steps["model"].coef_[0]
print("First 10 L2 coefficients:", l2_coef[:10])


L2 Train Accuracy: 0.9824175824175824
L2 Test Accuracy: 0.9912280701754386
First 10 L2 coefficients: [-0.35044312 -0.38779711 -0.34126498 -0.36576375 -0.12469509  0.01131365
 -0.36227677 -0.46083554 -0.03397208  0.15606202]


In [111]:
# Observation (L2 Regularization):
# L2 regularization shrinks all coefficients toward zero but rarely makes them
# exactly zero. All features remain in the model, but their influence is reduced.
# This helps control model complexity and reduce overfitting.

In [112]:
# L1 Regularization (Lasso-like)
l1_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        C=best_C,
        penalty="l1",
        solver="liblinear",
        max_iter=10000
    ))
])

# Train L1 model
l1_model.fit(X_train, y_train)

# Predictions
l1_train_pred = l1_model.predict(X_train)
l1_test_pred  = l1_model.predict(X_test)

# Accuracy
l1_train_acc = accuracy_score(y_train, l1_train_pred)
l1_test_acc  = accuracy_score(y_test, l1_test_pred)

print("\nL1 Train Accuracy:", l1_train_acc)
print("L1 Test Accuracy:", l1_test_acc)

# Coefficients
l1_coef = l1_model.named_steps["model"].coef_[0]
print("First 10 L1 coefficients:", l1_coef[:10])

# Count zero coefficients (sparsity)
print("Number of zero coefficients (L1):",
      np.sum(l1_coef == 0), "out of", len(l1_coef))



L1 Train Accuracy: 0.9802197802197802
L1 Test Accuracy: 0.9649122807017544
First 10 L1 coefficients: [ 0.          0.          0.          0.          0.          0.
  0.         -0.99384906  0.          0.        ]
Number of zero coefficients (L1): 22 out of 30


In [113]:
# Observation (L1 Regularization):
# L1 regularization encourages sparsity by pushing some coefficients exactly to zero.
# This effectively performs feature selection by removing less important features.
# Compared to L2, L1 produces a simpler and more interpretable model.

In [114]:
print("Accuracy Comparison")
print("L2 -> Train:", l2_train_acc, " Test:", l2_test_acc)
print("L1 -> Train:", l1_train_acc, " Test:", l1_test_acc)

Accuracy Comparison
L2 -> Train: 0.9824175824175824  Test: 0.9912280701754386
L1 -> Train: 0.9802197802197802  Test: 0.9649122807017544


In [115]:
# Accuracy comparison:
# L2 regularization achieves higher test accuracy compared to L1.
# This indicates better generalization when all features are retained
# with reduced influence.
# L1 regularization leads to lower test accuracy since, L1 tries to push coefficients to exactly zero which suggesting that
# removing some features increases bias and causes slight underfitting.


In [116]:
# Observation:
# L1 and L2 regularization both reduce variance by limiting model complexity.
# However, L1 enforces sparsity by removing features, which can increase bias if important features are discarded.
# In this experiment, L2 provides a better bias–variance balance, while L1 introduces higher bias and reduces test accuracy
# which indicates that most features are informative and should not be removed.