# Worksheet-7

Part 1: Regression Task – California Housing Dataset

Task 1: Load and Split Dataset (80% for training and 20% for test)

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
data = pd.read_csv(url)

# Target variable
y = data["median_house_value"]

# Feature matrix
X = data.drop(columns=["median_house_value"])

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape)
print("Test samples:", X_test.shape)

Training samples: (16512, 9)
Test samples: (4128, 9)


### Preprocessing
Missing values were handled using SimpleImputer as LinearRegression does not support NaNs.


In [102]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

categorical_features = ["ocean_proximity"]
numeric_features = X.drop(columns=categorical_features).columns

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

Task 2 – Step 1: Baseline Linear Regression (No Regularization)

In [103]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline model
linear_model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", LinearRegression())
])

# Train model
linear_model.fit(X_train, y_train)

# Predictions
y_train_pred = linear_model.predict(X_train)
y_test_pred = linear_model.predict(X_test)

# MSE
print("Training MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))


linear_model.named_steps["model"].coef_

Training MSE: 4683203783.504253
Test MSE: 4908290571.346397


array([-2.68382734e+04, -2.54683520e+04,  1.10218508e+03, -6.02150567e+00,
        1.02789395e+02, -3.81729064e+01,  4.82527528e+01,  3.94739752e+04,
       -3.97866562e+04,  1.36125073e+05, -5.13664222e+03,  3.43114007e+03])

Step 2: Hyperparameter Tuning (Ridge & Lasso)

Ridge Regression (L2)

In [104]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", Ridge())
])

param_grid = {
    "model__alpha": [0.1, 1, 10, 100]
}

ridge_cv = GridSearchCV(
    ridge_pipeline,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error"
)

ridge_cv.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_cv.best_params_)

ridge_best = ridge_cv.best_estimator_

print("Ridge Training MSE:",
      mean_squared_error(y_train, ridge_best.predict(X_train)))
print("Ridge Test MSE:",
      mean_squared_error(y_test, ridge_best.predict(X_test)))

Best Ridge alpha: {'model__alpha': 1}
Ridge Training MSE: 4683383574.687478
Ridge Test MSE: 4909851273.941725


Lasso Regression (L1)

In [105]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", Lasso(max_iter=5000))
])

param_grid = {
    "model__alpha": [0.001, 0.01, 0.1, 1]
}

lasso_cv = GridSearchCV(
    lasso_pipeline,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error"
)

lasso_cv.fit(X_train, y_train)

print("Best Lasso alpha:", lasso_cv.best_params_)

lasso_best = lasso_cv.best_estimator_

print("Lasso Training MSE:",
      mean_squared_error(y_train, lasso_best.predict(X_train)))
print("Lasso Test MSE:",
      mean_squared_error(y_test, lasso_best.predict(X_test)))

Best Lasso alpha: {'model__alpha': 0.001}
Lasso Training MSE: 4683203783.508414
Lasso Test MSE: 4908290765.821696


Step 3: Regularization Comparison (L1 vs L2)

In [106]:
import numpy as np

ridge_coeffs = ridge_best.named_steps["model"].coef_
lasso_coeffs = lasso_best.named_steps["model"].coef_

print("Zero coefficients (Ridge):", np.sum(ridge_coeffs == 0))
print("Zero coefficients (Lasso):", np.sum(lasso_coeffs == 0))

Zero coefficients (Ridge): 0
Zero coefficients (Lasso): 0


Part 2: Classification Task – Breast Cancer Dataset

Task 1: Load and Split Dataset

In [107]:
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape)
print("Test samples:", X_test.shape)

Training samples: (455, 30)
Test samples: (114, 30)


Step 1: Baseline Logistic Regression

In [108]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

print("Training Accuracy:",
      accuracy_score(y_train, log_reg.predict(X_train)))
print("Test Accuracy:",
      accuracy_score(y_test, log_reg.predict(X_test)))

log_reg.coef_

Training Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193


array([[ 1.0274368 ,  0.22145051, -0.36213488,  0.0254667 , -0.15623532,
        -0.23771256, -0.53255786, -0.28369224, -0.22668189, -0.03649446,
        -0.09710208,  1.3705667 , -0.18140942, -0.08719575, -0.02245523,
         0.04736092, -0.04294784, -0.03240188, -0.03473732,  0.01160522,
         0.11165329, -0.50887722, -0.01555395, -0.016857  , -0.30773117,
        -0.77270908, -1.42859535, -0.51092923, -0.74689363, -0.10094404]])

Step 2: Hyperparameter Tuning (L1 & L2)

In [109]:
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}

log_reg_cv = GridSearchCV(
    LogisticRegression(max_iter=5000),
    param_grid,
    cv=5,
    scoring="accuracy"
)

log_reg_cv.fit(X_train, y_train)

print("Best parameters:", log_reg_cv.best_params_)



best_log_reg = log_reg_cv.best_estimator_

print("Training Accuracy:",
      accuracy_score(y_train, best_log_reg.predict(X_train)))
print("Test Accuracy:",
      accuracy_score(y_test, best_log_reg.predict(X_test)))

Best parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Training Accuracy: 0.9692307692307692
Test Accuracy: 0.956140350877193


Step 3: L1 vs L2 Comparison

In [110]:
log_l1 = LogisticRegression(
    penalty="l1",
    C=log_reg_cv.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l2 = LogisticRegression(
    penalty="l2",
    C=log_reg_cv.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l1.fit(X_train, y_train)
log_l2.fit(X_train, y_train)



print("L1 Training Accuracy:",
      accuracy_score(y_train, log_l1.predict(X_train)))
print("L1 Test Accuracy:",
      accuracy_score(y_test, log_l1.predict(X_test)))

print("L2 Training Accuracy:",
      accuracy_score(y_train, log_l2.predict(X_train)))
print("L2 Test Accuracy:",
      accuracy_score(y_test, log_l2.predict(X_test)))



import numpy as np
print("Zero coefficients (L1):", np.sum(log_l1.coef_ == 0))
print("Zero coefficients (L2):", np.sum(log_l2.coef_ == 0))


L1 Training Accuracy: 0.9824175824175824
L1 Test Accuracy: 0.9736842105263158
L2 Training Accuracy: 0.9692307692307692
L2 Test Accuracy: 0.956140350877193
Zero coefficients (L1): 14
Zero coefficients (L2): 0
