In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
# 1. Load dataset
# Fetch version 1 of the OpenML dataset named "california_housing".
# return_X_y=True yields a (features, target) pair; as_frame=True requests
# pandas objects rather than arrays.
X, y = fetch_openml(
    "california_housing",
    version=1,
    parser="pandas",  # Ensures consistent dataframe handling
    return_X_y=True,
    as_frame=True,
)


In [4]:
# 2. Preprocessing
# Drop the categorical "ocean_proximity" column so it is not passed to the
# models fitted later in the notebook.
# errors="ignore" keeps this cell idempotent: X is rebound to the result, so
# without it a second run of the cell raises KeyError because the column has
# already been removed.
X = X.drop(columns=["ocean_proximity"], errors="ignore")

In [5]:
# 3. Train-test split
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

def create_pipeline(model):
    """Wrap ``model`` in a three-step preprocessing pipeline.

    The returned Pipeline applies, in order:
      * 'imputer'   - SimpleImputer with the median strategy (handles NaNs),
      * 'scaler'    - StandardScaler (scales data for regularization),
      * 'regressor' - the ``model`` passed in, unchanged.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("regressor", model),
    ]
    return Pipeline(steps)

In [6]:
# 4. Baseline Linear Regression
# Unregularized reference model: fit on the training split, then report the
# mean squared error on the held-out split.
lr_pipe = create_pipeline(LinearRegression())
lr_pipe.fit(X_train, y_train)
baseline_predictions = lr_pipe.predict(X_test)
baseline_mse = mean_squared_error(y_test, baseline_predictions)
print(f"Baseline Test MSE: {baseline_mse:.2f}")

Baseline Test MSE: 5059656033.13


In [7]:
# 5. Ridge with Cross-Validation
# 5-fold grid search over the Ridge penalty strength. Candidates are scored
# with negated MSE, so a higher score means a lower error.
ridge_pipe = create_pipeline(Ridge())
ridge_alpha_grid = {"regressor__alpha": [0.1, 1, 10, 100]}
ridge_cv = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_alpha_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
ridge_cv.fit(X_train, y_train)
best_ridge_alpha = ridge_cv.best_params_["regressor__alpha"]
print(f"Best Ridge alpha: {best_ridge_alpha}")

Best Ridge alpha: 10


In [8]:
# 6. Lasso with Cross-Validation
# 5-fold grid search over the Lasso (L1) penalty strength, scored with negated
# MSE. max_iter=10000 is passed to the solver, presumably to avoid convergence
# warnings - confirm.
# NOTE(review): the recorded run left every coefficient non-zero; the alpha
# grid may be too small relative to the target's scale to induce sparsity.
# Consider widening it - confirm.
lasso_pipe = create_pipeline(Lasso(max_iter=10000))
lasso_alpha_grid = {"regressor__alpha": [0.1, 1, 10, 100]}
lasso_cv = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=lasso_alpha_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
lasso_cv.fit(X_train, y_train)

# Pull the fitted Lasso step out of the winning pipeline to inspect its weights.
best_lasso = lasso_cv.best_estimator_.named_steps["regressor"]
best_lasso_alpha = lasso_cv.best_params_["regressor__alpha"]
zero_coef_count = np.sum(best_lasso.coef_ == 0)
print(f"Best Lasso alpha: {best_lasso_alpha}")
print(f"Zero coefficients: {zero_coef_count}")

Best Lasso alpha: 10
Zero coefficients: 0


In [9]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [10]:
# 1. Load data
# NOTE: X, y and the four train/test names are rebound here, replacing the
# housing data used by the regression cells earlier in the notebook.
X, y = load_breast_cancer(return_X_y=True)
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
def create_log_pipeline(penalty='l2', solver='lbfgs', C=1.0):
    """Build a scaler + logistic-regression pipeline.

    ``penalty``, ``solver`` and ``C`` are forwarded unchanged to
    LogisticRegression, which is always created with max_iter=10000.
    The pipeline has two steps: 'scaler' (StandardScaler) followed by
    'classifier' (the LogisticRegression instance).
    """
    classifier = LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        max_iter=10000,
    )
    return Pipeline([("scaler", StandardScaler()), ("classifier", classifier)])

In [11]:

# 2. Baseline Logistic Regression (No Penalty)
# Reference classifier with regularization switched off (penalty=None).
log_reg = create_log_pipeline(penalty=None)
log_reg.fit(X_train, y_train)
baseline_accuracy = accuracy_score(y_test, log_reg.predict(X_test))
print(f"Baseline Accuracy: {baseline_accuracy:.4f}")

Baseline Accuracy: 0.9386


In [12]:
# 3. L1 vs L2 Regularization
# Both classifiers use C=0.5; the L1 model is fitted with the liblinear
# solver and the L2 model with lbfgs.
log_l1 = create_log_pipeline(penalty="l1", solver="liblinear", C=0.5)
log_l1.fit(X_train, y_train)

log_l2 = create_log_pipeline(penalty="l2", solver="lbfgs", C=0.5)
log_l2.fit(X_train, y_train)

# Score both on the held-out split before printing them side by side.
l1_accuracy = accuracy_score(y_test, log_l1.predict(X_test))
l2_accuracy = accuracy_score(y_test, log_l2.predict(X_test))
print(f"L1 Accuracy:       {l1_accuracy:.4f}")
print(f"L2 Accuracy:       {l2_accuracy:.4f}")

L1 Accuracy:       0.9737
L2 Accuracy:       0.9737


In [13]:
# 4. Compare Sparsity (Feature Selection)
# Read the fitted coefficient arrays from each pipeline's 'classifier' step
# and count how many entries are exactly zero.
l1_coefs = log_l1.named_steps["classifier"].coef_
l2_coefs = log_l2.named_steps["classifier"].coef_

l1_zero_count = np.sum(l1_coefs == 0)
l2_zero_count = np.sum(l2_coefs == 0)

print(f"\nL1 Zero Coefficients: {l1_zero_count} out of {l1_coefs.size}")
print(f"L2 Zero Coefficients: {l2_zero_count} out of {l2_coefs.size}")


L1 Zero Coefficients: 15 out of 30
L2 Zero Coefficients: 0 out of 30
