Title: Regression Models

Ridge & Lasso Regression (Regularization Techniques)


Task 1: Use Ridge regression on a dataset with multicollinearity to compare results with linear regression.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score

# --- Synthetic data: three features, the second a near-duplicate of the first ---
# Seeding the global NumPy RNG makes every draw below reproducible.
np.random.seed(0)
n_samples = 100

X1 = np.random.rand(n_samples)
# X2 is X1 plus very small Gaussian jitter (sd 0.01), so the two columns
# are almost perfectly collinear.
X2 = X1 + np.random.normal(0, 0.01, n_samples)
X3 = np.random.rand(n_samples)
X = np.column_stack([X1, X2, X3])

# --- Target: known linear combination of the features plus Gaussian noise ---
noise = np.random.normal(0, 0.1, n_samples)
y = 3*X1 + 2*X2 + 1.5*X3 + noise

# --- Fit both estimators on the full dataset ---
lin_reg = LinearRegression().fit(X, y)
ridge_reg = Ridge(alpha=1.0).fit(X, y)

y_pred_lin = lin_reg.predict(X)
y_pred_ridge = ridge_reg.predict(X)

# --- Report ---
# NOTE: both R² values are computed on the same data the models were fitted
# on (in-sample), so they describe goodness of fit, not generalization.
report = (
    ("Linear Regression Coefficients:", lin_reg.coef_),
    ("Ridge Regression Coefficients:", ridge_reg.coef_),
    ("R² Score - Linear Regression:", r2_score(y, y_pred_lin)),
    ("R² Score - Ridge Regression:", r2_score(y, y_pred_ridge)),
)
for label, value in report:
    print(label, value)


Task 2: Implement Lasso regression and observe its effect on feature selection in feature-rich data.

In [None]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

# --- Synthetic regression problem: 20 features, only 5 of them informative ---
# coef=True makes make_regression also return the ground-truth coefficients.
X, y, coef_true = make_regression(
    n_samples=100,
    n_features=20,
    n_informative=5,
    noise=0.1,
    coef=True,
    random_state=42,
)

# --- Fit Lasso; alpha sets the strength of the L1 penalty ---
lasso = Lasso(alpha=0.1).fit(X, y)

# --- Side-by-side printout of the true and estimated coefficients ---
print("True coefficients:\n", coef_true)
print("\nLasso coefficients:\n", lasso.coef_)

# --- Overlay both coefficient vectors, indexed by feature ---
fig, ax = plt.subplots(figsize=(10, 6))
series = (
    (coef_true, "True coefficients", 'o'),
    (lasso.coef_, "Lasso coefficients", 'x'),
)
for values, legend_label, marker_style in series:
    ax.plot(values, label=legend_label, marker=marker_style)
ax.set_xlabel("Feature index")
ax.set_ylabel("Coefficient value")
ax.set_title("Lasso Regression: Feature Selection Effect")
ax.legend()
ax.grid(True)
plt.show()



Task 3: Visualize the effect of regularization on model complexity and compare performance metrics.

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- Synthetic regression data (10 features, fairly noisy target) ---
X, y = make_regression(n_samples=100, n_features=10, noise=20, random_state=42)

# Hold out a test set (library-default split fraction) for scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 50 regularization strengths, log-spaced from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, 50)

# Per-alpha results, filled in the same order as `alphas`.
ridge_coefs, lasso_coefs = [], []
ridge_r2, lasso_r2 = [], []

for alpha in alphas:
    # Fit both penalized models on the training split at this strength.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train, y_train)

    # Record the learned coefficient vectors ...
    ridge_coefs.append(ridge.coef_)
    lasso_coefs.append(lasso.coef_)

    # ... and the R² each model achieves on the held-out split.
    ridge_r2.append(r2_score(y_test, ridge.predict(X_test)))
    lasso_r2.append(r2_score(y_test, lasso.predict(X_test)))

# Stack into 2-D arrays: one row per alpha, one column per feature.
ridge_coefs = np.vstack(ridge_coefs)
lasso_coefs = np.vstack(lasso_coefs)

# --- Coefficient paths: one panel per model, one line per feature ---
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (axes[0], ridge_coefs, 'Ridge Coefficients vs Regularization'),
    (axes[1], lasso_coefs, 'Lasso Coefficients vs Regularization'),
)
for ax, coef_paths, panel_title in panels:
    # Each column of coef_paths is one feature's coefficient across all alphas.
    for feature_idx, path in enumerate(coef_paths.T):
        ax.plot(alphas, path, label=f'Feature {feature_idx}')
    ax.set_xscale('log')
    ax.set_xlabel('Alpha (log scale)')
    ax.set_ylabel('Coefficient value')
    ax.set_title(panel_title)
    # Anchor the legend just outside the panel's upper-right corner.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
    ax.grid(True)

fig.tight_layout()
plt.show()

# --- Held-out R² as a function of regularization strength, both models ---
fig, ax = plt.subplots(figsize=(8, 5))
for scores, curve_label in ((ridge_r2, 'Ridge R²'), (lasso_r2, 'Lasso R²')):
    ax.plot(alphas, scores, label=curve_label)
ax.set_xscale('log')
ax.set_xlabel('Alpha (log scale)')
ax.set_ylabel('R² Score')
ax.set_title('Model Performance vs Regularization Strength')
ax.legend()
ax.grid(True)
plt.show()
