In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
# Load the dataset from a CSV file
Hitters = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ISLR/Hitters.csv')
Hitters.drop(['rownames'], axis=1, inplace=True)
df = Hitters

# Remove rows with missing values
df = df.dropna()

# Prepare the data for regression
X = pd.get_dummies(df.drop('Salary', axis=1), drop_first=True)
y = df['Salary']

In [4]:
# Split the data into training (70%) and test (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Perform cross-validation for Ridge regression on training data using scikit-learn
alphas = np.logspace(-6, 6, 200)
ridge_cv = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv.fit(X_train, y_train)

# Extract the best alpha value
best_alpha_ridge = ridge_cv.alpha_

# Fit the Ridge model using the best alpha on training data
ridge_model = RidgeCV(alphas=[best_alpha_ridge])
ridge_model.fit(X_train, y_train)

# Print the coefficients of the Ridge model
print("Ridge Coefficients (Best Alpha):", ridge_model.coef_)

TypeError: _BaseRidgeCV.__init__() got an unexpected keyword argument 'store_cv_values'

In [None]:
# Plotting the coefficients as a function of alpha for Ridge

plt.plot(np.log10(alphas), ridge_cv.cv_values_.mean(axis=0), label='Cross-Validation Error')
plt.axvline(np.log10(best_alpha_ridge), linestyle='--', color='r', label='Best Alpha')
plt.xlabel('Log Alpha')
plt.ylabel('Mean Squared Error')
plt.title('Ridge Path: Coefficients vs. Log Alpha')
plt.legend()

In [None]:
# Evaluate the Ridge model on the training data
ridge_predictions_train = ridge_model.predict(X_train)
ridge_mse_train = mean_squared_error(y_train, ridge_predictions_train)
ridge_rmse_train = np.sqrt(ridge_mse_train)

print("Ridge Training MSE (Best Alpha):", ridge_mse_train)
print("Ridge Training RMSE (Best Alpha):", ridge_rmse_train)

# Evaluate the Ridge model on the test data
ridge_predictions_test = ridge_model.predict(X_test)
ridge_mse_test = mean_squared_error(y_test, ridge_predictions_test)
ridge_rmse_test = np.sqrt(ridge_mse_test)

print("Ridge Test MSE (Best Alpha):", ridge_mse_test)
print("Ridge Test RMSE (Best Alpha):", ridge_rmse_test)

In [None]:
# Perform cross-validation for Lasso regression on training data using scikit-learn
lasso_cv = LassoCV(alphas=alphas, cv=10, random_state=123)
lasso_cv.fit(X_train, y_train)

# Extract the best alpha value
best_alpha_lasso = lasso_cv.alpha_

# Fit the Lasso model using the best alpha on training data
lasso_model = LassoCV(alphas=[best_alpha_lasso], cv=10)
lasso_model.fit(X_train, y_train)

# Print the coefficients of the Lasso model
print("Lasso Coefficients (Best Alpha):", lasso_model.coef_)

In [None]:
# Evaluate the Lasso model on the training data
lasso_predictions_train = lasso_model.predict(X_train)
lasso_mse_train = mean_squared_error(y_train, lasso_predictions_train)
lasso_rmse_train = np.sqrt(lasso_mse_train)

print("Lasso Training MSE (Best Alpha):", lasso_mse_train)
print("Lasso Training RMSE (Best Alpha):", lasso_rmse_train)

# Evaluate the Lasso model on the test data
lasso_predictions_test = lasso_model.predict(X_test)
lasso_mse_test = mean_squared_error(y_test, lasso_predictions_test)
lasso_rmse_test = np.sqrt(lasso_mse_test)

print("Lasso Test MSE (Best Alpha):", lasso_mse_test)
print("Lasso Test RMSE (Best Alpha):", lasso_rmse_test)

In [None]:
# Plotting the coefficients as a function of alpha for Lasso
plt.plot(np.log10(lasso_cv.alphas_), lasso_cv.mse_path_.mean(axis=1), label='Cross-Validation Error')
plt.axvline(np.log10(best_alpha_lasso), linestyle='--', color='r', label='Best Alpha')
plt.xlabel('Log Alpha')
plt.ylabel('Mean Squared Error')
plt.title('Lasso Path: Coefficients vs. Log Alpha')
plt.legend()

plt.tight_layout()
plt.show()