In [None]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Take a copy of `ln_transform` and use it as the working dataset `toyota`.
# NOTE(review): `ln_transform` is defined outside this cell; presumably the
# log-transformed dataset containing a "Price" column — confirm.
toyota = ln_transform.copy()

def lasso_selection(X_train, X_test, y_train, y_test, min_alpha=0, max_alpha=4, steps=100):
    """Sweep Lasso regularisation strengths and report the features zeroed out.

    A Lasso model is fitted on the training split for every ``alpha`` in
    ``np.logspace(min_alpha, max_alpha, steps)`` and scored on the test split
    with RMSE and R2. The coefficient paths and both score curves are plotted,
    and the absolute coefficients at the alpha with the lowest test RMSE are
    printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. The column labels of ``X_train`` name the
        coefficients.
    y_train, y_test : array-like
        Targets for the two splits.
    min_alpha, max_alpha : float
        Base-10 exponents of the smallest and largest alpha tried.
    steps : int
        Number of alpha values generated between the two exponents.

    Returns
    -------
    list
        Column labels of ``X_train`` whose coefficient is exactly zero at the
        alpha with the lowest test RMSE.
    """
    clf = Lasso()

    # Generate values for `alpha` that are evenly distributed on a logarithmic scale
    alphas = np.logspace(min_alpha, max_alpha, steps)
    coefs = []
    rmse_list = []
    r2_list = []

    # Train the model with different regularisation strengths
    for a in alphas:
        clf.set_params(alpha=a).fit(X_train, y_train)
        coefs.append(clf.coef_)
        y_pred = clf.predict(X_test)
        rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        r2_list.append(r2_score(y_test, y_pred))

    alphas = pd.Index(alphas, name="alpha")
    # Keep the original column labels (not their string form) so the returned
    # names can be passed straight back to `DataFrame.drop(columns=...)`.
    coefs = pd.DataFrame(coefs, index=alphas, columns=X_train.columns)
    coefs_plot(coefs)
    rmse_plot(alphas, rmse_list)
    r2_plot(alphas, r2_list)
    ordered_coefs = print_coefs(coefs, rmse_list)
    # Features the best model did not use at all.
    worst_coefs = ordered_coefs[ordered_coefs == 0].index.tolist()
    return worst_coefs

def coefs_plot(coefs):
    """Draw the coefficient table as a seaborn line plot with a log x-axis.

    ``coefs`` is handed to ``sns.lineplot`` as ``data``; the caller in this
    notebook indexes it by alpha with one column per feature.
    """
    axes = sns.lineplot(data=coefs)
    axes.set_xscale("log")
    plt.show()

def rmse_plot(alphas, rmse_list):
    """Plot RMSE against alpha on a log x-axis and mark the lowest RMSE.

    Parameters
    ----------
    alphas : indexable sequence of float
        Regularisation strengths, in the same order as ``rmse_list``.
    rmse_list : sequence of float
        RMSE obtained for each alpha.
    """
    ax = sns.lineplot(x=alphas, y=rmse_list)
    ax.set_xscale("log")
    ax.set_ylabel("RMSE")

    # Locate the smallest RMSE and highlight it with a red dot.
    best = np.argmin(rmse_list)
    best_alpha, best_rmse = alphas[best], rmse_list[best]
    ax.plot(best_alpha, best_rmse, 'ro', label=f'Min RMSE: {best_rmse:.2f}\nAlpha: {best_alpha:.2e}')
    ax.legend()
    plt.show()

def r2_plot(alphas, r2_list):
    """Plot R2 against alpha on a log x-axis and mark the highest R2.

    Parameters
    ----------
    alphas : indexable sequence of float
        Regularisation strengths, in the same order as ``r2_list``.
    r2_list : sequence of float
        R2 score obtained for each alpha.
    """
    sns.lineplot(x=alphas, y=r2_list)
    plt.xscale("log")
    plt.ylabel("R2")
    # Higher R2 is better, so the point of interest is the maximum; the
    # legend must say "Max", not "Min".
    max_r2_idx = np.argmax(r2_list)
    best_alpha = alphas[max_r2_idx]
    max_r2 = r2_list[max_r2_idx]
    plt.plot(best_alpha, max_r2, 'ro', label=f'Max R2: {max_r2:.2f}\nAlpha: {best_alpha:.2e}')
    plt.legend()
    plt.show()

def print_coefs(coefs, mse_list):
    """Print and return the absolute coefficients of the lowest-error row.

    Parameters
    ----------
    coefs : pandas.DataFrame
        One row of coefficients per fitted model, one column per feature.
    mse_list : sequence of float
        Error of each model, aligned with the rows of ``coefs``. The caller in
        this notebook passes RMSE values; the position of the minimum is what
        matters here.

    Returns
    -------
    pandas.Series
        Absolute coefficients of the row with the smallest error, sorted from
        largest to smallest.
    """
    best_row = np.argmin(mse_list)
    magnitudes = coefs.iloc[best_row].abs()
    ranked = magnitudes.sort_values(ascending=False)
    print(ranked)
    return ranked

Vamos a hacer la primera ejecución.

In [None]:
# First run: every column except the target is a candidate feature.
X = toyota.drop(columns="Price")
y = toyota["Price"]

# Fixed 70/30 split so repeated runs score on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Sweep alpha from 1e-10 to 1e10 over 200 log-spaced values.
lasso_selection(X_train, X_test, y_train, y_test, min_alpha=-10, max_alpha=10, steps=200)

Queremos iterar múltiples veces eliminando las peores variables en cada una. Vamos a eliminar en cada iteración las peores cinco variables y corroborar los mejores resultados.

In [None]:
def iterate_lasso_selection(df, min_alpha=-10, max_alpha=10, steps=200, target="Price"):
    """Repeat Lasso selection, dropping zeroed-out features until none remain.

    Each round splits the data 70/30 with a fixed seed, runs
    ``lasso_selection`` and removes every feature it reports as having a zero
    coefficient. The loop ends when a round reports no such feature, or when
    no feature columns are left to fit on.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the feature columns and the target column.
    min_alpha, max_alpha : float
        Base-10 exponents of the alpha range, forwarded to ``lasso_selection``.
    steps : int
        Number of alpha values, forwarded to ``lasso_selection``.
    target : str, default "Price"
        Name of the column to predict.

    Returns
    -------
    list
        Labels of the feature columns that survived the elimination.
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Stop once every feature has been removed: fitting on zero columns fails.
    while X.shape[1] > 0:
        # Same seed and same rows each round, so only the column set changes.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        worst_coefs = lasso_selection(X_train, X_test, y_train, y_test, min_alpha, max_alpha, steps)
        if not worst_coefs:
            break
        X = X.drop(columns=worst_coefs)

    return X.columns.tolist()

# Run the iterative selection on `toyota`; steps=100 overrides the function's
# default of 200 alpha values.
iterate_lasso_selection(toyota, min_alpha=-10, max_alpha=10, steps=100)