In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy so the experiments below operate on `toyota` rather than on `cut_outliers` itself.
# NOTE(review): `cut_outliers` is not defined in this cell -- presumably a DataFrame built by an
# earlier cell; confirm it exists after Restart Kernel -> Run All.
toyota = cut_outliers.copy()

def ridge_selection(X_train, X_test, y_train, y_test, min_alpha=0, max_alpha=4, steps=100, n_worst=5):
    """Sweep Ridge regularisation strengths and report the weakest features.

    One Ridge model is fitted per alpha on the training split and scored on
    the test split (RMSE and R2). The coefficient paths and both score curves
    are plotted, and the absolute coefficients of the lowest-RMSE model are
    printed in descending order.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices; the columns of ``X_train`` label the coefficients.
    y_train, y_test : array-like
        Target values for the two splits.
    min_alpha, max_alpha : float
        Base-10 *exponents* handed to ``np.logspace``: alpha is swept from
        ``10**min_alpha`` to ``10**max_alpha``.
    steps : int
        Number of alpha values in the sweep.
    n_worst : int
        How many of the smallest-magnitude coefficients to report.

    Returns
    -------
    list
        Names of the ``n_worst`` features whose absolute coefficient is
        smallest at the alpha with the lowest test RMSE.
    """
    # Log-spaced grid of regularisation strengths.
    alpha_grid = np.logspace(min_alpha, max_alpha, steps)
    model = Ridge()
    coef_rows, rmse_scores, r2_scores = [], [], []

    # Refit the same estimator for every alpha and record coefficients and test scores.
    for strength in alpha_grid:
        model.set_params(alpha=strength)
        model.fit(X_train, y_train)
        coef_rows.append(model.coef_)
        predictions = model.predict(X_test)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, predictions)))
        r2_scores.append(r2_score(y_test, predictions))

    # One row per alpha, one column per feature.
    alpha_index = pd.Index(alpha_grid, name="alpha")
    coef_table = pd.DataFrame(
        coef_rows,
        index=alpha_index,
        columns=[f"{column}" for column in X_train.columns],
    )

    coefs_plot(coef_table)
    rmse_plot(alpha_index, rmse_scores)
    r2_plot(alpha_index, r2_scores)

    # print_coefs sorts by |coefficient| descending, so the tail holds the weakest features.
    ranked = print_coefs(coef_table, rmse_scores)
    return ranked.tail(n_worst).index.tolist()

def coefs_plot(coefs):
    """Draw one line per column of ``coefs`` against its index on a log-scaled x-axis."""
    axes = sns.lineplot(data=coefs)
    # lineplot returns the axes it drew on; scale those directly.
    axes.set_xscale("log")
    plt.show()

def rmse_plot(alphas, rmse_list):
    """Plot RMSE against alpha (log x-axis) and mark the lowest-RMSE point.

    ``alphas`` and ``rmse_list`` are indexed by the same position; the red
    marker and legend entry report the minimum RMSE and the alpha it occurs at.
    """
    sns.lineplot(x=alphas, y=rmse_list)
    plt.xscale("log")
    plt.ylabel("RMSE")

    # Locate the best (lowest) RMSE and the alpha that produced it.
    best = np.argmin(rmse_list)
    best_alpha, best_rmse = alphas[best], rmse_list[best]
    marker_label = f'Min RMSE: {best_rmse:.2f}\nAlpha: {best_alpha:.2e}'

    plt.plot(best_alpha, best_rmse, 'ro', label=marker_label)
    plt.legend()
    plt.show()

def r2_plot(alphas, r2_list):
    """Plot R2 against alpha (log x-axis) and mark the highest-R2 point.

    ``alphas`` and ``r2_list`` are indexed by the same position. The red
    marker is placed at the maximum R2 (found with ``np.argmax``), and the
    legend reports that maximum and the alpha it occurs at.
    """
    sns.lineplot(x=alphas, y=r2_list)
    plt.xscale("log")
    plt.ylabel("R2")

    # Higher R2 is better, so the point of interest is the maximum.
    best_idx = np.argmax(r2_list)
    best_alpha = alphas[best_idx]
    best_r2 = r2_list[best_idx]

    # The legend previously said "Min R2" although the marked point is the maximum.
    plt.plot(best_alpha, best_r2, 'ro', label=f'Max R2: {best_r2:.2f}\nAlpha: {best_alpha:.2e}')
    plt.legend()
    plt.show()

def print_coefs(coefs, mse_list):
    """Print and return the absolute coefficients of the lowest-error row.

    Parameters
    ----------
    coefs : pandas.DataFrame
        One row of coefficients per entry in ``mse_list``.
    mse_list : sequence of float
        Error per row; the row at the minimum is selected.

    Returns
    -------
    pandas.Series
        Absolute coefficients of the selected row, sorted from largest to smallest.
    """
    best_row = np.argmin(mse_list)
    magnitudes = coefs.iloc[best_row].abs()
    ranked = magnitudes.sort_values(ascending=False)
    print(ranked)
    return ranked

Vamos a hacer la primera ejecución.

In [None]:
# Separate the target (`Price`) from the predictors.
y = toyota["Price"]
X = toyota.drop(columns="Price")

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

# First, very wide sweep: alpha from 10**-10 to 10**10.
ridge_selection(X_train, X_test, y_train, y_test, min_alpha=-10, max_alpha=10, steps=100)

Ajustamos el rango del `logspace` de prueba: queremos un valor de alpha más alto que nos permita quitar más variables sin empeorar demasiado el modelo.

In [None]:
# Narrower sweep with a finer grid: 200 alphas between 10**0 and 10**5
# (min_alpha / max_alpha are the exponents passed to np.logspace).
ridge_selection(X_train, X_test, y_train, y_test, min_alpha=0, max_alpha=5, steps=200)

Queremos iterar múltiples veces, eliminando las peores variables en cada iteración. Vamos a eliminar en cada iteración las cinco peores variables y corroborar los mejores resultados.

Vamos a fijar en 0 el exponente mínimo del `logspace` (es decir, alpha mínimo = 10^0 = 1). Fue uno de los mejores valores probados.

In [None]:
def iterate_ridge_selection(df, min_alpha=-10, max_alpha=10, steps=100, drop_coefs=5):
    """Repeatedly run ``ridge_selection`` while pruning the weakest features.

    Each round splits the current feature set 70/30 (fixed seed, so every
    round uses the same rows), runs ``ridge_selection`` on it, and drops the
    ``drop_coefs`` features it reports as weakest. Rounds continue while more
    than ``drop_coefs`` features remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``"Price"`` target column; every other column is
        treated as a feature.
    min_alpha, max_alpha : float
        Forwarded to ``ridge_selection`` (exponents of the alpha sweep).
    steps : int
        Forwarded to ``ridge_selection`` (number of alphas per sweep).
    drop_coefs : int
        Number of features removed per round; must be at least 1.

    Raises
    ------
    ValueError
        If ``drop_coefs`` is smaller than 1. Such a value would drop nothing
        on some round and the loop would never terminate.
    """
    if drop_coefs < 1:
        raise ValueError(f"drop_coefs must be at least 1, got {drop_coefs}")

    X = df.drop(columns=["Price"])
    y = df["Price"]

    while len(X.columns) > drop_coefs:
        # Split at the top of each round: one call site, and no unused split after the last drop.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, shuffle=True
        )
        worst_coefs = ridge_selection(
            X_train, X_test, y_train, y_test, min_alpha, max_alpha, steps, n_worst=drop_coefs
        )
        X = X.drop(columns=worst_coefs)

# Prune 5 features per round, sweeping 100 alphas between 10**0 and 10**4 in each round.
iterate_ridge_selection(toyota, min_alpha=0, max_alpha=4, steps=100, drop_coefs=5)

Como podemos ver, se pueden quitar variables sin que afecte demasiado al modelo. Por el criterio de la navaja de Ockham, queremos quedarnos con el modelo más simple. Vamos a probar iterando desde el modelo de 15 variables y quitando de a una variable.

In [None]:
# Remove the features discarded by the previous rounds to get the reduced starting frame.
fifteen_df = toyota.drop(
    columns=[
        'm_exec', 'Metallic_Rim', 'Sport_Model', 'm_vvti', 'm_16v',
        'HP', 'Radio', 'Met_Color', 'Central_Lock', 'm_luna',
        'm_terra', 'm_sport', 'Airbag_2', 'Airbag_1', 'ABS',
        'Tow_Bar', 'Automatic', 'm_wagon', 'Mistlamps', 'm_sedan',
        'BOVAG_Guarantee', 'Backseat_Divider', 'm_matic4', 'm_sol', 'Airco',
        'm_comfort', 'Power_Steering', 'Mfr_Guarantee', 'cc', 'Boardcomputer',
        'm_hatch_b', 'CD_Player', 'Gears', 'm_gtsi', 'm_g6',
    ]
)

# From here on, drop a single feature per round with a finer grid of 200 alphas.
iterate_ridge_selection(fifteen_df, min_alpha=0, max_alpha=4, steps=200, drop_coefs=1)

Tan solo en la primera iteración el modelo ya empeora el RMSE en 30 unidades. Consideramos que lo ideal puede ser quedarse con las 15 features, considerando el aumento de RMSE al eliminar más.