In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy of `cut_outliers`; every cell below reads from `toyota`.
# NOTE(review): `cut_outliers` is not defined in the visible cells — presumably an
# earlier cell builds it; confirm it exists on a fresh Restart & Run All.
toyota = cut_outliers.copy()

def ridge_selection(X_train, X_test, y_train, y_test, min_alpha=0, max_alpha=4, steps=100, n_worst=5):
    """Sweep Ridge regularisation strengths and return the weakest features.

    A single Ridge estimator is refitted on the training split for each of
    ``steps`` alpha values spaced evenly on a log scale from ``10**min_alpha``
    to ``10**max_alpha``. Every fit is scored on the test split (RMSE and R2).
    The coefficient paths, the RMSE curve and the R2 curve are plotted, and the
    absolute coefficients at the lowest-RMSE alpha are printed.

    Parameters
    ----------
    X_train, X_test
        Feature tables; ``X_train.columns`` supplies the feature names.
    y_train, y_test
        Targets matching the feature tables.
    min_alpha, max_alpha
        Base-10 exponents bounding the alpha grid.
    steps
        Number of alpha values tried.
    n_worst
        How many of the smallest-|coefficient| features to return.

    Returns
    -------
    list
        Names of the ``n_worst`` features with the smallest absolute
        coefficient at the alpha that gave the lowest test RMSE.
    """
    model = Ridge()

    # The bounds are exponents: the grid runs from 10**min_alpha to 10**max_alpha.
    alpha_grid = np.logspace(min_alpha, max_alpha, steps)
    coef_rows, rmse_scores, r2_scores = [], [], []

    # One refit per regularisation strength, scored on the held-out split.
    for strength in alpha_grid:
        model.set_params(alpha=strength)
        model.fit(X_train, y_train)
        coef_rows.append(model.coef_)

        predictions = model.predict(X_test)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, predictions)))
        r2_scores.append(r2_score(y_test, predictions))

    alpha_index = pd.Index(alpha_grid, name="alpha")
    feature_names = [f"{column}" for column in X_train.columns]
    coef_table = pd.DataFrame(coef_rows, index=alpha_index, columns=feature_names)

    coefs_plot(coef_table)
    rmse_plot(alpha_index, rmse_scores)
    r2_plot(alpha_index, r2_scores)

    # print_coefs sorts by |coefficient| descending, so the tail holds the weakest features.
    ranked = print_coefs(coef_table, rmse_scores)
    return ranked.tail(n_worst).index.tolist()

def coefs_plot(coefs):
    """Line-plot every column of ``coefs`` on a log-scaled x axis and show the figure."""
    axes = sns.lineplot(data=coefs)
    axes.set_xscale("log")
    plt.show()

def rmse_plot(alphas, rmse_list):
    """Plot RMSE against alpha (log x axis) and mark the lowest-RMSE point in red.

    ``alphas`` and ``rmse_list`` are indexed position by position; the legend
    shows the minimum RMSE and the alpha at which it occurs.
    """
    axes = sns.lineplot(x=alphas, y=rmse_list)
    axes.set_xscale("log")
    axes.set_ylabel("RMSE")

    # Locate the lowest RMSE and highlight it on the curve.
    best_pos = np.argmin(rmse_list)
    best_alpha, best_rmse = alphas[best_pos], rmse_list[best_pos]
    axes.plot(best_alpha, best_rmse, 'ro', label=f'Min RMSE: {best_rmse:.2f}\nAlpha: {best_alpha:.2e}')
    axes.legend()
    plt.show()

def r2_plot(alphas, r2_list):
    """Plot R2 against alpha (log x axis) and mark the highest-R2 point in red.

    ``alphas`` and ``r2_list`` are indexed position by position; the legend
    shows the maximum R2 and the alpha at which it occurs.
    """
    sns.lineplot(x=alphas, y=r2_list)
    plt.xscale("log")
    plt.ylabel("R2")
    # The point is chosen with argmax, so it is the maximum R2 and is labelled as such.
    best_idx = np.argmax(r2_list)
    best_alpha = alphas[best_idx]
    best_r2 = r2_list[best_idx]
    plt.plot(best_alpha, best_r2, 'ro', label=f'Max R2: {best_r2:.2f}\nAlpha: {best_alpha:.2e}')
    plt.legend()
    plt.show()

def print_coefs(coefs, mse_list):
    """Print and return the absolute coefficients at the best-scoring alpha.

    The row of ``coefs`` at the position of the smallest value in ``mse_list``
    is selected, converted to absolute values and sorted from largest to
    smallest. The sorted Series is printed and returned.
    """
    best_row = np.argmin(mse_list)
    ranked = coefs.iloc[best_row].abs().sort_values(ascending=False)
    print(ranked)
    return ranked

Vamos a hacer la primera ejecución.

In [None]:
# First sweep: every column except the target is a candidate feature, and the
# alpha grid spans 1e-10 .. 1e10 to locate the useful range.
y = toyota["Price"]
X = toyota.drop(columns=["Price"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
ridge_selection(X_train, X_test, y_train, y_test, steps=100, min_alpha=-10, max_alpha=10)

Ajustamos el logspace de prueba: queremos un valor de alpha más alto que nos permita quitar más variables sin empeorar demasiado el modelo.

In [None]:
# Narrower sweep on a finer grid: 200 alphas between 1e0 and 1e5, using the
# X_train / X_test / y_train / y_test split already defined at notebook level.
ridge_selection(X_train, X_test, y_train, y_test, min_alpha=0, max_alpha=5, steps=200)

Queremos iterar múltiples veces, eliminando las peores variables en cada iteración. Vamos a eliminar en cada iteración las cinco peores variables y corroborar los mejores resultados.

Vamos a fijar en 0 el exponente mínimo del logspace (es decir, el alpha mínimo es 10^0 = 1). Fue uno de los mejores valores probados.

In [None]:
def iterate_ridge_selection(df, min_alpha=-10, max_alpha=10, steps=100, drop_coefs=5):
    """Repeatedly run ``ridge_selection``, discarding the weakest features each round.

    Each round splits the current feature set 70/30 with a fixed seed, runs
    ``ridge_selection`` (which plots and prints its diagnostics) and removes
    the ``drop_coefs`` features it reports as weakest. Rounds continue while
    more than ``drop_coefs`` features remain.

    Parameters
    ----------
    df
        Table containing the ``"Price"`` target column plus the candidate features.
    min_alpha, max_alpha, steps
        Passed through to ``ridge_selection`` to build its alpha grid.
    drop_coefs
        Number of features removed per round; must be at least 1.

    Raises
    ------
    ValueError
        If ``drop_coefs`` is smaller than 1. No feature would ever be removed
        in that case, so the loop would never terminate.
    """
    if drop_coefs < 1:
        raise ValueError(f"drop_coefs must be at least 1, got {drop_coefs}")

    X = df.drop(columns=["Price"])
    y = df["Price"]
    while len(X.columns) > drop_coefs:
        # Re-split the surviving columns at the start of each round (same seed and
        # test size every time), so no split is computed that is never used.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
        worst_coefs = ridge_selection(X_train, X_test, y_train, y_test, min_alpha, max_alpha, steps, n_worst=drop_coefs)
        X = X.drop(columns=worst_coefs)

# Elimination loop on the full `toyota` frame: alpha grid 1e0 .. 1e4 with 100
# points, dropping the 5 weakest features after every round.
iterate_ridge_selection(toyota, min_alpha=0, max_alpha=4, steps=100, drop_coefs=5)

Como podemos ver, se pueden quitar variables sin que el modelo se vea demasiado afectado. Por el criterio de la navaja de Ockham, queremos quedarnos con el modelo más simple. Vamos a probar iterando desde el modelo de 15 variables y quitando de a una variable por vez.

In [None]:
# Start from a reduced frame (28 columns removed by name) and eliminate one
# feature per round, with a coarser grid of 20 alphas between 1e0 and 1e4.
# NOTE(review): the name `fifteen_df` suggests 15 features remain, but the feature
# list passed to `ridge_clean_outliers` later in the notebook has 21 entries —
# confirm how many columns this frame really keeps.
fifteen_df = toyota.drop(columns=['m_exec', 'Metallic_Rim', 'Sport_Model', 'm_vvti', 'm_16v', 'Radio', 'Met_Color',
                              'Central_Lock', 'm_luna', 'm_terra', 'm_sport', 'Airbag_2', 'Airbag_1', 'ABS', 'Tow_Bar',
                              'Automatic', 'm_wagon', 'Mistlamps', 'm_sedan', 'BOVAG_Guarantee', 'Backseat_Divider',
                              'm_sol', 'Airco', 'Power_Steering', 'cc',
                              'Boardcomputer', 'CD_Player', 'Gears'], axis=1)
iterate_ridge_selection(fifteen_df, min_alpha=0, max_alpha=4, steps=20, drop_coefs=1)

Tan solo en la primera iteración el modelo ya empeora el RMSE en 30 unidades. Consideramos que lo ideal puede ser quedarse con las 15 features, dado el aumento del RMSE al eliminar más.

Mfg_Year            7799.164414
Weight              4232.299238
KM                  4193.445233
m_vvtli             2728.438711
Quarterly_Tax       2387.143645
Automatic_airco     2106.063777
CNG                 1824.149363
Diesel              1678.914642
m_d4d               1252.832214
Guarantee_Period     986.240697
m_matic3             711.423817
m_gtsi               586.591210
m_g6                 561.510540
m_liftb              536.806869
m_bns                514.864141
Powered_Windows      474.923391
m_hatch_b            415.774423
m_matic4             415.249804
m_comfort            412.660478
Mfr_Guarantee        363.974532
HP                   133.164061

In [None]:
def ridge_simple(X, y, features, alpha):
    """Fit one Ridge model on the chosen features and print its hold-out scores.

    The columns named in ``features`` are selected from ``X``, the data is
    split 70/30 with a fixed seed, and a Ridge model with the given ``alpha``
    is fitted on the training part. The number of features, the test RMSE and
    the test R2 are printed.

    Returns
    -------
    The fitted Ridge estimator.
    """
    X_selected = X[features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.3, random_state=42, shuffle=True
    )

    model = Ridge(alpha=alpha).fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print("Features: ", len(features))
    print("RMSE: ",rmse)
    print("R²: ",r2)
    return model

Veamos si podemos mejorar el modelo limpiando outliers.

In [None]:
# Outlier identifier
def ridge_clean_outliers(selected_columns, alpha, outliers):
    """Refit Ridge without the given rows and report which remaining rows look like outliers.

    The rows labelled ``outliers`` are dropped from the notebook-level
    ``toyota`` frame, ``ridge_simple`` is fitted on ``selected_columns``, and
    the ``SklearnRidgeDiagnostic`` plots are drawn. The ``toyota`` index labels
    of every row flagged by any of the four diagnostics are printed.

    Parameters
    ----------
    selected_columns
        Feature column names used for the fit and for the diagnostics.
    alpha
        Ridge regularisation strength passed to ``ridge_simple``.
    outliers
        Index labels of ``toyota`` rows to remove before fitting (may be empty).

    Returns
    -------
    None. Results are printed and plotted.
    """
    print("Antes: ", toyota.shape)
    df_2 = toyota.drop(outliers, axis=0)
    print("Despues: ", df_2.shape)

    X = df_2.drop(columns=["Price"])
    y = df_2["Price"]
    model = ridge_simple(X, y, features=selected_columns, alpha=alpha)

    # Local import kept where the original cell had it. The package name
    # `toyota` coincides with the notebook-level DataFrame; this statement only
    # binds `SklearnRidgeDiagnostic` inside the function.
    from toyota.utils_ridge import SklearnRidgeDiagnostic
    diagnosticPlotter = SklearnRidgeDiagnostic(model, X[selected_columns], y, selected_columns)
    diagnosticPlotter()

    # Outliers reported by each diagnostic (lists of numpy arrays).
    flagged_groups = [
        diagnosticPlotter.residuals_vs_fitted_outliers,
        diagnosticPlotter.qq_plot_outliers,
        diagnosticPlotter.scale_location_outliers,
        diagnosticPlotter.leverage_plot_outliers,
    ]

    # Wrapping each group in a DataFrame pads ragged arrays with NaN, so the
    # values can be flattened uniformly; NaNs are skipped and duplicates merged.
    positions = set()
    for group in flagged_groups:
        for value in pd.DataFrame(group).to_numpy().ravel():
            if not pd.isnull(value):
                positions.add(int(value))

    # The flagged values are row positions in X. X was built from `toyota` by
    # dropping rows and columns only, so it still carries toyota's index labels:
    # the label is read directly by position. Matching rows by feature values
    # instead would also return unrelated rows that share the same values.
    row_labels = X.index.tolist()
    global_indexes = list(dict.fromkeys(row_labels[pos] for pos in sorted(positions)))
    print("Índices globales correspondientes a los outliers:", global_indexes)

In [None]:
# Ridge fit with alpha=1 on the 21 selected features, after removing two rows by index label.
features = ['Mfg_Year', 'Weight', 'KM', 'm_vvtli', 'Quarterly_Tax', 'Automatic_airco', 'CNG', 'Diesel', 'm_d4d', 'Guarantee_Period', 'm_matic3', 'm_gtsi', 'm_g6', 'm_liftb', 'm_bns', 'Powered_Windows', 'm_hatch_b', 'm_matic4', 'm_comfort', 'Mfr_Guarantee', 'HP']
outliers = [1131, 1133] # improvement of -50 RMSE and +0.0087 R2
ridge_clean_outliers(features, 1, outliers)

Probemos en Ridge las features que obtuvimos en la selección manual de features:

In [None]:
# Ridge fit with alpha=0.4 on the 10 features from the manual feature selection;
# the empty `outliers` list means no rows are removed before fitting.
features =  ['Mfg_Year',
'Weight',
'm_vvtli',
'KM',
'Quarterly_Tax',
'CNG',
'Automatic_airco',
'Guarantee_Period',
'cc',
'HP']
outliers = []
ridge_clean_outliers(features, 0.4, outliers)