In [None]:
import pandas as pd, numpy as np
from sklearn.compose      import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline     import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.inspection   import permutation_importance
import matplotlib.pyplot as plt
import os
from pathlib import Path

In [None]:
def Ridge(
    X, y,
    alphas=None,
    cv=5,
    scoring='neg_root_mean_squared_error',
    top_n=30,
    perm_n=30,
    categorical=('Cl_73', 'Cl_74', 'Cl_75', 'Cl_76')
):
    """
    Fit a RidgeCV pipeline, plot the largest coefficients and compute
    permutation importances.

    Columns not listed as categorical are passed through ``StandardScaler``;
    categorical columns are one-hot encoded. ``RidgeCV`` then selects the
    regularisation strength from ``alphas``. Permutation importances are
    computed on the same ``X`` / ``y`` the model was fitted on.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table; column names are used to route columns to the
        numeric or categorical branch.
    y : array-like
        Regression target.
    alphas : sequence of float or None
        Candidate regularisation strengths. ``None`` uses
        ``[0.01, 0.1, 1.0, 10.0, 100.0]``.
    cv : int or CV splitter
        Forwarded to ``RidgeCV``.
    scoring : str
        Forwarded to both ``RidgeCV`` and ``permutation_importance``.
    top_n : int
        Maximum number of coefficients shown in the bar chart.
    perm_n : int
        Not used by this function; kept so existing calls keep working.
    categorical : list-like, str or None
        • list / tuple → columns to one-hot encode, no dtype check
          (default: ``Cl_73`` … ``Cl_76``).
        • str → treated as a single column name.
        • None → every column with dtype 'object' or 'category'.

    Returns
    -------
    model : sklearn.pipeline.Pipeline
        The fitted preprocessing + RidgeCV pipeline.
    best_alpha : float
        ``alpha_`` chosen by RidgeCV.
    coeffs : pd.Series
        Ridge coefficients indexed by expanded feature name
        (numeric columns first, then one-hot columns).
    perm_imp : pd.Series
        Mean permutation importance per original column of ``X``.

    Raises
    ------
    ValueError
        If any requested categorical column is not a column of ``X``.
    """
    if alphas is None:
        alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

    if categorical is None:
        cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    elif isinstance(categorical, str):
        # list('Cl_73') would split the name into single characters.
        cat_cols = [categorical]
    else:
        cat_cols = list(categorical)

    # Fail early with a readable message; the hard-coded default makes this
    # an easy mistake when X comes from a different data set.
    missing = [c for c in cat_cols if c not in X.columns]
    if missing:
        raise ValueError(
            f"categorical columns not found in X: {missing}. "
            "Pass categorical=None to auto-detect or give an explicit list."
        )

    num_cols = [c for c in X.columns if c not in cat_cols]

    pre = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ])

    model = Pipeline([
        ('pre',    pre),
        ('ridgecv', RidgeCV(alphas=alphas, cv=cv, scoring=scoring))
    ])

    model.fit(X, y)

    ridge = model.named_steps['ridgecv']
    best_alpha = ridge.alpha_

    # Expanded feature names, in the order the ColumnTransformer emits them
    # (numeric block first, then the one-hot block). Built as a plain list so
    # an empty categorical block never mixes string and float arrays.
    if cat_cols:
        ohe = model.named_steps['pre'].named_transformers_['cat']
        cat_names = list(ohe.get_feature_names_out(cat_cols))
    else:
        cat_names = []
    feat_names = list(num_cols) + cat_names

    # Flatten a (1, n_features) coefficient array (possible when y is 2-D
    # with a single column) so it fits into a Series.
    coef = np.asarray(ridge.coef_)
    if coef.ndim == 2 and coef.shape[0] == 1:
        coef = coef[0]
    coeffs = pd.Series(coef, index=feat_names)

    # Bar chart of the largest coefficients by magnitude.
    abs_coefs = coeffs.abs().sort_values(ascending=False).head(top_n)
    fig, ax = plt.subplots(figsize=(8, max(6, len(abs_coefs) * 0.2)))
    abs_coefs.plot(kind='barh', ax=ax)
    ax.invert_yaxis()  # largest coefficient at the top
    ax.set_title(f'Top {top_n} Absolute Ridge Coefficients (alpha={best_alpha})')
    ax.set_xlabel('Absolute Coefficient')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # avoid accumulating open figures on repeated calls

    result = permutation_importance(model, X, y, n_repeats=10,
                                    random_state=0, scoring=scoring)
    perm_imp = pd.Series(result.importances_mean, index=X.columns)

    return model, best_alpha, coeffs, perm_imp

In [None]:
def save_data(
    perm_imp,
    coeffs,
    perm_n: int = 30,
    test_name: str = None
):
    """
    Write ridge results to the ``Ridge/`` directory and plot the top
    permutation importances.

    Files written (directory is created if needed):
      * ``<prefix>ridge_perm_imp.csv`` – ``perm_imp`` as CSV
      * ``<prefix>ridge_coeffs.csv``   – ``coeffs`` as CSV
      * ``<prefix>ridge_perm_imp.png`` – horizontal bar chart of the
        ``perm_n`` largest importances by absolute value

    where ``<prefix>`` is ``f"{test_name}_"``, or empty when ``test_name`` is
    None (so no file is literally named ``None_...``).

    Parameters
    ----------
    perm_imp : pd.Series
        Permutation importances; must support ``to_csv``, ``abs``,
        ``sort_values`` and ``plot``.
    coeffs : pd.Series
        Model coefficients; only written to CSV.
    perm_n : int
        Maximum number of importances shown in the chart.
    test_name : str or None
        Label used as file-name prefix.

    Returns
    -------
    None
    """
    outdir = Path("Ridge")
    outdir.mkdir(parents=True, exist_ok=True)

    # Only None drops the prefix; any explicit string is used verbatim.
    prefix = f"{test_name}_" if test_name is not None else ""

    perm_imp.to_csv(outdir / f"{prefix}ridge_perm_imp.csv")
    coeffs.to_csv(outdir / f"{prefix}ridge_coeffs.csv")

    # The chart ranks and displays absolute importances.
    top_perm = perm_imp.abs().sort_values(ascending=False).head(perm_n)
    h = max(6, len(top_perm) * 0.25)
    fig, ax = plt.subplots(figsize=(8, h))
    top_perm.plot(kind="barh", ax=ax)
    ax.invert_yaxis()  # largest importance at the top
    ax.set_title(f"Top {perm_n} Permutation Importances")
    ax.set_xlabel("Mean decrease in neg RMSE")
    fig.tight_layout()
    fig.savefig(outdir / f"{prefix}ridge_perm_imp.png", dpi=150, bbox_inches="tight")
    plt.show()
    plt.close(fig)  # avoid accumulating open figures when called in a loop