## Realizar optimizacion con optuna pero de todo el pipeline, desde imputacion, tolerancia, encoding hasta el modelo

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from feature_engine.imputation import (AddMissingIndicator, 
                                       MeanMedianImputer, 
                                       CategoricalImputer
                                       )
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder, OrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import optuna
import lightgbm as lgb
from optuna.visualization import (plot_param_importances,
                                  plot_optimization_history,
                                  plot_parallel_coordinate,
                                  plot_slice,
                                  plot_contour)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error


In [30]:
data_gold = pd.read_excel("../data/gold/car_gold.xlsx")
## selected feature
features = ["price", "antiguedad", "kilometraje", "vehicle_brand", "vehicle_line", 'location_state']
data_gold = data_gold[features]
data_gold.head()



Unnamed: 0,price,antiguedad,kilometraje,vehicle_brand,vehicle_line,location_state
0,183900000,0.0,5800.0,byd,song,bogota_d.c.
1,64900000,8.0,59000.0,mazda,3,antioquia
2,72000000,6.0,47000.0,,,bogota_d.c.
3,59500000,0.0,3600.0,kia,picanto,bogota_d.c.
4,63000000,9.0,81000.0,nissan,x-trail,bogota_d.c.


In [31]:
## dividir en train y test
X_train, X_test, y_train, y_test = train_test_split(data_gold.drop(labels=['price'], axis=1),
                                                    data_gold['price'],
                                                    test_size=0.3,
                                                    random_state=0) # reproducibilidad

In [32]:
def autoconfigure_features(data):
    vars_with_na = [var for var in data.columns if data[var].isnull().sum() > 0]
    num_vars = data.select_dtypes(include=['int64', 'float64']).columns
    num_vars_na = [var for var in num_vars if var in vars_with_na]
    cat_vars = data.select_dtypes(include=['object']).columns
    cat_vars_na = [var for var in cat_vars if var in vars_with_na]

    output = {
        "num_vars": num_vars_na,
        "cat_vars": cat_vars_na,
        "vars_with_na": vars_with_na,
        "num_vars_na": num_vars_na,
        "cat_vars_na": cat_vars_na
    }
    return output


In [33]:
variables = autoconfigure_features(data_gold)

In [34]:
num_vars_na = variables['num_vars_na']
cat_vars_na = variables['cat_vars_na']
vars_with_na = variables['vars_with_na']
num_vars = variables['num_vars']
cat_vars = variables['cat_vars']


In [61]:
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "mse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 150, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.008, 0.02, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 500, 800),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True),
    }
    imputation_method_suggest = trial.suggest_categorical('imputation_method', ['median', 'mean'])
    rarelabel_tol_suggest = trial.suggest_float('rarelabel_tol', 0.0009, 0.005)
    encoding_method_suggest = trial.suggest_categorical('encoding_method', ['ordered', 'onehot'])
    scaler_method_suggest = trial.suggest_categorical('scaler_method', ['standard', 'minmax'])
    pipeline_steps = [
        ("missing_indicator", AddMissingIndicator(variables=vars_with_na)),
        ("numerical_imputation", MeanMedianImputer(imputation_method=imputation_method_suggest, variables=num_vars_na)),
        ("categorical_imputation", CategoricalImputer(imputation_method='missing', fill_value='missing', variables=cat_vars_na)),
        ("rare_label_encoder", RareLabelEncoder(tol=rarelabel_tol_suggest, variables=cat_vars, n_categories=1)),
    ]
    if encoding_method_suggest == 'ordered':
        pipeline_steps.append(("ordinal_encoder", OrdinalEncoder(variables=cat_vars, encoding_method='ordered')))
    else:
        pipeline_steps.append(("onehot_encoder", OneHotEncoder(variables=cat_vars, drop_last=True)))
    
    if scaler_method_suggest == 'standard':
        pipeline_steps.append(("scaler", SklearnTransformerWrapper(transformer=StandardScaler())))
    else:
        pipeline_steps.append(("scaler", SklearnTransformerWrapper(transformer=MinMaxScaler())))
    
    pipeline_steps.append(("model", lgb.LGBMRegressor(**params)))
    
    pipeline = Pipeline(pipeline_steps)
    pipeline.fit(X_train, y_train)
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred_test)
    return r2
    
    

In [62]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-10-23 19:38:14,352] A new study created in memory with name: no-name-c5a45880-dd4b-42ef-9ea9-92499851d058


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-23 19:38:16,910] Trial 0 finished with value: 0.5891308160626638 and parameters: {'num_leaves': 153, 'max_depth': 7, 'learning_rate': 0.01453804717479783, 'n_estimators': 627, 'min_child_samples': 28, 'subsample': 0.958737385281806, 'colsample_bytree': 0.8505294385655255, 'reg_alpha': 0.019738570956098947, 'reg_lambda': 2.598602111873286, 'imputation_method': 'median', 'rarelabel_tol': 0.002586807728988591, 'encoding_method': 'onehot', 'scaler_method': 'minmax'}. Best is trial 0 with value: 0.5891308160626638.
[I 2025-10-23 19:38:20,100] Trial 1 finished with value: 0.6942911216464291 and parameters: {'num_leaves': 193, 'max_depth': 6, 'learning_rate': 0.014825945370211823, 'n_estimators': 609, 'min_child_samples': 19, 'subsample': 0.951500620281734, 'colsample_bytree': 0.8828799092333501, 'reg_alpha': 0.047195156994219806, 'reg_lambda': 0.000443807344962488, 'imputation_method': 'median', 'rarelabel_tol': 0.0020820357340203236, 'encoding_method': 'ordered', 'scaler_method':

In [63]:
study.best_params

{'num_leaves': 243,
 'max_depth': 9,
 'learning_rate': 0.014180120795088952,
 'n_estimators': 728,
 'min_child_samples': 23,
 'subsample': 0.7228569663786978,
 'colsample_bytree': 0.7878905568279974,
 'reg_alpha': 2.5118425873931582,
 'reg_lambda': 0.00010138247656134911,
 'imputation_method': 'median',
 'rarelabel_tol': 0.0009097438239061054,
 'encoding_method': 'ordered',
 'scaler_method': 'standard'}

In [64]:
study.best_value

0.7208662480904731

In [55]:
study.best_value


0.6890609208805107

In [69]:
best_pipeline = study.best_trial.params

In [70]:
# Filtrar solo los parámetros del modelo LightGBM (quitar los de preprocesamiento)
model_param_keys = [
    'num_leaves', 'max_depth', 'learning_rate', 'n_estimators',
    'min_child_samples', 'subsample', 'colsample_bytree',
    'reg_alpha', 'reg_lambda'
]
best_model_params = {k: v for k, v in study.best_trial.params.items() if k in model_param_keys}
best_model_params


{'num_leaves': 243,
 'max_depth': 9,
 'learning_rate': 0.014180120795088952,
 'n_estimators': 728,
 'min_child_samples': 23,
 'subsample': 0.7228569663786978,
 'colsample_bytree': 0.7878905568279974,
 'reg_alpha': 2.5118425873931582,
 'reg_lambda': 0.00010138247656134911}

In [66]:
pipeline_steps = [
    ("missing_indicator", AddMissingIndicator(variables=vars_with_na)),
    ("numerical_imputation", MeanMedianImputer(imputation_method = best_pipeline['imputation_method'], variables=num_vars_na)),
    ("categorical_imputation", CategoricalImputer(imputation_method='missing', fill_value='missing', variables=cat_vars_na)),
    ("rare_label_encoder", RareLabelEncoder(tol=best_pipeline['rarelabel_tol'], variables=cat_vars, n_categories=1)),
    ("ordinal_encoder", OrdinalEncoder(variables=cat_vars, encoding_method='ordered')),
    ("scaler", SklearnTransformerWrapper(transformer=StandardScaler())),
    ("model", lgb.LGBMRegressor(**best_model_params))
]


In [67]:
final_pipeline = Pipeline(pipeline_steps)

In [68]:
final_pipeline.fit(X_train, y_train)
y_pred_train_lgb = final_pipeline.predict(X_train)
r2_lgb = r2_score(y_train, y_pred_train_lgb)
mse_lgb = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
mae_lgb = mean_absolute_error(y_train, y_pred_train_lgb)
medae_lgb = median_absolute_error(y_train, y_pred_train_lgb)
metrics_lgb_train = {
    "r2": r2_lgb,
    "mse": mse_lgb,
    "mae": mae_lgb,
    "medae": medae_lgb
}

y_pred_test_lgb = final_pipeline.predict(X_test)
r2_lgb = r2_score(y_test, y_pred_test_lgb)
mse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
mae_lgb = mean_absolute_error(y_test, y_pred_test_lgb)
medae_lgb = median_absolute_error(y_test, y_pred_test_lgb)
metrics_lgb_test = {
    "r2": r2_lgb,
    "mse": mse_lgb,
    "mae": mae_lgb,
    "medae": medae_lgb
}
metrics_lgb = {
    "train": metrics_lgb_train,
    "test": metrics_lgb_test
}
metrics_lgb

{'train': {'r2': 0.8042688901535785,
  'mse': np.float64(29404547.02567557),
  'mae': 13081317.700969419,
  'medae': 6566933.857926995},
 'test': {'r2': 0.7170856652769602,
  'mse': np.float64(35094180.87123917),
  'mae': 17081808.63120259,
  'medae': 8281661.705354728}}