## Estudio optuna

In [4]:
FEATURES = [
    'NO2 (ug/m3)', 'O3 (ug/m3)', 'PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'CO (mg/m3)', 'SO2 (ug/m3)',
    'Benceno (ug/m3)', 'Tolueno (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)',
    'temperature_2m (°C)', 'wind_speed_10m (km/h)', 'wind_gusts_10m (km/h)',
    'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'et0_fao_evapotranspiration (mm)',
    'dew_point_2m (°C)', 'rain (mm)', 'vapour_pressure_deficit (kPa)',
    'cloud_cover (%)', 'shortwave_radiation (W/m²)'
]

for col in FEATURES.copy():
    for i in range(1, 4):
        FEATURES.append(f'{col}_lag_{i}')

FEATURES += [
    'precip_autumn_last_year', 'precip_winter_last_year',
    'temp_mean_spring_summer_last', 'humidity_sum_spring_summer_last',
    'co2_mean_april_may_last', 'o3_mean_april_may_last', 
    'no2_mean_april_may_last', 'drought_days_summer_last', 
    'growing_degree_days_last'
]

FEATURES += ['polen_rolling_mean_3d', 'polen_rolling_mean_7d']

for i in range(1, 8):
    FEATURES.append(f'polen_lag_{i}')

for i in range(1, 8):
    for col in ['polen_rolling_mean_3d', 'polen_rolling_mean_7d']:
         FEATURES.append(f'{col}_lag_{i}')

for h in [1, 2, 3]:
    FEATURES += [f'temperature_forecast_t+{h}', f'rain_forecast_t+{h}', f'humidity_forecast_t+{h}']

In [5]:
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.metrics import r2_score

df = pd.read_csv(r"..\new_datasets\datos_gramineas.csv")
df['fecha'] = pd.to_datetime(df['fecha'])
df['target_diff'] = df['granos_de_polen_x_metro_cubico'].diff()
TARGET = 'target_diff'

def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'objective': 'reg:squarederror',
        'random_state': 42
    }

    # Dividir datos (usando el año 2024 como validación externa)
    df_train = df[df['fecha'].dt.year < 2024].dropna(subset=[TARGET])
    df_val = df[df['fecha'].dt.year == 2024].dropna(subset=[TARGET])
    
    X_train, y_train = df_train[FEATURES], df_train[TARGET]
    X_val, y_val = df_val[FEATURES], df_val[TARGET]

    # Entrenar modelo con los parámetros sugeridos
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Predecir y calcular R2
    preds = model.predict(X_val)
    return r2_score(y_val, preds)

In [6]:
# Crear el estudio (direction="maximize" porque queremos el R2 más alto)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print(f"Mejor R2 encontrado: {study.best_value:0.2f}")
print("Mejores parámetros:", study.best_params)

# Ver qué parámetro influyó más en el R2
optuna.visualization.plot_param_importances(study).show()

# Ver el historial de mejora
optuna.visualization.plot_optimization_history(study).show()

[I 2026-02-06 21:31:25,773] A new study created in memory with name: no-name-d9d42b0c-76c3-4b61-863b-bae09153a015
[I 2026-02-06 21:31:30,776] Trial 0 finished with value: 0.08660515281167935 and parameters: {'n_estimators': 1693, 'max_depth': 7, 'learning_rate': 0.0576708551398071, 'subsample': 0.645874753278306, 'colsample_bytree': 0.6927408052355712, 'gamma': 0.7216806167730694}. Best is trial 0 with value: 0.08660515281167935.
[I 2026-02-06 21:31:33,159] Trial 1 finished with value: 0.30538100423656656 and parameters: {'n_estimators': 741, 'max_depth': 5, 'learning_rate': 0.05016743214009948, 'subsample': 0.5778327912021376, 'colsample_bytree': 0.5242898344250857, 'gamma': 3.9902020819584716}. Best is trial 1 with value: 0.30538100423656656.
[I 2026-02-06 21:31:38,347] Trial 2 finished with value: 0.12218818999312331 and parameters: {'n_estimators': 1738, 'max_depth': 8, 'learning_rate': 0.05377491988993071, 'subsample': 0.7812395821027955, 'colsample_bytree': 0.897690251937712, 'ga

Mejor R2 encontrado: 0.31
Mejores parámetros: {'n_estimators': 741, 'max_depth': 5, 'learning_rate': 0.05016743214009948, 'subsample': 0.5778327912021376, 'colsample_bytree': 0.5242898344250857, 'gamma': 3.9902020819584716}
