In [36]:
import pandas as pd
import xgboost as xgb
import matplotlib as plt
import numpy as np
import json

In [37]:
df = pd.read_csv(r"..\new_datasets\datos_gramineas.csv")
df['fecha'] = pd.to_datetime(df['fecha'])

### Adding Biological features (last year)

In [38]:
cols_memoria = [
    'precip_autumn_last_year', 'precip_winter_last_year',
    'temp_mean_spring_summer_last', 'humidity_sum_spring_summer_last',
    'co2_mean_april_may_last', 'o3_mean_april_may_last',
    'no2_mean_april_may_last', 'drought_days_summer_last',
    'growing_degree_days_last'
]
df = df.drop(columns=[c for c in cols_memoria if c in df.columns], errors='ignore')

years = df['fecha'].dt.year.unique()
memory_features = []
    
for year in years:
    mask_year = df['fecha'].dt.year == year
    df_year = df[mask_year].copy()
        
    if len(df_year) < 260: 
        continue

    mask_am = df_year['fecha'].dt.month.isin([4, 5])
    mask_ma = df_year['fecha'].dt.month.isin([3, 4, 5, 6, 7, 8])
    mask_summer = df_year['fecha'].dt.month.isin([6, 7, 8])
    mask_autumn = df_year['fecha'].dt.month.isin([10, 11, 12])
    mask_winter = df_year['fecha'].dt.month.isin([1, 2, 3])

    precip_autumn = df_year.loc[mask_autumn, 'rain (mm)'].sum()
    precip_winter = df_year.loc[mask_winter, 'rain (mm)'].sum()
    temp_mean_ss = df_year.loc[mask_ma, 'temperature_2m (°C)'].mean()
    humidity_sum_ss = df_year.loc[mask_ma, 'relative_humidity_2m (%)'].sum()
    co2_mean_am = df_year.loc[mask_am, 'CO (mg/m3)'].mean()
    o3_mean_am = df_year.loc[mask_am, 'O3 (ug/m3)'].mean()
    no2_mean_am = df_year.loc[mask_am, 'NO2 (ug/m3)'].mean()
    drought_days = (df_year.loc[mask_summer, 'rain (mm)'] < 1.0).sum()
    mask_gdd = (df_year['fecha'].dt.month >= 2)
    gdd = (df_year.loc[mask_gdd, 'temperature_2m (°C)'] - 5).clip(lower=0).sum()

    memory_features.append({
        'year_target': year + 1,
        'precip_autumn_last_year': precip_autumn,
        'precip_winter_last_year': precip_winter,
        'temp_mean_spring_summer_last': temp_mean_ss,
        'humidity_sum_spring_summer_last': humidity_sum_ss,
        'co2_mean_april_may_last': co2_mean_am,
        'o3_mean_april_may_last': o3_mean_am,
        'no2_mean_april_may_last': no2_mean_am,
        'drought_days_summer_last': drought_days,
        'growing_degree_days_last': gdd
    })
    
df_memory = pd.DataFrame(memory_features)
df['year'] = df['fecha'].dt.year
df = pd.merge(df, df_memory, left_on='year', right_on='year_target', how='left')
df.drop(columns=['year_target', 'year'], inplace=True)
df

Unnamed: 0,fecha,granos_de_polen_x_metro_cubico,temperature_2m (°C),wind_speed_10m (km/h),wind_gusts_10m (km/h),relative_humidity_2m (%),wind_direction_10m (°),et0_fao_evapotranspiration (mm),dew_point_2m (°C),rain (mm),...,Tolueno (ug/m3),precip_autumn_last_year,precip_winter_last_year,temp_mean_spring_summer_last,humidity_sum_spring_summer_last,co2_mean_april_may_last,o3_mean_april_may_last,no2_mean_april_may_last,drought_days_summer_last,growing_degree_days_last
0,2014-01-01,0.0,1.066667,12.283333,22.187500,89.375000,173.541667,0.011250,-0.525000,0.000000,...,0.8,,,,,,,,,
1,2014-01-02,0.0,3.341667,15.962500,28.450000,85.958333,171.000000,0.015833,1.170833,0.000000,...,1.6,,,,,,,,,
2,2014-01-03,1.0,5.587500,14.808333,27.833333,87.875000,195.541667,0.018333,3.683333,0.000000,...,2.4,,,,,,,,,
3,2014-01-04,0.0,6.216667,15.016667,29.950000,85.125000,171.166667,0.020417,3.904167,0.045833,...,0.9,,,,,,,,,
4,2014-01-05,0.0,4.987500,12.566667,23.804167,91.958333,227.375000,0.009583,3.770833,0.008333,...,1.1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4432,2026-02-19,,7.327084,14.188892,34.439999,57.949936,283.524689,0.099310,-0.777083,0.000000,...,2.0,4.620833,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2425.966667
4433,2026-02-20,,6.960417,3.602944,10.770000,69.889557,185.878281,0.083127,1.520833,0.000000,...,4.3,4.620833,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2425.966667
4434,2026-02-21,,8.606522,4.654393,12.881739,66.904915,121.693459,0.095542,2.306522,0.000000,...,2.8,4.620833,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2425.966667
4435,2026-02-22,,10.452583,3.787788,9.480000,54.416668,154.398895,0.101121,0.396723,0.000000,...,,4.620833,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2425.966667


### Polen

In [39]:
# 1. Definimos los lags que queremos usar
lags_seleccionados = [1, 2, 3, 7, 14]

# 2. Medias móviles base
df['polen_rolling_mean_3d'] = df['granos_de_polen_x_metro_cubico'].rolling(window=3).mean()
df['polen_rolling_mean_7d'] = df['granos_de_polen_x_metro_cubico'].rolling(window=7).mean()

# 3. Lags de polen y medias móviles
for i in lags_seleccionados:
    df[f'polen_lag_{i}'] = df['granos_de_polen_x_metro_cubico'].shift(i)
    df[f'polen_rolling_mean_3d_lag_{i}'] = df['polen_rolling_mean_3d'].shift(i)
    df[f'polen_rolling_mean_7d_lag_{i}'] = df['polen_rolling_mean_7d'].shift(i)

# 4. Tendencia y Aceleración
df['polen_trend_3d'] = df['polen_lag_1'] - df['polen_lag_3']
df['polen_accel'] = (df['polen_lag_1'] - df['polen_lag_2']) - (df['polen_lag_2'] - df['polen_lag_3'])

### Lag Contaminantes y Meteo (3 días)

In [40]:
cols = [
    'NO2 (ug/m3)', 'O3 (ug/m3)', 'PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'CO (mg/m3)',
    'Benceno (ug/m3)', 'Tolueno (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)',
    'temperature_2m (°C)', 'wind_speed_10m (km/h)', 'wind_gusts_10m (km/h)',
    'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'et0_fao_evapotranspiration (mm)',
    'dew_point_2m (°C)', 'rain (mm)', 'vapour_pressure_deficit (kPa)',
    'cloud_cover (%)', 'shortwave_radiation (W/m²)', "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"
]

for col in cols:
    if col in df.columns:
        for i in range(1, 4):
            df[f'{col}_lag_{i}'] = df[col].shift(i)
        df[f'{col}_lag_{7}'] = df[col].shift(7)
        df[f'{col}_lag_{14}'] = df[col].shift(14)

  df[f'{col}_lag_{7}'] = df[col].shift(7)
  df[f'{col}_lag_{14}'] = df[col].shift(14)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{7}'] = df[col].shift(7)
  df[f'{col}_lag_{14}'] = df[col].shift(14)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{7}'] = df[col].shift(7)
  df[f'{col}_lag_{14}'] = df[col].shift(14)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{7}'] = df[col].shift(7)
  df[f'{col}_lag_{14}'] = df[col].shift(14)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{i}'] = df[col].shift(i)
  df[f'{col}_lag_{7}'] = df[col].shift(7)
  df[f'{col}_lag_{14}'] = df[col].shift(14)


### Guardar dataframe

In [41]:
df.to_csv(r"..\new_datasets\datos_gramineas.csv", index=False)

### Add Features

In [42]:
METEO_FEATURES = [
    'NO2 (ug/m3)', 'O3 (ug/m3)', 'PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'CO (mg/m3)', 'SO2 (ug/m3)',
    'Benceno (ug/m3)', 'Tolueno (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)',
    'temperature_2m (°C)', 'wind_speed_10m (km/h)', 'wind_gusts_10m (km/h)',
    'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'et0_fao_evapotranspiration (mm)',
    'dew_point_2m (°C)', 'rain (mm)', 'vapour_pressure_deficit (kPa)',
    'cloud_cover (%)', 'shortwave_radiation (W/m²)'
]

FEATURES = METEO_FEATURES.copy()

for col in METEO_FEATURES:
    for i in [1, 2, 3, 7, 14]:
        FEATURES.append(f'{col}_lag_{i}')

FEATURES += [
    'precip_autumn_last_year', 'precip_winter_last_year',
    'temp_mean_spring_summer_last', 'humidity_sum_spring_summer_last',
    'co2_mean_april_may_last', 'o3_mean_april_may_last', 
    'no2_mean_april_may_last', 'drought_days_summer_last', 
    'growing_degree_days_last'
]

FEATURES += ['polen_rolling_mean_3d', 'polen_rolling_mean_7d', 'polen_trend_3d', 'polen_accel']

for i in [1, 2, 3, 7, 14]:
    FEATURES.append(f'polen_lag_{i}')
    FEATURES.append(f'polen_rolling_mean_3d_lag_{i}')
    FEATURES.append(f'polen_rolling_mean_7d_lag_{i}')

for h in [1, 2, 3]:
    FEATURES += [f'temperature_forecast_t+{h}', f'rain_forecast_t+{h}', f'humidity_forecast_t+{h}']

# Asegurar no repetidos y guardar features en JSON
FEATURES = list(dict.fromkeys(FEATURES))
FEATURES = [f for f in FEATURES if f in df.columns]

with open('json/features_t+1.json', 'w', encoding='utf-8') as f:
    json.dump(FEATURES, f, ensure_ascii=False, indent=4)