In [1]:
import pandas as pd
import xgboost as xgb

In [2]:
# Load the gramíneas dataset and parse the 'fecha' column as datetimes.
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike (a raw backslash path only works on Windows).
df = pd.read_csv("../new_datasets/datos_gramineas.csv")
df['fecha'] = pd.to_datetime(df['fecha'])

### Adding Biological features (last year)

In [3]:
# Build per-year "memory" features: seasonal aggregates computed over calendar
# year Y are attached to every row whose date falls in year Y + 1.

# A calendar year with fewer rows than this is skipped entirely.
MIN_ROWS_PER_YEAR = 260
# Base value subtracted from 'temperature_2m (°C)' when accumulating growing
# degree days.
GDD_BASE_TEMP = 5

# Output columns, in the order they are appended to df.
MEMORY_COLUMNS = [
    'precip_autumn_last_year',
    'precip_winter_last_year',
    'temp_mean_spring_summer_last',
    'humidity_sum_spring_summer_last',
    'co2_mean_april_may_last',
    'o3_mean_april_may_last',
    'no2_mean_april_may_last',
    'drought_days_summer_last',
    'growing_degree_days_last',
]


def _yearly_memory_features(df_year):
    """Aggregate the rows of one calendar year into the memory-feature values.

    Parameters
    ----------
    df_year : pd.DataFrame
        Rows belonging to a single year. Must contain a datetime 'fecha'
        column plus the rain, temperature, humidity, CO, O3 and NO2 columns
        read below.

    Returns
    -------
    dict
        One entry per name in MEMORY_COLUMNS, in the same order.
    """
    month = df_year['fecha'].dt.month
    in_april_may = month.isin([4, 5])
    in_march_to_august = month.isin([3, 4, 5, 6, 7, 8])
    in_summer = month.isin([6, 7, 8])
    in_autumn = month.isin([10, 11, 12])
    in_winter = month.isin([1, 2, 3])
    from_february = month >= 2

    rain = df_year['rain (mm)']
    temperature = df_year['temperature_2m (°C)']

    return {
        'precip_autumn_last_year': rain[in_autumn].sum(),
        'precip_winter_last_year': rain[in_winter].sum(),
        'temp_mean_spring_summer_last': temperature[in_march_to_august].mean(),
        'humidity_sum_spring_summer_last':
            df_year.loc[in_march_to_august, 'relative_humidity_2m (%)'].sum(),
        # Despite the "co2" in the name, this is the mean of the 'CO (mg/m3)'
        # column; the column name is kept so downstream feature lists still match.
        'co2_mean_april_may_last': df_year.loc[in_april_may, 'CO (mg/m3)'].mean(),
        'o3_mean_april_may_last': df_year.loc[in_april_may, 'O3 (ug/m3)'].mean(),
        'no2_mean_april_may_last': df_year.loc[in_april_may, 'NO2 (ug/m3)'].mean(),
        # Number of June-August rows with less than 1.0 in the rain column.
        'drought_days_summer_last': (rain[in_summer] < 1.0).sum(),
        # Sum of (temperature - base), floored at 0, from February onwards.
        'growing_degree_days_last':
            (temperature[from_february] - GDD_BASE_TEMP).clip(lower=0).sum(),
    }


# Make the cell idempotent: if df already carries these columns (cell re-run,
# or a CSV that was saved with them), merge() would otherwise emit *_x / *_y
# duplicates and later lookups of the plain names would fail.
df = df.drop(columns=MEMORY_COLUMNS, errors='ignore')

years = df['fecha'].dt.year.unique()
memory_features = []

for year in years:
    # Rows of this calendar year.
    df_year = df[df['fecha'].dt.year == year]

    if len(df_year) < MIN_ROWS_PER_YEAR:
        continue

    memory_features.append({
        'year_target': year + 1,
        **_yearly_memory_features(df_year),
    })

# Explicit columns keep the merge below valid even when no year qualified
# (an empty list would otherwise yield a frame without 'year_target').
df_memory = pd.DataFrame(memory_features, columns=['year_target', *MEMORY_COLUMNS])
df_memory = df_memory.astype({name: float for name in MEMORY_COLUMNS})

# Left join on the row's own year; rows whose previous year has no entry in
# df_memory get NaN in every memory column.
df = (
    df.assign(year=df['fecha'].dt.year)
      .merge(df_memory, left_on='year', right_on='year_target', how='left')
      .drop(columns=['year_target', 'year'])
)
df

Unnamed: 0.1,fecha,Unnamed: 0,tipo_polinico,granos_de_polen_x_metro_cubico,año,temperature_2m (°C),wind_speed_10m (km/h),wind_gusts_10m (km/h),relative_humidity_2m (%),wind_direction_10m (°),...,Tolueno (ug/m3),precip_autumn_last_year,precip_winter_last_year,temp_mean_spring_summer_last,humidity_sum_spring_summer_last,co2_mean_april_may_last,o3_mean_april_may_last,no2_mean_april_may_last,drought_days_summer_last,growing_degree_days_last
0,2026-02-07,,,,,5.150250,10.481970,26.504999,90.625000,214.951340,...,,3.775,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2399.088
1,2026-02-06,,,,,8.154407,13.668339,35.962498,82.572914,231.659683,...,,3.775,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2399.088
2,2026-02-05,,,,,9.379416,8.606559,25.440001,94.833336,228.125656,...,,3.775,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2399.088
3,2026-02-04,,,,,5.929406,8.294147,23.977499,90.583336,217.392654,...,,3.775,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2399.088
4,2026-02-03,,,,,5.763000,10.280394,30.930000,81.708336,232.840408,...,,3.775,3.258333,15.009851,12098.625,0.231034,68.896552,21.068966,90.0,2399.088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4380,2014-01-05,4.0,Gramíneas,0.0,2014.0,4.987500,12.566667,23.804167,91.958333,227.375000,...,1.1,,,,,,,,,
4381,2014-01-04,3.0,Gramíneas,1.0,2014.0,6.216667,15.016667,29.950000,85.125000,171.166667,...,0.9,,,,,,,,,
4382,2014-01-03,2.0,Gramíneas,0.0,2014.0,5.587500,14.808333,27.833333,87.875000,195.541667,...,2.4,,,,,,,,,
4383,2014-01-02,1.0,Gramíneas,0.0,2014.0,3.341667,15.962500,28.450000,85.958333,171.000000,...,1.6,,,,,,,,,


### Polen

In [4]:
# Pollen history features.
#
# shift() and rolling() operate on row order, not on dates, so they are only
# "lags" / "trailing means" when the rows are in ascending date order. The
# features are therefore computed on a chronologically sorted view of the
# pollen column; assigning the resulting Series back to df re-aligns it on the
# index, so df itself keeps its current row order.
pollen_chrono = df.sort_values('fecha', kind='stable')['granos_de_polen_x_metro_cubico']

# Trailing means over the last 3 and 7 rows (the current row is included).
rolling_chrono = {
    'polen_rolling_mean_3d': pollen_chrono.rolling(window=3, min_periods=1).mean(),
    'polen_rolling_mean_7d': pollen_chrono.rolling(window=7, min_periods=1).mean(),
}
for col, series in rolling_chrono.items():
    df[col] = series

# Pollen lags (1-7 rows back in time).
for i in range(1, 8):
    df[f'polen_lag_{i}'] = pollen_chrono.shift(i)

# Lags of the trailing means (1-7 rows back in time).
for i in range(1, 8):
    for col, series in rolling_chrono.items():
        df[f'{col}_lag_{i}'] = series.shift(i)

### Lag Contaminantes y Meteo (3 días)

In [5]:
# Pollutant and weather columns that get 1-, 2- and 3-row lag features.
cols = [
    'NO2 (ug/m3)', 'O3 (ug/m3)', 'PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'CO (mg/m3)', 'SO2 (ug/m3)',
    'Benceno (ug/m3)', 'Tolueno (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)',
    'temperature_2m (°C)', 'wind_speed_10m (km/h)', 'wind_gusts_10m (km/h)',
    'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'et0_fao_evapotranspiration (mm)',
    'dew_point_2m (°C)', 'rain (mm)', 'vapour_pressure_deficit (kPa)',
    'cloud_cover (%)', 'shortwave_radiation (W/m²)'
]

# shift() works on row order, so the lags are taken from a chronologically
# sorted copy; assignment back to df aligns on the index and leaves df's own
# row order untouched.
df_chrono = df.sort_values('fecha', kind='stable')

for col in cols:
    # Columns missing from the dataset are skipped silently.
    if col in df.columns:
        for i in range(1, 4):
            df[f'{col}_lag_{i}'] = df_chrono[col].shift(i)

### Predicción Meteo (3 días)

In [6]:
# "Forecast" features: the observed temperature, rain and humidity values
# h rows ahead in time (h = 1..3), taken from the dataset itself.
#
# shift(-h) only looks forward in time when rows are in ascending date order,
# so the values are read from a chronologically sorted copy; assignment back to
# df aligns on the index and keeps df's own row order.
df_chrono = df.sort_values('fecha', kind='stable')

for h in [1, 2, 3]:
    df[f'temperature_forecast_t+{h}'] = df_chrono['temperature_2m (°C)'].shift(-h)
    df[f'rain_forecast_t+{h}'] = df_chrono['rain (mm)'].shift(-h)
    df[f'humidity_forecast_t+{h}'] = df_chrono['relative_humidity_2m (%)'].shift(-h)

### Guardar dataframe

In [7]:
# Persist the enriched frame without the index column. Forward slashes keep the
# relative path valid on Windows, Linux and macOS.
# NOTE(review): this is the same CSV the notebook loads at the start, so the
# input file is overwritten with the engineered columns — confirm that is
# intended rather than writing to a separate output file.
df.to_csv("../new_datasets/datos_gramineas.csv", index=False)

### Train Model (t+1)

In [8]:
# Column names fed to the model, assembled group by group. The order of the
# groups (and of the names inside each group) is significant and must not
# change.

# Same-row pollutant and weather measurements.
_RAW_COLUMNS = [
    'NO2 (ug/m3)', 'O3 (ug/m3)', 'PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'CO (mg/m3)', 'SO2 (ug/m3)',
    'Benceno (ug/m3)', 'Tolueno (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)',
    'temperature_2m (°C)', 'wind_speed_10m (km/h)', 'wind_gusts_10m (km/h)',
    'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'et0_fao_evapotranspiration (mm)',
    'dew_point_2m (°C)', 'rain (mm)', 'vapour_pressure_deficit (kPa)',
    'cloud_cover (%)', 'shortwave_radiation (W/m²)',
]

# Aggregates carried over from the previous calendar year.
_MEMORY_COLUMNS = [
    'precip_autumn_last_year', 'precip_winter_last_year',
    'temp_mean_spring_summer_last', 'humidity_sum_spring_summer_last',
    'co2_mean_april_may_last', 'o3_mean_april_may_last',
    'no2_mean_april_may_last', 'drought_days_summer_last',
    'growing_degree_days_last',
]

# Rolling pollen means.
_ROLLING_COLUMNS = ['polen_rolling_mean_3d', 'polen_rolling_mean_7d']

FEATURES = (
    _RAW_COLUMNS
    # 1-3 step lags of every raw column, grouped per column.
    + [f'{name}_lag_{step}' for name in _RAW_COLUMNS for step in range(1, 4)]
    + _MEMORY_COLUMNS
    + _ROLLING_COLUMNS
    # 1-7 step lags of the pollen count itself.
    + [f'polen_lag_{step}' for step in range(1, 8)]
    # 1-7 step lags of the rolling means, grouped per lag step.
    + [f'{name}_lag_{step}' for step in range(1, 8) for name in _ROLLING_COLUMNS]
    # Weather values 1-3 steps ahead, grouped per horizon.
    + [
        f'{prefix}_forecast_t+{horizon}'
        for horizon in (1, 2, 3)
        for prefix in ('temperature', 'rain', 'humidity')
    ]
)

In [9]:
# Fit an XGBoost regressor on every row with a known pollen count and save the
# resulting booster to disk.
TARGET = 'granos_de_polen_x_metro_cubico'

# Discard rows where the target is missing.
df = df.dropna(subset=[TARGET])

# NOTE(review): FEATURES includes the unlagged 'polen_rolling_mean_3d' and
# 'polen_rolling_mean_7d' columns, whose rolling windows contain the current
# row's TARGET value — confirm this is not unintended target leakage.
X_train, y_train = df[FEATURES], df[TARGET]

xgb_params = dict(
    objective='reg:squarederror',
    booster='gbtree',
    base_score=0.5,
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
)
reg = xgb.XGBRegressor(**xgb_params)

reg.fit(X_train, y_train, verbose=100)

# Save the underlying booster as JSON in the current working directory.
reg.get_booster().save_model('modelo_t+1.json')