In [10]:
import pandas as pd
import requests

import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import date

### Get main CSV

In [11]:
CSV_POLEN = r"..\new_datasets\datos_gramineas.csv"
df_master = pd.read_csv(CSV_POLEN)
df_master['fecha'] = pd.to_datetime(df_master['fecha'])

### Get new Polen

In [12]:
URL_POLEN = "https://datos.comunidad.madrid/catalogo/dataset/e608aace-3593-43a3-8c91-02332137fa83/resource/db5e3952-57f5-40f3-bb1d-906eb17aebb1/download/mediciones_polen.json"

response = requests.get(URL_POLEN)
nuevos_datos = response.json()
df_nuevos = pd.DataFrame(nuevos_datos['data'] if 'data' in nuevos_datos else nuevos_datos)

df_nuevos = df_nuevos[df_nuevos['tipo_polinico'] == 'Gramíneas']
df_nuevos = df_nuevos[df_nuevos['captador'] == 'AYTM']
df_nuevos['fecha'] = pd.to_datetime(df_nuevos['fecha_lectura'])
df_polen = df_nuevos[['fecha', 'granos_de_polen_x_metro_cubico']].rename(columns={'granos_de_polen_x_metro_cubico': 'granos_de_polen_x_metro_cubico'})

### Get new Meteo

In [13]:
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

URL_METEO = "https://archive-api.open-meteo.com/v1/archive"
today = date.today().strftime('%Y-%m-%d')
params = {
	"latitude": 40.4165,
	"longitude": -3.7026,
	"hourly": ["temperature_2m", "wind_speed_10m", "wind_gusts_10m", "relative_humidity_2m", "wind_direction_10m", "dew_point_2m", "rain", "vapour_pressure_deficit", 
            "et0_fao_evapotranspiration", "cloud_cover", "shortwave_radiation", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"],
	"start_date": "2026-01-01",
    "end_date": today,
    "timezone": "Europe/Madrid"
}
responses = openmeteo.weather_api(URL_METEO, params=params)

response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_wind_speed_10m = hourly.Variables(1).ValuesAsNumpy()
hourly_wind_gusts_10m = hourly.Variables(2).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(3).ValuesAsNumpy()
hourly_wind_direction_10m = hourly.Variables(4).ValuesAsNumpy()
hourly_dew_point_2m = hourly.Variables(5).ValuesAsNumpy()
hourly_rain = hourly.Variables(6).ValuesAsNumpy()
hourly_vapour_pressure_deficit = hourly.Variables(7).ValuesAsNumpy()
hourly_et0_fao_evapotranspiration = hourly.Variables(8).ValuesAsNumpy()
hourly_cloud_cover = hourly.Variables(9).ValuesAsNumpy()
hourly_shortwave_radiation = hourly.Variables(10).ValuesAsNumpy()
hourly_soil_temperature_0_to_7cm = hourly.Variables(11).ValuesAsNumpy()
hourly_soil_moisture_0_to_7cm = hourly.Variables(12).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end =  pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}

hourly_data["temperature_2m (°C)"] = hourly_temperature_2m
hourly_data["wind_speed_10m (km/h)"] = hourly_wind_speed_10m
hourly_data["wind_gusts_10m (km/h)"] = hourly_wind_gusts_10m
hourly_data["relative_humidity_2m (%)"] = hourly_relative_humidity_2m
hourly_data["wind_direction_10m (°)"] = hourly_wind_direction_10m
hourly_data["dew_point_2m (°C)"] = hourly_dew_point_2m
hourly_data["rain (mm)"] = hourly_rain
hourly_data["vapour_pressure_deficit (kPa)"] = hourly_vapour_pressure_deficit
hourly_data["et0_fao_evapotranspiration (mm)"] = hourly_et0_fao_evapotranspiration
hourly_data["cloud_cover (%)"] = hourly_cloud_cover
hourly_data["shortwave_radiation (W/m²)"] = hourly_shortwave_radiation
hourly_data["soil_temperature_0_to_7cm (°C)"] = hourly_soil_temperature_0_to_7cm
hourly_data["soil_moisture_0_to_7cm (m³/m³)"] = hourly_soil_moisture_0_to_7cm

hourly_dataframe = pd.DataFrame(data = hourly_data)

hourly_dataframe['fecha'] = pd.to_datetime(hourly_dataframe['date']).dt.date
df_meteo = hourly_dataframe.groupby('fecha').mean(numeric_only=True).reset_index()
df_meteo['fecha'] = pd.to_datetime(df_meteo['fecha'])
df_meteo

Coordinates: 40.38664245605469°N -3.67608642578125°E
Elevation: 651.0 m asl
Timezone difference to GMT+0: 3600s


Unnamed: 0,fecha,temperature_2m (°C),wind_speed_10m (km/h),wind_gusts_10m (km/h),relative_humidity_2m (%),wind_direction_10m (°),dew_point_2m (°C),rain (mm),vapour_pressure_deficit (kPa),et0_fao_evapotranspiration (mm),cloud_cover (%),shortwave_radiation (W/m²),soil_temperature_0_to_7cm (°C),soil_moisture_0_to_7cm (m³/m³)
0,2025-12-31,1.5,0.18,2.16,89.752777,360.0,0.0,0.0,0.069987,0.0,100.0,0.0,2.35,0.159
1,2026-01-01,1.189583,3.85612,9.75,91.673927,104.047821,-0.029167,0.004167,0.05694,0.020978,98.375,57.833332,1.060417,0.160167
2,2026-01-02,6.254167,6.514946,14.864999,90.102448,67.777176,4.725,0.029167,0.099137,0.025885,99.666664,53.125,5.689583,0.179125
3,2026-01-03,8.379167,7.491186,17.309999,86.490654,88.539391,6.1875,0.008333,0.160529,0.032128,95.791664,63.75,7.68125,0.201292
4,2026-01-04,7.7,7.171152,18.105,86.508392,49.319302,5.50625,0.3,0.153167,0.03811,82.041664,79.083336,8.1125,0.218958
5,2026-01-05,2.0625,10.650216,26.459999,68.765984,30.456289,-3.308333,0.1125,0.229543,0.047529,41.916668,99.625,4.233333,0.238542
6,2026-01-06,0.308333,4.902383,12.87,62.952877,168.258469,-6.45,0.0,0.257698,0.044548,9.916667,109.083336,0.935417,0.219167
7,2026-01-07,2.10625,7.426102,16.934999,67.897377,255.664413,-3.35,0.0,0.233559,0.037819,38.666668,74.125,1.7125,0.207708
8,2026-01-08,2.910417,10.741467,25.289999,93.418068,236.850037,1.935417,0.0,0.049081,0.015798,78.958336,41.875,2.889583,0.196583
9,2026-01-09,8.04375,16.627234,39.149998,73.536575,259.970612,2.989583,0.004167,0.313644,0.066542,66.625,101.541664,7.260417,0.183875


### Get new Contaminantes

In [14]:
URL_CONTAMINANTES = "https://datos.madrid.es/egob/catalogo/300755-12751586-calidad-aire-tiempo-real-acumula.json"

response = requests.get(URL_CONTAMINANTES)
nuevos_datos = response.json()
df_nuevos = pd.DataFrame(nuevos_datos['data'] if 'data' in nuevos_datos else nuevos_datos)
df_nuevos = pd.DataFrame(df_nuevos['records'].tolist())

df_nuevos_cont = df_nuevos[df_nuevos['ESTACION'] == "8"].copy()

magnitude_map = {
    '1': 'SO2 (ug/m3)', '6': 'CO (mg/m3)', '7': 'NO (ug/m3)', '8': 'NO2 (ug/m3)',
    '9': 'PM2.5 (ug/m3)', '10': 'PM10 (ug/m3)', '12': 'NOx (ug/m3)', '14': 'O3 (ug/m3)',
    '20': 'Tolueno (ug/m3)', '30': 'Benceno (ug/m3)', '42': 'HCT (mg/m3)', '44': 'HCNM (mg/m3)'
}

d_cols = [f'H{i:02d}' for i in range(1, 25)]
v_cols = [f'V{i:02d}' for i in range(1, 25)]

rows = []
for idx, row in df_nuevos_cont.iterrows():
    for d, v in zip(d_cols, v_cols):
        if v in row and row[v] == 'V':
            val = str(row[d]).replace(',', '.')
            hour_num = int(d[1:])
            rows.append({
                'ano': row['ANO'], 
                'mes': row['MES'], 
                'dia': row['DIA'],
                'hora': hour_num,
                'magnitud': row['MAGNITUD'],
                'valor': pd.to_numeric(val, errors='coerce')
            })

# Nueva columna fecha
df_melted = pd.DataFrame(rows)
df_melted['fecha'] = pd.to_datetime(df_melted[['ano', 'mes', 'dia']].rename(
    columns={'ano': 'year', 'mes': 'month', 'dia': 'day'}), errors='coerce')
df_melted = df_melted.dropna(subset=['fecha'])

# Hacer media
df_diario = df_melted.groupby(['ano', 'mes', 'dia', 'magnitud'], as_index=False)['valor'].mean()
df_diario['fecha'] = pd.to_datetime(df_diario[['ano', 'mes', 'dia']].rename(
    columns={'ano': 'year', 'mes': 'month', 'dia': 'day'}
)).dt.date

# Reformatear tabla
df_diario['magnitud'] = df_diario['magnitud'].astype(str).map(magnitude_map)
df_nuevos_cont = df_diario.pivot_table(
    index='fecha', 
    columns='magnitud', 
    values='valor', 
    aggfunc='mean'
).reset_index()
df_nuevos_cont = df_nuevos_cont.round(1)
df_nuevos_cont.columns.name = None
df_nuevos_cont['fecha'] = pd.to_datetime(df_nuevos_cont['fecha'])

### MIX

In [15]:
df_updates = pd.merge(df_polen, df_meteo, on='fecha', how='outer')
df_updates = pd.merge(df_updates, df_nuevos_cont, on='fecha', how='outer')
df_final = pd.concat([df_master, df_updates], ignore_index=True)
df_final = df_final.groupby('fecha', as_index=False).first()

df_final = df_final.sort_values('fecha', ascending=True)

# Solucionar desfase
df_final['granos_de_polen_x_metro_cubico'] = df_final['granos_de_polen_x_metro_cubico'].shift(-1)

### Meteo and contaminantes Predictions

In [16]:
URL_METEO_FORECAST = "https://api.open-meteo.com/v1/forecast"

params_meteo = {
    "latitude": 40.4165,
    "longitude": -3.7026,
    "hourly": [
        "temperature_2m",
        "wind_speed_10m",
        "wind_gusts_10m",
        "relative_humidity_2m",
        "wind_direction_10m",
        "dew_point_2m",
        "rain",
        "vapour_pressure_deficit",
        "et0_fao_evapotranspiration",
        "cloud_cover",
        "shortwave_radiation",
        "soil_temperature_0_to_7cm",
        "soil_moisture_0_to_7cm"
    ],
    "forecast_days": 3,
    "timezone": "Europe/Madrid"
}
responses = openmeteo.weather_api(URL_METEO_FORECAST, params=params_meteo)

response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_wind_speed_10m = hourly.Variables(1).ValuesAsNumpy()
hourly_wind_gusts_10m = hourly.Variables(2).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(3).ValuesAsNumpy()
hourly_wind_direction_10m = hourly.Variables(4).ValuesAsNumpy()
hourly_dew_point_2m = hourly.Variables(5).ValuesAsNumpy()
hourly_rain = hourly.Variables(6).ValuesAsNumpy()
hourly_vapour_pressure_deficit = hourly.Variables(7).ValuesAsNumpy()
hourly_et0_fao_evapotranspiration = hourly.Variables(8).ValuesAsNumpy()
hourly_cloud_cover = hourly.Variables(9).ValuesAsNumpy()
hourly_shortwave_radiation = hourly.Variables(10).ValuesAsNumpy()
hourly_soil_temperature_0_to_7cm = hourly.Variables(11).ValuesAsNumpy()
hourly_soil_moisture_0_to_7cm = hourly.Variables(12).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end =  pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}

hourly_data["temperature_2m (°C)"] = hourly_temperature_2m
hourly_data["wind_speed_10m (km/h)"] = hourly_wind_speed_10m
hourly_data["wind_gusts_10m (km/h)"] = hourly_wind_gusts_10m
hourly_data["relative_humidity_2m (%)"] = hourly_relative_humidity_2m
hourly_data["wind_direction_10m (°)"] = hourly_wind_direction_10m
hourly_data["dew_point_2m (°C)"] = hourly_dew_point_2m
hourly_data["rain (mm)"] = hourly_rain
hourly_data["vapour_pressure_deficit (kPa)"] = hourly_vapour_pressure_deficit
hourly_data["et0_fao_evapotranspiration (mm)"] = hourly_et0_fao_evapotranspiration
hourly_data["cloud_cover (%)"] = hourly_cloud_cover
hourly_data["shortwave_radiation (W/m²)"] = hourly_shortwave_radiation
hourly_data["soil_temperature_0_to_7cm (°C)"] = hourly_soil_temperature_0_to_7cm
hourly_data["soil_moisture_0_to_7cm (m³/m³)"] = hourly_soil_moisture_0_to_7cm

hourly_dataframe = pd.DataFrame(data = hourly_data)

hourly_dataframe['fecha'] = pd.to_datetime(hourly_dataframe['date']).dt.date
df_meteo_pred = hourly_dataframe.groupby('fecha').mean(numeric_only=True).reset_index()
df_meteo_pred['fecha'] = pd.to_datetime(df_meteo_pred['fecha'])
df_meteo_pred

Coordinates: 40.4375°N -3.6875°E
Elevation: 651.0 m asl
Timezone difference to GMT+0: 3600s


Unnamed: 0,fecha,temperature_2m (°C),wind_speed_10m (km/h),wind_gusts_10m (km/h),relative_humidity_2m (%),wind_direction_10m (°),dew_point_2m (°C),rain (mm),vapour_pressure_deficit (kPa),et0_fao_evapotranspiration (mm),cloud_cover (%),shortwave_radiation (W/m²),soil_temperature_0_to_7cm (°C),soil_moisture_0_to_7cm (m³/m³)
0,2026-02-20,6.363,4.379589,6.48,67.0,9.462248,0.680173,0.0,0.317166,0.001809,0.0,0.0,,
1,2026-02-21,9.579667,3.939961,11.474999,55.833332,133.61171,-0.322308,0.0,0.688178,0.100303,3.0,184.125,,
2,2026-02-22,10.452583,3.787788,9.48,54.416668,154.398895,0.396723,0.0,0.727659,0.101121,37.833332,183.208328,,
3,2026-02-23,11.149957,3.587432,10.29913,56.47826,131.682983,1.635345,0.0,0.729019,0.108953,10.521739,196.82608,,


In [17]:
url = "https://air-quality-api.open-meteo.com/v1/air-quality"
params = {
	"latitude": 40.4165,
	"longitude": -3.7026,
	"hourly": ["pm10", "pm2_5", "carbon_monoxide", "nitrogen_dioxide", "sulphur_dioxide", "ozone"],
	"forecast_days": 3,
}
responses = openmeteo.weather_api(url, params=params)

response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()
hourly_pm10 = hourly.Variables(0).ValuesAsNumpy()
hourly_pm2_5 = hourly.Variables(1).ValuesAsNumpy()
hourly_carbon_monoxide = hourly.Variables(2).ValuesAsNumpy()
hourly_nitrogen_dioxide = hourly.Variables(3).ValuesAsNumpy()
hourly_sulphur_dioxide = hourly.Variables(4).ValuesAsNumpy()
hourly_ozone = hourly.Variables(5).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end =  pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}

hourly_data["PM10 (ug/m3)"] = hourly_pm10
hourly_data["PM2.5 (ug/m3)"] = hourly_pm2_5
hourly_data["CO (mg/m3)"] = hourly_carbon_monoxide/1000  # Convertir de µg/m3 a mg/m3
hourly_data["NO2 (ug/m3)"] = hourly_nitrogen_dioxide
hourly_data["SO2 (ug/m3)"] = hourly_sulphur_dioxide
hourly_data["O3 (ug/m3)"] = hourly_ozone

hourly_dataframe = pd.DataFrame(data = hourly_data)
print("\nHourly data\n", hourly_dataframe)

hourly_dataframe['fecha'] = pd.to_datetime(hourly_dataframe['date']).dt.date
df_contaminantes_pred = hourly_dataframe.groupby('fecha').mean(numeric_only=True).reset_index()
df_contaminantes_pred['fecha'] = pd.to_datetime(df_contaminantes_pred['fecha'])
df_contaminantes_pred = df_contaminantes_pred.round(1)

Coordinates: 40.400001525878906°N -3.6999988555908203°E
Elevation: 651.0 m asl
Timezone difference to GMT+0: 0s

Hourly data
                         date  PM10 (ug/m3)  PM2.5 (ug/m3)  CO (mg/m3)  \
0  2026-02-21 00:00:00+00:00     29.100000      26.600000       0.248   
1  2026-02-21 01:00:00+00:00     21.500000      17.700001       0.228   
2  2026-02-21 02:00:00+00:00     14.000000      11.500000       0.176   
3  2026-02-21 03:00:00+00:00     12.100000       8.700000       0.159   
4  2026-02-21 04:00:00+00:00     11.200000       8.400000       0.146   
..                       ...           ...            ...         ...   
67 2026-02-23 19:00:00+00:00     36.000000      30.200001       0.308   
68 2026-02-23 20:00:00+00:00     42.299999      36.500000       0.340   
69 2026-02-23 21:00:00+00:00     44.900002      38.400002       0.328   
70 2026-02-23 22:00:00+00:00     41.900002      36.500000       0.311   
71 2026-02-23 23:00:00+00:00     36.599998      33.099998       0.282  

In [18]:
df_pred_completo = pd.merge(df_meteo_pred, df_contaminantes_pred, on='fecha', how='inner')
fechas_historico = df_final['fecha'].unique()
df_pred_filtrado = df_pred_completo[~df_pred_completo['fecha'].isin(fechas_historico)]
df_final_with_predictions = pd.concat([df_final, df_pred_filtrado], axis=0, ignore_index=True)
df_final_with_predictions.to_csv(r"..\new_datasets\datos_gramineas.csv", index=False)