In [1]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
# Directory that holds the raw AEMET files; the path is relative, so it is
# resolved against the notebook's working directory.
aemet_data_path = Path("../data/raw/aemet/")

In [3]:
# Read the raw daily observations CSV into a DataFrame.
raw_daily_csv = aemet_data_path.joinpath("aemet_daily_data.csv")
aemet_data = pd.read_csv(raw_daily_csv)

In [4]:
# Preview the first rows of the raw table (original Spanish column names).
aemet_data.head()

Unnamed: 0,fecha,indicativo,nombre,provincia,altitud,tmed,prec,tmin,horatmin,tmax,...,hrMedia,presMax,horaPresMax,presMin,horaPresMin,hrMax,horaHrMax,hrMin,horaHrMin,sol
0,1998-01-01,9394X,CALATAYUD,ZARAGOZA,582,6.5,0,2.8,07:30,10.2,...,,,,,,,,,,
1,1998-01-01,B434X,PORTOCOLOM,ILLES BALEARS,17,13.0,6,10.3,02:40,15.8,...,90.0,,,,,,,,,
2,1998-01-01,8293X,XÀTIVA,VALENCIA,88,9.1,53,7.4,08:40,10.8,...,,,,,,,,,,
3,1998-01-01,2400E,AUTILLA DEL PINO,PALENCIA,874,0.6,0,-2.9,06:30,4.2,...,87.0,916.8,Varias,911.9,0.0,98.0,07:00,77.0,Varias,
4,1998-01-01,2755X,BENAVENTE,ZAMORA,715,3.0,0,-2.3,07:50,8.2,...,66.0,,,,,88.0,08:25,45.0,Varias,


In [5]:
# Raw AEMET column name -> English column name used in the rest of the notebook.
# The keys are also used to select which raw columns are kept, so their order
# here is the column order of the processed tables.
columns_mapping = {
    "fecha": "date",
    "indicativo": "station_id",
    "nombre": "station_name",
    "provincia": "province",
    "tmed": "temperature_average",
    "tmin": "temperature_min",
    "tmax": "temperature_max",
    "prec": "precipitation",
    "velmedia": "wind_speed_average",
    # NOTE(review): AEMET's `sol` is presumably sunshine duration (hours), not
    # radiation — confirm against the AEMET field description before relying on the name.
    "sol": "solar_radiation",
    "presMax": "pressure_max",
    "presMin": "pressure_min",
    "hrMedia": "humidity_average",
    "hrMax": "humidity_max",
    "hrMin": "humidity_min",
}

In [6]:
# Renamed measurement columns. Later cells parse these to numbers, cast them to
# float32 and aggregate them; identifier columns (date, station, province) are
# deliberately not listed.
numeric_columns = [
    "temperature_average",
    "temperature_min",
    "temperature_max",
    "precipitation",
    "wind_speed_average",
    "solar_radiation",
    "pressure_max",
    "pressure_min",
    "humidity_average",
    "humidity_max",
    "humidity_min",
]

In [7]:
# Raw columns worth keeping: exactly the ones that have an English name.
columns_to_keep = [raw_name for raw_name in columns_mapping]

In [8]:
# Drop every raw column that is not part of the mapping.
aemet_data = aemet_data.loc[:, columns_to_keep]

In [9]:
# Switch from the raw Spanish column names to the English ones.
aemet_data = aemet_data.rename(mapper=columns_mapping, axis="columns")

In [10]:
def convert_to_float(series: pd.Series) -> pd.Series:
    """Parse a column of raw readings into numbers.

    Text values use a decimal comma, which is converted to a decimal point.
    Two marker strings are handled specially: ``"Ip"`` becomes ``0.0`` and
    ``"Acum"`` becomes NaN.

    Elements that are not strings (numbers, None, NaN) are passed through
    untouched. Going through the ``.str`` accessor instead would turn every
    non-string element into NaN, silently discarding real readings whenever a
    column holds a mix of text and numbers.

    Parameters
    ----------
    series : pd.Series
        Column to parse. Columns that already have a numeric dtype are
        returned as they are.

    Returns
    -------
    pd.Series
        Numeric series with the same index as ``series``.

    Raises
    ------
    ValueError
        If a text value is not a number after normalisation
        (raised by ``pd.to_numeric``).
    """
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series)

    def _normalise(value):
        # Leave numbers and missing values alone; only text needs cleaning.
        if not isinstance(value, str):
            return value
        if value == "Ip":
            return "0.0"
        if value == "Acum":
            return np.nan
        return value.replace(",", ".")

    return pd.to_numeric(series.map(_normalise))

In [11]:
# Turn the date column into pandas timestamps.
aemet_data = aemet_data.assign(date=pd.to_datetime(aemet_data["date"]))

In [12]:
# Parse every measurement column that did not load as a number.
# The check is "not numeric" rather than "dtype is object" so that text columns
# stored under another dtype (e.g. a pandas string dtype) are converted too
# instead of reaching the float cast unparsed.
for column in numeric_columns:
    if not pd.api.types.is_numeric_dtype(aemet_data[column]):
        aemet_data[column] = convert_to_float(aemet_data[column])

In [13]:
# Store all measurement columns as 32-bit floats.
aemet_data = aemet_data.astype(dict.fromkeys(numeric_columns, "float32"))

In [14]:
# Preview the table after column selection, renaming and numeric parsing.
aemet_data.head()

Unnamed: 0,date,station_id,station_name,province,temperature_average,temperature_min,temperature_max,precipitation,wind_speed_average,solar_radiation,pressure_max,pressure_min,humidity_average,humidity_max,humidity_min
0,1998-01-01,9394X,CALATAYUD,ZARAGOZA,6.5,2.8,10.2,0.0,1.4,,,,,,
1,1998-01-01,B434X,PORTOCOLOM,ILLES BALEARS,13.0,10.3,15.8,0.6,6.4,,,,90.0,,
2,1998-01-01,8293X,XÀTIVA,VALENCIA,9.1,7.4,10.8,5.3,1.1,,,,,,
3,1998-01-01,2400E,AUTILLA DEL PINO,PALENCIA,0.6,-2.9,4.2,0.0,2.2,,916.799988,911.900024,87.0,98.0,77.0
4,1998-01-01,2755X,BENAVENTE,ZAMORA,3.0,-2.3,8.2,0.0,0.8,,,,66.0,88.0,45.0


In [15]:
# Check that the date column is a datetime and every measurement column is float32.
aemet_data.dtypes

date                   datetime64[ns]
station_id                     object
station_name                   object
province                       object
temperature_average           float32
temperature_min               float32
temperature_max               float32
precipitation                 float32
wind_speed_average            float32
solar_radiation               float32
pressure_max                  float32
pressure_min                  float32
humidity_average              float32
humidity_max                  float32
humidity_min                  float32
dtype: object

In [16]:
# Fraction of missing values per column (mean of the boolean NaN mask).
aemet_data.isna().mean()

date                   0.000000
station_id             0.000000
station_name           0.000000
province               0.000000
temperature_average    0.031719
temperature_min        0.031364
temperature_max        0.031176
precipitation          0.034826
wind_speed_average     0.223662
solar_radiation        0.800896
pressure_max           0.725008
pressure_min           0.725013
humidity_average       0.132318
humidity_max           0.201242
humidity_min           0.201181
dtype: float64

In [17]:
# Province label as it appears in the AEMET data (upper case, mostly without
# accents) -> display name with proper casing, accents and, where present,
# the bilingual form. Both "BALEARES" and "ILLES BALEARS" map to the same name.
province_name_mapping = {
    "A CORUÑA": "A Coruña",
    "ALBACETE": "Albacete",
    "ALICANTE": "Alacant/Alicante",
    "ALMERIA": "Almería",
    "ARABA/ALAVA": "Araba/Álava",
    "ASTURIAS": "Asturias",
    "AVILA": "Ávila",
    "BADAJOZ": "Badajoz",
    "BALEARES": "Illes Balears",
    "BARCELONA": "Barcelona",
    "BIZKAIA": "Bizkaia",
    "BURGOS": "Burgos",
    "CACERES": "Cáceres",
    "CADIZ": "Cádiz",
    "CANTABRIA": "Cantabria",
    "CASTELLON": "Castelló/Castellón",
    "CEUTA": "Ceuta",
    "CIUDAD REAL": "Ciudad Real",
    "CORDOBA": "Córdoba",
    "CUENCA": "Cuenca",
    "GIPUZKOA": "Gipuzkoa",
    "GIRONA": "Girona",
    "GRANADA": "Granada",
    "GUADALAJARA": "Guadalajara",
    "HUELVA": "Huelva",
    "HUESCA": "Huesca",
    "ILLES BALEARS": "Illes Balears",
    "JAEN": "Jaén",
    "LA RIOJA": "La Rioja",
    "LAS PALMAS": "Las Palmas",
    "LEON": "León",
    "LLEIDA": "Lleida",
    "LUGO": "Lugo",
    "MADRID": "Madrid",
    "MALAGA": "Málaga",
    "MELILLA": "Melilla",
    "MURCIA": "Murcia",
    "NAVARRA": "Navarra",
    "OURENSE": "Ourense",
    "PALENCIA": "Palencia",
    "PONTEVEDRA": "Pontevedra",
    "SALAMANCA": "Salamanca",
    "SEGOVIA": "Segovia",
    "SEVILLA": "Sevilla",
    "SORIA": "Soria",
    "STA. CRUZ DE TENERIFE": "Santa Cruz de Tenerife",
    "TARRAGONA": "Tarragona",
    "TERUEL": "Teruel",
    "TOLEDO": "Toledo",
    "VALENCIA": "València/Valencia",
    "VALLADOLID": "Valladolid",
    "ZAMORA": "Zamora",
    "ZARAGOZA": "Zaragoza",
}

In [18]:
# Translate the AEMET province labels to their display names.
# `Series.map` on its own yields NaN for any label missing from the mapping,
# and for every row if this cell is re-run on already-translated names; rows
# with a NaN province are then left out by the groupby aggregation. Falling
# back to the current label keeps those rows and makes the cell safe to re-run.
aemet_data["province"] = (
    aemet_data["province"]
    .map(province_name_mapping)
    .fillna(aemet_data["province"])
)

In [19]:
# One row per province and day: average every measurement over the stations
# of the province. Station identifiers are dropped first because they cannot
# be averaged.
measurements_by_station = aemet_data.drop(columns=["station_id", "station_name"])
province_day_groups = measurements_by_station.groupby(
    by=["province", "date"], as_index=False
)
aemet_daily_data = province_day_groups.mean()

In [20]:
# Roll the daily province series up into weekly bins (pandas "W" frequency),
# one independent time series per province.
weekly_resampler = (
    aemet_daily_data.set_index(keys="date").groupby(by="province").resample("W")
)
aemet_weekly_data = weekly_resampler.agg(
    {
        "temperature_average": "mean",
        "temperature_min": "min",
        "temperature_max": "max",
        "precipitation": "sum",
        "wind_speed_average": "mean",
        "solar_radiation": "mean",
        "pressure_max": "max",
        "pressure_min": "min",
        "humidity_average": "mean",
        "humidity_max": "max",
        "humidity_min": "min",
    }
)
# "sum" returns 0 for a bin without a single precipitation reading, which
# would be indistinguishable from a genuinely dry week. Put NaN back wherever
# nothing was observed, as the other aggregations already do.
weekly_precipitation_readings = weekly_resampler.agg({"precipitation": "count"})[
    "precipitation"
]
aemet_weekly_data["precipitation"] = aemet_weekly_data["precipitation"].where(
    weekly_precipitation_readings > 0
)
aemet_weekly_data = aemet_weekly_data.reset_index()

In [21]:
# Roll the daily province series up into calendar months labelled by their
# first day (pandas "MS" frequency), one independent time series per province.
monthly_resampler = (
    aemet_daily_data.set_index(keys="date").groupby(by="province").resample("MS")
)
aemet_monthly_data = monthly_resampler.agg(
    {
        "temperature_average": "mean",
        "temperature_min": "min",
        "temperature_max": "max",
        "precipitation": "sum",
        "wind_speed_average": "mean",
        "solar_radiation": "mean",
        "pressure_max": "max",
        "pressure_min": "min",
        "humidity_average": "mean",
        "humidity_max": "max",
        "humidity_min": "min",
    }
)
# "sum" returns 0 for a bin without a single precipitation reading, which
# would be indistinguishable from a genuinely dry month. Put NaN back wherever
# nothing was observed, as the other aggregations already do.
monthly_precipitation_readings = monthly_resampler.agg({"precipitation": "count"})[
    "precipitation"
]
aemet_monthly_data["precipitation"] = aemet_monthly_data["precipitation"].where(
    monthly_precipitation_readings > 0
)
aemet_monthly_data = aemet_monthly_data.reset_index()

In [22]:
# Destination directory for the processed tables. It is created up front
# (including missing parents) because the parquet writes below do not create
# directories and would fail on a fresh checkout.
output_path = Path('../data/processed/aemet/')
output_path.mkdir(parents=True, exist_ok=True)

In [23]:
# Persist the daily province table without the DataFrame index.
daily_parquet_file = output_path / "aemet_daily_data.parquet"
aemet_daily_data.to_parquet(daily_parquet_file, index=False)

In [24]:
# Persist the weekly province table without the DataFrame index.
weekly_parquet_file = output_path / "aemet_weekly_data.parquet"
aemet_weekly_data.to_parquet(weekly_parquet_file, index=False)

In [25]:
# Persist the monthly province table without the DataFrame index.
monthly_parquet_file = output_path / "aemet_monthly_data.parquet"
aemet_monthly_data.to_parquet(monthly_parquet_file, index=False)