In [1]:
import pandas as pd
from pathlib import Path

path = Path("../data/pm10_era5_land_era5_reanalysis_blh.csv")

if not path.exists():
    raise FileNotFoundError(f"File not found: {path.resolve()}")

# Read a small sample to detect possible datetime-like columns
sample = pd.read_csv(path, nrows=10)
date_candidates = [c for c in sample.columns if c.lower() in ("time", "date", "datetime", "timestamp")]

if date_candidates:
    df = pd.read_csv(path, parse_dates=date_candidates, infer_datetime_format=True)
else:
    # fallback: load without parsing dates
    df = pd.read_csv(path)

print("Loaded:", path.name)
print("Shape:", df.shape)
print("\nColumns and dtypes:")
print(df.dtypes)
print("\nFirst 5 rows:")
print(df.head())

  df = pd.read_csv(path, parse_dates=date_candidates, infer_datetime_format=True)


Loaded: pm10_era5_land_era5_reanalysis_blh.csv
Shape: (96432, 741)

Columns and dtypes:
datetime                    datetime64[ns]
pm10_Alto-Adige_AB2                float64
pm10_Alto-Adige_BR1                float64
pm10_Alto-Adige_BX1                float64
pm10_Alto-Adige_BZ4                float64
                                 ...      
wind_v_10m_Veneto_502608           float64
wind_v_10m_Veneto_502609           float64
wind_v_10m_Veneto_502612           float64
wind_v_10m_Veneto_502701           float64
wind_v_10m_Veneto_502720           float64
Length: 741, dtype: object

First 5 rows:
             datetime  pm10_Alto-Adige_AB2  pm10_Alto-Adige_BR1  \
0 2014-01-01 00:00:00                  9.0                 30.0   
1 2014-01-01 01:00:00                 13.0                 40.0   
2 2014-01-01 02:00:00                 13.0                 40.0   
3 2014-01-01 03:00:00                 14.0                 43.0   
4 2014-01-01 04:00:00                 19.0                 57.

In [2]:
# Check units of some variables
t2m_col = [c for c in df.columns if 'temperature_2m' in c][0]
tp_col = [c for c in df.columns if 'total_precipitation' in c][0]

print(f"Temperature ({t2m_col}) first 5 values:")
print(df[t2m_col].head())

print(f"\nPrecipitation ({tp_col}) first 5 values:")
print(df[tp_col].head())

Temperature (temperature_2m_Alto-Adige_AB2) first 5 values:
0    261.36578
1    260.74470
2    260.25165
3    259.71320
4    259.20172
Name: temperature_2m_Alto-Adige_AB2, dtype: float64

Precipitation (total_precipitation_Alto-Adige_AB2) first 5 values:
0   -1.824446e-09
1    0.000000e+00
2    0.000000e+00
3    0.000000e+00
4    0.000000e+00
Name: total_precipitation_Alto-Adige_AB2, dtype: float64


In [3]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [4]:
# first, create 8 df with only the informations of trentinos stations
# then, create the column that indicates the station
# then add the other features
# then rename the features

In [5]:
# Create 8 dataframes for the listed Trentino stations.
# Assumes `df` (the full dataframe) already exists in the notebook.

stations = [
    ("402212", "PIANA ROTALIANA"),
    ("402204", "RIVA GAR"),
    ("402203", "MONTE GAZZA"),
    ("402209", "TRENTO PSC"),
    ("402206", "ROVERETO LGP"),
    ("402211", "TRENTO VBZ"),
    ("402213", "AVIO A22"),
    ("402201", "BORGO VAL"),
]

# find a datetime-like column in df
_dt_candidates = [c for c in df.columns if c.lower() in ("time", "date", "datetime", "timestamp")]
if _dt_candidates:
    dt_col = _dt_candidates[0]
else:
    _dt_candidates = df.select_dtypes(include=["datetime", "datetime64[ns]", "datetimetz"]).columns.tolist()
    dt_col = _dt_candidates[0] if _dt_candidates else next((c for c in df.columns if "time" in c.lower() or "date" in c.lower()), None)

if dt_col is None:
    raise ValueError("No datetime-like column found in df. Ensure df has a time/date column.")

station_dfs = {}
for code, name in stations:
    # select columns that contain the station code plus the datetime column
    code_str = str(code)
    cols = [c for c in df.columns if code_str in c]  # station-specific columns
    if not cols:
        # no columns found for this code - create empty dataframe with datetime and station only
        sub = df[[dt_col]].copy()
    else:
        sub = df[[dt_col] + cols].copy()
    # standardize datetime column name
    sub = sub.rename(columns={dt_col: "datetime"})
    # add station name column (not the code)
    sub["station"] = name
    # reorder: datetime, station, then data columns
    data_cols = [c for c in sub.columns if c not in ("datetime", "station")]
    sub = sub[["datetime", "station"] + data_cols]
    station_dfs[name] = sub

    # also expose a variable named df_<station_name_sanitized> for convenience
    safe = name.strip().lower().replace(" ", "_").replace("-", "_")
    safe = "".join(ch for ch in safe if ch.isalnum() or ch == "_")
    globals()[f"df_{safe}"] = sub

# quick summary
for name, sdf in station_dfs.items():
    print(f"{name}: {sdf.shape[0]} rows, {sdf.shape[1]} cols -> variable: df_{name.strip().lower().replace(' ', '_').replace('-', '_')}")

PIANA ROTALIANA: 96432 rows, 22 cols -> variable: df_piana_rotaliana
RIVA GAR: 96432 rows, 22 cols -> variable: df_riva_gar
MONTE GAZZA: 96432 rows, 22 cols -> variable: df_monte_gazza
TRENTO PSC: 96432 rows, 22 cols -> variable: df_trento_psc
ROVERETO LGP: 96432 rows, 22 cols -> variable: df_rovereto_lgp
TRENTO VBZ: 96432 rows, 22 cols -> variable: df_trento_vbz
AVIO A22: 96432 rows, 22 cols -> variable: df_avio_a22
BORGO VAL: 96432 rows, 22 cols -> variable: df_borgo_val


In [6]:
station_dfs["PIANA ROTALIANA"].head()

Unnamed: 0,datetime,station,pm10_Trentino_402212,Vwind_550_Trentino_402212,Vwind_850_Trentino_402212,Vwind_950_Trentino_402212,blh_Trentino_402212,humidity_550_Trentino_402212,humidity_850_Trentino_402212,humidity_950_Trentino_402212,...,temperature_2m_Trentino_402212,temperature_550_Trentino_402212,temperature_850_Trentino_402212,temperature_950_Trentino_402212,total_precipitation_Trentino_402212,uwind_550_Trentino_402212,uwind_850_Trentino_402212,uwind_950_Trentino_402212,wind_u_10m_Trentino_402212,wind_v_10m_Trentino_402212
0,2014-01-01 00:00:00,PIANA ROTALIANA,22.0,1.192086,0.206464,-0.381735,12.594571,14.923161,43.624295,73.900136,...,261.84625,256.11664,270.87134,275.47534,-1.824446e-09,6.551009,0.721515,0.402997,0.119293,-0.469666
1,2014-01-01 01:00:00,PIANA ROTALIANA,20.0,1.775102,0.121889,-0.384663,12.31566,13.631299,43.624054,72.633224,...,261.17047,256.01196,271.04175,275.38,0.0,7.650313,0.544,0.39353,0.113068,-0.459229
2,2014-01-01 02:00:00,PIANA ROTALIANA,22.0,2.022626,0.28723,-0.398431,12.309229,16.380247,43.281698,73.693506,...,260.67352,255.89466,271.111087,275.420853,0.0,7.904316,0.625748,0.422946,0.089264,-0.467911
3,2014-01-01 03:00:00,PIANA ROTALIANA,20.0,2.27015,0.452572,-0.412199,12.111033,19.129195,42.939341,74.753788,...,260.04718,255.77736,271.180423,275.461707,0.0,8.158318,0.707496,0.452362,0.087341,-0.477798
4,2014-01-01 04:00:00,PIANA ROTALIANA,18.0,2.517673,0.617913,-0.425967,11.918034,21.878143,42.596985,75.81407,...,259.39508,255.66006,271.24976,275.50256,0.0,8.412321,0.789245,0.481778,0.094604,-0.460266


In [7]:
import re
import numpy as np

# Rename columns in all station dataframes created earlier (station_dfs and df_<safe> globals).
# Will map available columns to the requested target names when possible,
# compute wind speed/direction from 10m u/v components, and report which target columns were not found.


# target column names provided by user
target_cols = [c.strip() for c in """Data,Stazione_APPA,PM10_(ug.m-3),Stazione_Meteo_Vicina,ID_Stazione_Meteo_Vicina,Latitudine,Longitudine,Precipitazione_(mm),Temperatura_(°C),Umid_relativa_(%),Direzione_Vento_media_(°),Vel_Vento_media_(m/s),Pressione_Atm_(hPa),Radiaz_Solare_tot_(kJ/m2),BG_Calusco_D_Adda_PM10_(ug.m-3),BG_Calusco_D_Adda_Latitudine,BG_Calusco_D_Adda_Longitudine,BG_Calusco_D_Adda_ID,BG_Osio_Sotto_PM10_(ug.m-3),BG_Osio_Sotto_Latitudine,BG_Osio_Sotto_Longitudine,BG_Osio_Sotto_ID,BG_Treviglio_PM10_(ug.m-3),BG_Treviglio_Latitudine,BG_Treviglio_Longitudine,BG_Treviglio_ID,BG_Via_Garibaldi_PM10_(ug.m-3),BG_Via_Garibaldi_Latitudine,BG_Via_Garibaldi_Longitudine,BG_Via_Garibaldi_ID,BL_Area_Feltrina_PM10_(ug.m-3),BL_Area_Feltrina_Latitudine,BL_Area_Feltrina_Longitudine,BL_Area_Feltrina_ID,BL_Parco_Città_di_Bologna_PM10_(ug.m-3),BL_Parco_Città_di_Bologna_Latitudine,BL_Parco_Città_di_Bologna_Longitudine,BL_Parco_Città_di_Bologna_ID,BL_Pieve_D_Alpago_PM10_(ug.m-3),BL_Pieve_D_Alpago_Latitudine,BL_Pieve_D_Alpago_Longitudine,BL_Pieve_D_Alpago_ID,BS_Palazzo_del_Broletto_PM10_(ug.m-3),BS_Palazzo_del_Broletto_Latitudine,BS_Palazzo_del_Broletto_Longitudine,BS_Palazzo_del_Broletto_ID,BS_Sarezzo_PM10_(ug.m-3),BS_Sarezzo_Latitudine,BS_Sarezzo_Longitudine,BS_Sarezzo_ID,CR_Piazza_Cadorna_PM10_(ug.m-3),CR_Piazza_Cadorna_Latitudine,CR_Piazza_Cadorna_Longitudine,CR_Piazza_Cadorna_ID,FE_Corso_Isonzo_PM10_(ug.m-3),FE_Corso_Isonzo_Latitudine,FE_Corso_Isonzo_Longitudine,FE_Corso_Isonzo_ID,LC_Valmadrera_PM10_(ug.m-3),LC_Valmadrera_Latitudine,LC_Valmadrera_Longitudine,LC_Valmadrera_ID,MN_Ponti_sul_Mincio_PM10_(ug.m-3),MN_Ponti_sul_Mincio_Latitudine,MN_Ponti_sul_Mincio_Longitudine,MN_Ponti_sul_Mincio_ID,MN_Sant_Agnese_PM10_(ug.m-3),MN_Sant_Agnese_Latitudine,MN_Sant_Agnese_Longitudine,MN_Sant_Agnese_ID,MO_Via_Ramesina_PM10_(ug.m-3),MO_Via_Ramesina_Latitudine,MO_Via_Ramesina_Longitude,MO_Via_Ramesina_ID,PD_Alta_Padovana_PM10_(ug.m-3),PD_Alta_Padovana_Latitudine,PD_Alta_Padovana_Longitudine,PD_Alta_Padovana_ID,PD_Arcella_PM10_(ug.m-3),PD_Arcella_Latitudine,PD_Arcella_Longitudine,PD_Arcella_ID,PD_Este_PM10_(ug.m-3),PD_Este_Latitudine,PD_Este_Longitudine,PD_Este_ID,PD_Granze_PM10_(ug.m-3),PD_Granze_Latitudine,PD_Granze_Longitudine,PD_Granze_ID,PR_Via_Saragat_PM10_(ug.m-3),PR_Via_Saragat_Latitudine,PR_Via_Saragat_Longitudine,PR_Via_Saragat_ID,RE_San_Rocco_PM10_(ug.m-3),RE_San_Rocco_Latitudine,RE_San_Rocco_Longitudine,RE_San_Rocco_ID,RO_Largo_Martiri_PM10_(ug.m-3),RO_Largo_Martiri_Latitudine,RO_Largo_Martiri_Longitudine,RO_Largo_Martiri_ID,TV_Conegliano_PM10_(ug.m-3),TV_Conegliano_Latitudine,TV_Conegliano_Longitudine,TV_Conegliano_ID,TV_Mansuè_PM10_(ug.m-3),TV_Mansuè_Latitudine,TV_Mansuè_Longitude,TV_Mansuè_ID,TV_Via_Lancieri_di_Novara_PM10_(ug.m-3),TV_Via_Lancieri_di_Novara_Latitudine,TV_Via_Lancieri_di_Novara_Longitudine,TV_Via_Lancieri_di_Novara_ID,VE_Sacca_Fisola_PM10_(ug.m-3),VE_Sacca_Fisola_Latitudine,VE_Sacca_Fisola_Longitudine,VE_Sacca_Fisola_ID,VE_Via_Tagliamento_PM10_(ug.m-3),VE_Via_Tagliamento_Latitudine,VE_Via_Tagliamento_Longitudine,VE_Via_Tagliamento_ID,VI_Quartiere_Italia_PM10_(ug.m-3),VI_Quartiere_Italia_Latitudine,VI_Quartiere_Italia_Longitudine,VI_Quartiere_Italia_ID,VR_Borgo_Milano_PM10_(ug.m-3),VR_Borgo_Milano_Latitudine,VR_Borgo_Milano_Longitudine,VR_Borgo_Milano_ID,VR_Bosco_Chiesanuova_PM10_(ug.m-3),VR_Bosco_Chiesanuova_Latitudine,VR_Bosco_Chiesanuova_Longitude,VR_Bosco_Chiesanuova_ID,VR_Legnago_PM10_(ug.m-3),VR_Legnago_Latitudine,VR_Legnago_Longitudine,VR_Legnago_ID,VR_San_Bonifacio_PM10_(ug.m-3),VR_San_Bonifacio_Latitudine,VR_San_Bonifacio_Longitude,VR_San_Bonifacio_ID,Humidity_550hPa,Humidity_950hPa,Temperature_550hPa,Temperature_850hPa,Temperature_950hPa,Uwind_550hPa,Uwind_850hPa,Uwind_950hPa,Vwind_550hPa,Vwind_850hPa,Vwind_950hPa,blh_mean_daily""".split(",")]

# Helper mapping from variable base (without station suffix) to target name(s)
base_to_target = {
    "datetime": "Data",
    "station": "Stazione_APPA",
    "pm10": "PM10_(ug.m-3)",
    "temperature_2m": "Temperatura_(°C)",
    "total_precipitation": "Precipitazione_(mm)",
    "humidity_950": "Umid_relativa_(%)",
    "solar_radiation_downwards": "Radiaz_Solare_tot_(kJ/m2)",
    "surface_pressure": "Pressione_Atm_(hPa)",
    "blh": "blh_mean_daily",
    # level-specific and wind-levels handled below
}

level_map = {
    "humidity_550": "Humidity_550hPa",
    "humidity_950": "Humidity_950hPa",
    "temperature_550": "Temperature_550hPa",
    "temperature_850": "Temperature_850hPa",
    "temperature_950": "Temperature_950hPa",
    "uwind_550": "Uwind_550hPa",
    "uwind_850": "Uwind_850hPa",
    "uwind_950": "Uwind_950hPa",
    "Vwind_550": "Vwind_550hPa",
    "Vwind_850": "Vwind_850hPa",
    "Vwind_950": "Vwind_950hPa",
}

# collect union of all columns across station dataframes
all_existing_cols = set()
for sdf in station_dfs.values():
    all_existing_cols.update(sdf.columns.astype(str).tolist())

found_targets = set()
rename_report = {}

# function to extract base var before final _Trentino_ or other suffix
def extract_base(col):
    m = re.match(r"^(?P<base>.+?)_(?:Trentino|Alto-Adige|Lombardia|Veneto|[A-Za-z0-9_-]+)_\d+$", col)
    if m:
        return m.group("base")
    # also try to capture wind_u_10m_ style name
    m2 = re.match(r"^(?P<base>.+?)(_Trentino_\d+)$", col)
    if m2:
        return m2.group("base")
    return col  # fallback: return original

for name, sdf in station_dfs.items():
    ren = {}
    # track if both 10m u and v exist to compute speed/direction
    u_col = v_col = None

    for col in sdf.columns.astype(str).tolist():
        if col == "datetime":
            ren[col] = "Data"
            found_targets.add("Data")
            continue
        if col == "station":
            ren[col] = "Stazione_APPA"
            found_targets.add("Stazione_APPA")
            continue

        base = extract_base(col)

        # direct mapping
        if base in base_to_target:
            ren[col] = base_to_target[base]
            found_targets.add(base_to_target[base])
            continue

        # level-specific mapping
        # Try matching patterns like humidity_550, temperature_850, uwind_550, Vwind_850, blh, etc.
        # Remove trailing station suffix if any
        # normalize base case for Vwind uppercase V
        base_norm = base
        # handle wind 10m components
        if base.startswith("wind_u_10m") or base.startswith("wind_u_10m"):
            u_col = col
            continue
        if base.startswith("wind_v_10m") or base.startswith("wind_v_10m"):
            v_col = col
            continue

        # handle typical prefixes
        if base in level_map:
            ren[col] = level_map[base]
            found_targets.add(level_map[base])
            continue

        # sometimes Vwind spelled with uppercase Vwind_...
        if base in ["Vwind_550", "Vwind_850", "Vwind_950"]:
            ren[col] = level_map[base]
            found_targets.add(level_map[base])
            continue

        # uwind lowercase
        if base in ["uwind_550", "uwind_850", "uwind_950"]:
            ren[col] = level_map[base]
            found_targets.add(level_map[base])
            continue

        # surface pressure present
        if base.startswith("surface_pressure"):
            ren[col] = "Pressione_Atm_(hPa)"
            found_targets.add("Pressione_Atm_(hPa)")
            continue

        # map 2m temp to Temperatura_(°C) already covered; if other unmatched small variables, keep as-is
        # (we won't rename them if no mapping found)
    # compute wind speed and direction if wind_u_10m_ and wind_v_10m_ present under other names
    # find columns that match wind_u_10m_... or wind_v_10m_...
    # search by regex for any col that contains 'wind_u_10m' and station code
    u_col = u_col or next((c for c in sdf.columns if re.search(r"wind_u_10m", c)), None)
    v_col = v_col or next((c for c in sdf.columns if re.search(r"wind_v_10m", c)), None)

    # If both found compute speed & direction and add to dataframe then rename (and optionally drop originals)
    if u_col and v_col:
        # compute speed and meteorological direction (degrees from which wind comes)
        # u = zonal (positive east), v = meridional (positive north)
        u_vals = sdf[u_col].astype(float)
        v_vals = sdf[v_col].astype(float)
        # speed
        speed = np.sqrt(u_vals ** 2 + v_vals ** 2)
        # meteorological wind direction (from which wind is coming), degrees clockwise from north
        # formula: dir = (np.degrees(np.arctan2(-u, -v)) + 360) % 360
        direction = (np.degrees(np.arctan2(-u_vals, -v_vals)) + 360) % 360

        # add columns (names from target list)
        sdf["Vel_Vento_media_(m/s)"] = speed
        sdf["Direzione_Vento_media_(°)"] = direction
        found_targets.add("Vel_Vento_media_(m/s)")
        found_targets.add("Direzione_Vento_media_(°)")

        # we will drop original 10m u/v columns and mark them renamed (if user prefers to keep, comment out the next two lines)
        ren[u_col] = None
        ren[v_col] = None

    # apply renames (filter out None -> drop)
    # build final rename dict excluding None items
    apply_ren = {k: v for k, v in ren.items() if v is not None}
    # perform rename inplace
    if apply_ren:
        sdf.rename(columns=apply_ren, inplace=True)

    # drop any columns explicitly marked None in ren (original wind components)
    to_drop = [k for k, v in ren.items() if v is None]
    if to_drop:
        sdf.drop(columns=to_drop, inplace=True, errors="ignore")

    # record rename summary
    rename_report[name] = {"n_rows": sdf.shape[0], "n_cols": sdf.shape[1], "columns": sdf.columns.tolist()}

# After processing all station_dfs, mark other target columns found if they exist in any dataframe
for t in target_cols:
    for sdf in station_dfs.values():
        if t in sdf.columns:
            found_targets.add(t)
            break

missing_targets = [t for t in target_cols if t not in found_targets]

# Print concise report
print("Renaming completed for station_dfs. Summary (sample):")
for k, v in list(rename_report.items())[:5]:
    print(f"- {k}: {v['n_rows']} rows, {v['n_cols']} cols")

if missing_targets:
    print("\nTarget columns not found in any station dataframe (ignored):")
    for mt in missing_targets:
        print("-", mt)
else:
    print("\nAll target columns were found or created where possible.")

Renaming completed for station_dfs. Summary (sample):
- PIANA ROTALIANA: 96432 rows, 22 cols
- RIVA GAR: 96432 rows, 22 cols
- MONTE GAZZA: 96432 rows, 22 cols
- TRENTO PSC: 96432 rows, 22 cols
- ROVERETO LGP: 96432 rows, 22 cols

Target columns not found in any station dataframe (ignored):
- Stazione_Meteo_Vicina
- ID_Stazione_Meteo_Vicina
- Latitudine
- Longitudine
- Precipitazione_(mm)
- Temperatura_(°C)
- Umid_relativa_(%)
- Pressione_Atm_(hPa)
- Radiaz_Solare_tot_(kJ/m2)
- BG_Calusco_D_Adda_PM10_(ug.m-3)
- BG_Calusco_D_Adda_Latitudine
- BG_Calusco_D_Adda_Longitudine
- BG_Calusco_D_Adda_ID
- BG_Osio_Sotto_PM10_(ug.m-3)
- BG_Osio_Sotto_Latitudine
- BG_Osio_Sotto_Longitudine
- BG_Osio_Sotto_ID
- BG_Treviglio_PM10_(ug.m-3)
- BG_Treviglio_Latitudine
- BG_Treviglio_Longitudine
- BG_Treviglio_ID
- BG_Via_Garibaldi_PM10_(ug.m-3)
- BG_Via_Garibaldi_Latitudine
- BG_Via_Garibaldi_Longitudine
- BG_Via_Garibaldi_ID
- BL_Area_Feltrina_PM10_(ug.m-3)
- BL_Area_Feltrina_Latitudine
- BL_Area_Feltri

In [8]:
station_dfs["PIANA ROTALIANA"].head()

Unnamed: 0,Data,Stazione_APPA,PM10_(ug.m-3),Vwind_550_Trentino_402212,Vwind_850_Trentino_402212,Vwind_950_Trentino_402212,blh_mean_daily,humidity_550_Trentino_402212,humidity_850_Trentino_402212,humidity_950_Trentino_402212,...,temperature_2m_Trentino_402212,temperature_550_Trentino_402212,temperature_850_Trentino_402212,temperature_950_Trentino_402212,total_precipitation_Trentino_402212,uwind_550_Trentino_402212,uwind_850_Trentino_402212,uwind_950_Trentino_402212,Vel_Vento_media_(m/s),Direzione_Vento_media_(°)
0,2014-01-01 00:00:00,PIANA ROTALIANA,22.0,1.192086,0.206464,-0.381735,12.594571,14.923161,43.624295,73.900136,...,261.84625,256.11664,270.87134,275.47534,-1.824446e-09,6.551009,0.721515,0.402997,0.484579,345.748469
1,2014-01-01 01:00:00,PIANA ROTALIANA,20.0,1.775102,0.121889,-0.384663,12.31566,13.631299,43.624054,72.633224,...,261.17047,256.01196,271.04175,275.38,0.0,7.650313,0.544,0.39353,0.472943,346.168201
2,2014-01-01 02:00:00,PIANA ROTALIANA,22.0,2.022626,0.28723,-0.398431,12.309229,16.380247,43.281698,73.693506,...,260.67352,255.89466,271.111087,275.420853,0.0,7.904316,0.625748,0.422946,0.476349,349.19939
3,2014-01-01 03:00:00,PIANA ROTALIANA,20.0,2.27015,0.452572,-0.412199,12.111033,19.129195,42.939341,74.753788,...,260.04718,255.77736,271.180423,275.461707,0.0,8.158318,0.707496,0.452362,0.485716,349.640738
4,2014-01-01 04:00:00,PIANA ROTALIANA,18.0,2.517673,0.617913,-0.425967,11.918034,21.878143,42.596985,75.81407,...,259.39508,255.66006,271.24976,275.50256,0.0,8.412321,0.789245,0.481778,0.469888,348.385018


In [9]:
# Normalizza i nomi delle colonne rimuovendo la parte "_Trentino_<codice>"
# e mappando i nomi base ai nomi finali richiesti.
# Usa le variabili già presenti in notebook: station_dfs e globals df_<safe>.

def _normalize(col: str) -> str:
    # colonne già finali o chiave
    if col in ("Data", "Stazione_APPA", "PM10_(ug.m-3)", "Vel_Vento_media_(m/s)", "Direzione_Vento_media_(°)", "blh_mean_daily"):
        return col

    # rimuovi suffisso _Trentino_<digits> se presente
    base = re.sub(r"_Trentino_\d+$", "", col)
    base = base.rstrip("_")  # pulizia eventuale underscore finale
    bl = base.lower()

    # mapping esplicito
    if bl.startswith("pm10"):
        return "PM10_(ug.m-3)"
    if bl in ("total_precipitation", "precipitation", "precipitation_amount", "total_precipitation_trentino"):
        return "Precipitazione_(mm)"
    if bl in ("temperature_2m", "t2m", "temperature_2m_trentino"):
        return "Temperatura_(°C)"
    if bl.startswith("humidity"):
        m = re.match(r"humidity[_]?(\d+)", bl)
        if m:
            return f"Humidity_{m.group(1)}hPa"
        return "Umid_relativa_(%)"
    if bl.startswith("temperature_") and bl != "temperature_2m":
        m = re.match(r"temperature[_]?(\d+)", bl)
        if m:
            return f"Temperature_{m.group(1)}hPa"
    if bl.startswith("uwind") or bl.startswith("uwind") or bl.startswith("wind_u"):
        m = re.search(r"(\d+)", bl)
        if m:
            return f"Uwind_{m.group(1)}hPa"
    if bl.startswith("vwind") or bl.startswith("Vwind") or bl.startswith("wind_v"):
        m = re.search(r"(\d+)", bl)
        if m:
            return f"Vwind_{m.group(1)}hPa"
    if "surface_pressure" in bl or "pressure" == bl:
        return "Pressione_Atm_(hPa)"
    if "solar" in bl or "radiation" in bl:
        return "Radiaz_Solare_tot_(kJ/m2)"
    if bl in ("blh", "blh_mean_daily"):
        return "blh_mean_daily"
    # se non matcha, ritorna il nome base (senza _Trentino_...) per uniformità
    return base

# Applica la normalizzazione a tutti i dataframe in station_dfs e aggiorna le variabili globali df_<safe>
for name, sdf in station_dfs.items():
    rename_map = {col: _normalize(col) for col in sdf.columns}

    # Se dovessero esserci colonne duplicate dopo la normalizzazione (rare), rendile uniche
    new_names = list(rename_map.values())
    dupes = {x for x in new_names if new_names.count(x) > 1}
    if dupes:
        # suffissa i duplicati con parte dell'originale per non perdere dati
        for orig, new in list(rename_map.items()):
            if new in dupes and orig not in ("Data", "Stazione_APPA"):
                short = re.sub(r".*?([A-Za-z0-9]+)$", r"\1", orig)
                rename_map[orig] = f"{new}_{short}"

    sdf.rename(columns=rename_map, inplace=True)

    # aggiorna dict e variabile globale comoda
    station_dfs[name] = sdf
    safe = name.strip().lower().replace(" ", "_").replace("-", "_")
    safe = "".join(ch for ch in safe if ch.isalnum() or ch == "_")
    globals()[f"df_{safe}"] = sdf

# report conciso
print("Normalizzazione completata. Esempio colonne per stazione:")
for k, v in station_dfs.items():
    print(f"- {k}: {v.columns.tolist()[:20]}")

Normalizzazione completata. Esempio colonne per stazione:
- PIANA ROTALIANA: ['Data', 'Stazione_APPA', 'PM10_(ug.m-3)', 'Vwind_550hPa', 'Vwind_850hPa', 'Vwind_950hPa', 'blh_mean_daily', 'Humidity_550hPa', 'Humidity_850hPa', 'Humidity_950hPa', 'Radiaz_Solare_tot_(kJ/m2)', 'Pressione_Atm_(hPa)', 'Temperatura_(°C)', 'Temperature_550hPa', 'Temperature_850hPa', 'Temperature_950hPa', 'Precipitazione_(mm)', 'Uwind_550hPa', 'Uwind_850hPa', 'Uwind_950hPa']
- RIVA GAR: ['Data', 'Stazione_APPA', 'PM10_(ug.m-3)', 'Vwind_550hPa', 'Vwind_850hPa', 'Vwind_950hPa', 'blh_mean_daily', 'Humidity_550hPa', 'Humidity_850hPa', 'Humidity_950hPa', 'Radiaz_Solare_tot_(kJ/m2)', 'Pressione_Atm_(hPa)', 'Temperatura_(°C)', 'Temperature_550hPa', 'Temperature_850hPa', 'Temperature_950hPa', 'Precipitazione_(mm)', 'Uwind_550hPa', 'Uwind_850hPa', 'Uwind_950hPa']
- MONTE GAZZA: ['Data', 'Stazione_APPA', 'PM10_(ug.m-3)', 'Vwind_550hPa', 'Vwind_850hPa', 'Vwind_950hPa', 'blh_mean_daily', 'Humidity_550hPa', 'Humidity_850hPa',

In [10]:
# Aggiorna i nomi delle stazioni in station_dfs e nelle variabili globali df_<safe>
new_names = [
    "Riva del Garda",
    "Parco S. Chiara",
    "Borgo Valsugana",
    "Rovereto",
    "Via Bolzano",
    "Piana Rotaliana",
    "Riva del Garda",
    "Monte Gaza",
]

# usa la lista `stations` definita in precedenza per mantenere l'ordine/corrispondenza
if len(new_names) != len(stations):
    raise ValueError("La lista new_names deve avere la stessa lunghezza di stations")

new_station_dfs = {}
used_safes = set()

for i, (code, old_name) in enumerate(stations):
    # trova la chiave esatta in station_dfs (tolleranza case-insensitive)
    key = next((k for k in station_dfs.keys() if k.lower() == old_name.lower()), None)
    if key is None:
        raise KeyError(f"Stazione originale non trovata in station_dfs: {old_name}")

    sdf = station_dfs.pop(key)

    new_name = new_names[i]
    # aggiorna il valore nella colonna che contiene il nome della stazione, se presente
    if "Stazione_APPA" in sdf.columns:
        sdf["Stazione_APPA"] = new_name
    elif "station" in sdf.columns:
        sdf["station"] = new_name

    # salva con la nuova chiave
    new_station_dfs[new_name] = sdf

    # aggiorna/crea la variabile globale df_<safe> (evita collisioni rendendo il nome unico se necessario)
    safe = new_name.strip().lower().replace(" ", "_").replace("-", "_")
    safe = "".join(ch for ch in safe if ch.isalnum() or ch == "_")
    if safe in used_safes:
        safe = f"{safe}_{i}"
    used_safes.add(safe)
    globals()[f"df_{safe}"] = sdf

    # rimuovi eventuale variabile globale vecchia se esiste e ha nome diverso
    old_safe = key.strip().lower().replace(" ", "_").replace("-", "_")
    old_safe = "".join(ch for ch in old_safe if ch.isalnum() or ch == "_")
    old_var = f"df_{old_safe}"
    if old_var in globals() and old_var != f"df_{safe}":
        try:
            del globals()[old_var]
        except Exception:
            pass

# assegna il dict aggiornato
station_dfs = new_station_dfs

# report conciso
print("Rinominamento stazioni completato. Nuove chiavi in station_dfs:")
for k in station_dfs.keys():
    print("-", k)

Rinominamento stazioni completato. Nuove chiavi in station_dfs:
- Riva del Garda
- Parco S. Chiara
- Borgo Valsugana
- Rovereto
- Via Bolzano
- Piana Rotaliana
- Monte Gaza


In [11]:
station_dfs["Monte Gaza"].head()

Unnamed: 0,Data,Stazione_APPA,PM10_(ug.m-3),Vwind_550hPa,Vwind_850hPa,Vwind_950hPa,blh_mean_daily,Humidity_550hPa,Humidity_850hPa,Humidity_950hPa,...,Temperatura_(°C),Temperature_550hPa,Temperature_850hPa,Temperature_950hPa,Precipitazione_(mm),Uwind_550hPa,Uwind_850hPa,Uwind_950hPa,Vel_Vento_media_(m/s),Direzione_Vento_media_(°)
0,2014-01-01 00:00:00,Monte Gaza,38.0,0.37747,0.117841,-0.375876,14.344571,16.512354,42.721968,82.085683,...,264.86774,256.187113,271.148113,276.29729,-1.824901e-09,6.064274,-0.069826,-0.801267,0.898279,44.423278
1,2014-01-01 01:00:00,Monte Gaza,41.0,0.915238,0.045229,-0.282612,14.25316,14.24458,42.72171,80.22795,...,264.54742,256.05212,271.28052,276.2328,0.0,6.942794,-0.543891,-0.817896,0.877306,45.178983
2,2014-01-01 02:00:00,Monte Gaza,40.0,1.251141,0.198607,-0.289707,14.965479,15.25069,41.217244,80.314927,...,264.3083,255.955087,271.286787,276.172333,0.0,7.288431,-0.335434,-0.82388,0.913549,46.059095
3,2014-01-01 03:00:00,Monte Gaza,38.0,1.587044,0.351986,-0.296802,15.142283,16.256799,39.712779,80.401903,...,264.01007,255.858053,271.293053,276.111867,0.0,7.634067,-0.126976,-0.829865,0.923893,45.740739
4,2014-01-01 04:00:00,Monte Gaza,31.0,1.922947,0.505364,-0.303897,14.449284,17.262909,38.208313,80.48888,...,263.68805,255.76102,271.29932,276.0514,0.0,7.979704,0.081481,-0.835849,0.911137,45.382674


In [12]:
# Attach to each station dataframe a copy of the original `df` with Trentino station columns removed.
# Merge on the datetime column (station dataframes use "Data").
# Result: station_dfs[name] updated and global df_<safe> variables refreshed.

# identify datetime column in original df
_dt = next((c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c]) or c.lower() in ("time", "date", "datetime", "timestamp")), None)
if _dt is None:
    # fallback: try index if it's datetime-like
    if pd.api.types.is_datetime64_any_dtype(df.index):
        df = df.reset_index().rename(columns={"index": "Data"})
        _dt = "Data"
    else:
        raise ValueError("No datetime-like column found in original df")

# build df without Trentino station columns (keep datetime)
trentino_mask = df.columns.str.contains(r"_Trentino_", case=False, regex=True)
cols_to_keep = [c for c in df.columns if (not trentino_mask[list(df.columns).index(c)])]
df_no_trentino = df[cols_to_keep].copy()

# ensure datetime column is named "Data" to match station_dfs
if _dt != "Data":
    df_no_trentino = df_no_trentino.rename(columns={_dt: "Data"})

# merge into each station dataframe (left join so station rows preserved)
for name, sdf in list(station_dfs.items()):
    merged = sdf.merge(df_no_trentino, on="Data", how="left", suffixes=("", "_orig"))
    # update dict and global variable
    station_dfs[name] = merged
    safe = name.strip().lower().replace(" ", "_").replace("-", "_")
    safe = "".join(ch for ch in safe if ch.isalnum() or ch == "_")
    globals()[f"df_{safe}"] = merged

# concise report
print("Attach completo. Dimensioni esempio per ciascuna stazione:")
for k, v in station_dfs.items():
    print(f"- {k}: {v.shape[0]} rows, {v.shape[1]} cols -> variable: df_{k.strip().lower().replace(' ', '_').replace('-', '_')}")

Attach completo. Dimensioni esempio per ciascuna stazione:
- Riva del Garda: 96432 rows, 602 cols -> variable: df_riva_del_garda
- Parco S. Chiara: 96432 rows, 602 cols -> variable: df_parco_s._chiara
- Borgo Valsugana: 96432 rows, 602 cols -> variable: df_borgo_valsugana
- Rovereto: 96432 rows, 602 cols -> variable: df_rovereto
- Via Bolzano: 96432 rows, 602 cols -> variable: df_via_bolzano
- Piana Rotaliana: 96432 rows, 602 cols -> variable: df_piana_rotaliana
- Monte Gaza: 96432 rows, 602 cols -> variable: df_monte_gaza


In [13]:
# Print all column names for the "Monte Gaza" dataframe (tries station_dfs, then df_monte_gaza, then sdf)
if "station_dfs" in globals() and "Monte Gaza" in station_dfs:
    cols = station_dfs["Monte Gaza"].columns
elif "df_monte_gaza" in globals():
    cols = df_monte_gaza.columns
elif "sdf" in globals():
    cols = sdf.columns
else:
    raise NameError("Monte Gaza dataframe not found (station_dfs['Monte Gaza'] or df_monte_gaza or sdf).")

for col in cols:
    print(col)

Data
Stazione_APPA
PM10_(ug.m-3)
Vwind_550hPa
Vwind_850hPa
Vwind_950hPa
blh_mean_daily
Humidity_550hPa
Humidity_850hPa
Humidity_950hPa
Radiaz_Solare_tot_(kJ/m2)
Pressione_Atm_(hPa)
Temperatura_(°C)
Temperature_550hPa
Temperature_850hPa
Temperature_950hPa
Precipitazione_(mm)
Uwind_550hPa
Uwind_850hPa
Uwind_950hPa
Vel_Vento_media_(m/s)
Direzione_Vento_media_(°)
pm10_Alto-Adige_AB2
pm10_Alto-Adige_BR1
pm10_Alto-Adige_BX1
pm10_Alto-Adige_BZ4
pm10_Alto-Adige_BZ5
pm10_Alto-Adige_LA1
pm10_Alto-Adige_LS1
pm10_Alto-Adige_ME1
pm10_Lombardia_ARPAL_001
pm10_Lombardia_ARPAL_002
pm10_Lombardia_ARPAL_005
pm10_Lombardia_ARPAL_007
pm10_Lombardia_ARPAL_008
pm10_Lombardia_ARPAL_011
pm10_Lombardia_ARPAL_012
pm10_Lombardia_ARPAL_017
pm10_Lombardia_ARPAL_019
pm10_Lombardia_ARPAL_020
pm10_Lombardia_ARPAL_022
pm10_Lombardia_ARPAL_023
pm10_Lombardia_ARPAL_026
pm10_Lombardia_ARPAL_027
pm10_Lombardia_ARPAL_028
pm10_Veneto_502604
pm10_Veneto_502608
pm10_Veneto_502609
pm10_Veneto_502612
pm10_Veneto_502701
pm10_Ven

In [14]:
# from io import StringIO

# # Rename columns in the big merged dataframe (`v`) and in each station dataframe in `station_dfs`
# # using the provided table as a dictionary. Adds station lat/long columns as well.
# # Uses existing notebook variables: v, station_dfs, pd, re


# # raw mapping (stops at ME1 row)
# _csv = """region,station_code,station_name,latitude,longitude
# Veneto,502604,Conegliano,45.8894444,12.3069444
# Veneto,502608,TV Lancieri,45.6716667,12.2377778
# Veneto,502609,Mansue,45.8368944,12.5104083
# Veneto,502612,TV S Agnese,45.6589167,12.2160833
# Veneto,502701,Bissuola,45.4996764,12.2614453
# Veneto,502720,VE Tagliamento,45.4883584,12.2170993
# Lombardia,ARPAL_001,"Bergamo - via Garibaldi PM10",45.6956859557,9.6612593592
# Lombardia,ARPAL_002,"Brescia - Broletto PM10",45.5400637713,10.2228199867
# Lombardia,ARPAL_005,"Calusco d' Adda PM10",45.6904356478,9.4842612412
# Lombardia,ARPAL_007,"Filago - via Don Milani PM10",45.6338721064,9.5560952366
# Lombardia,ARPAL_008,Lallio PM10,45.6596706133,9.6185789755
# Lombardia,ARPAL_011,"Mantova - p.zza Gramsci PM10",45.1517517783,10.781417503
# Lombardia,ARPAL_012,"Mantova - S.Agnese PM10",45.1605765516,10.7955736602
# Lombardia,ARPAL_017,Osio Sotto PM10,45.6205556794,9.6117378218
# Lombardia,ARPAL_019,Parona PM10,45.2819426350,8.7543362218
# Lombardia,ARPAL_020,"Pavia - p.zza Minerva PM10",45.1863357275,9.1466771328
# Lombardia,ARPAL_022,"Sannazzaro de' Burgondi - AGIP  PM10",45.1027739383,8.9041863358
# Lombardia,ARPAL_023,"Sarezzo - via Minelli PM10",45.6496267122,10.2050902188
# Lombardia,ARPAL_026,Valmadrera PM10,45.8422121938,9.3516638635
# Lombardia,ARPAL_027,"Vigevano - via Valletta PM10",45.3036140819,8.8448054859
# Lombardia,ARPAL_028,"Voghera - via Pozzoni PM10",44.9995542701,9.0084485483
# Alto-Adige,AB2,"A22 sud, Termeno, c/o Maso Binnenland",46.34365,11.27933
# Alto-Adige,BR1,"Brunico 1, c/o parcheggio di via Goethe",46.79735,11.94403
# Alto-Adige,BX1,"Bressanone, c/o Villa Adele in Via Stazione",46.71492,11.65408
# Alto-Adige,BZ4,"Bolzano 4-Via C.Augusta",46.48231,11.34183
# Alto-Adige,BZ5,"Bolzano 5-P.zza Adriano",46.49547,11.33997
# Alto-Adige,LA1,"Laces 1-Via Stazione, parcheggio fs",46.61947,10.85906
# Alto-Adige,LS1,"Laives 1, c/o campi sportivi zona Galizia",46.43483,11.34014
# Alto-Adige,ME1,"Merano 1, Via Trogman",46.66200,11.16163
# """

# df_map = pd.read_csv(StringIO(_csv), dtype=str)
# # keep lat/lon as floats
# df_map["latitude"] = pd.to_numeric(df_map["latitude"])
# df_map["longitude"] = pd.to_numeric(df_map["longitude"])

# # build dict keyed by station_code (exact match)
# map_by_code = df_map.set_index("station_code").to_dict(orient="index")

# def _sanitize(s: str) -> str:
#     s = re.sub(r"[^\w\s\-]", "", str(s))                # remove punctuation except underscore/hyphen
#     s = re.sub(r"[\s\-]+", "_", s.strip())             # spaces and hyphens -> underscore
#     s = re.sub(r"_+", "_", s)                          # collapse underscores
#     return s

# # mapping for variable suffixes (style similar to examples)
# var_suffix_map = {
#     "pm10": "PM10_(ug_m-3)",
#     "pm25": "PM25_(ug_m-3)",
#     "wind_u_10m": "Uwind_10m_(m_s-1)",
#     "wind_v_10m": "Vwind_10m_(m_s-1)",
#     "temperature_2m": "Temperatura_2m_(°C)",
#     "t2m": "Temperatura_2m_(°C)",
#     "total_precipitation": "Precipitazione_(mm)",
#     "precipitation": "Precipitazione_(mm)",
#     # fallback will use sanitized base name
# }

# def _var_suffix_for(base: str) -> str:
#     b = base.lower()
#     for k, v in var_suffix_map.items():
#         if b.startswith(k):
#             return v
#     # generic clean fallback
#     return re.sub(r"[^\w]", "_", base).strip("_")

# def build_rename_map_for_columns(cols):
#     rename = {}
#     latlon_to_add = {}  # colname -> (lat_col, lon_col, lat_val, lon_val)
#     pattern = re.compile(r"^(?P<base>.+?)_(?P<region>[A-Za-z\-]+)_(?P<code>.+)$")
#     for col in cols:
#         m = pattern.match(col)
#         if not m:
#             continue
#         base = m.group("base")
#         code = m.group("code")
#         # try exact code lookup first, then fallback to code stripped of trailing digits/underscores
#         info = map_by_code.get(code)
#         if info is None:
#             # sometimes code in cols may include extra trailing digits or different casing; try find any key that endswith code
#             candidates = [k for k in map_by_code.keys() if k.lower() == code.lower()]
#             if candidates:
#                 info = map_by_code[candidates[0]]
#         if info is None:
#             # no mapping found for this column -> skip
#             continue
#         station_name = info["station_name"]
#         lat = info["latitude"]
#         lon = info["longitude"]

#         sanitized = _sanitize(station_name)
#         suffix = _var_suffix_for(base)
#         new_col = f"{code}_{sanitized}_{suffix}"
#         # ensure unique
#         if new_col in rename.values():
#             # append original base tail
#             tail = re.sub(r".*?([A-Za-z0-9]+)$", r"\1", base)
#             new_col = f"{new_col}_{tail}"
#         rename[col] = new_col

#         # prepare lat/lon column names to add (if not present)
#         lat_col = f"{code}_{sanitized}_Latitudine"
#         lon_col = f"{code}_{sanitized}_Longitudine"
#         latlon_to_add[(lat_col, lon_col)] = (lat, lon)
#     return rename, latlon_to_add

# def apply_to_df(df):
#     rename_map, latlon_map = build_rename_map_for_columns(df.columns.astype(str).tolist())
#     if rename_map:
#         df.rename(columns=rename_map, inplace=True)
#     # add lat/lon columns (constant per station) if they do not already exist
#     for (lat_col, lon_col), (lat, lon) in latlon_map.items():
#         if lat_col not in df.columns:
#             df[lat_col] = lat
#         if lon_col not in df.columns:
#             df[lon_col] = lon
#     return rename_map.keys()

# # apply to big merged dataframe `v` if exists
# if "v" in globals():
#     changed_cols = apply_to_df(v)
#     print(f"Renamed {len(changed_cols)} columns in 'v' and added corresponding Lat/Long columns where missing.")

# # apply to each station dataframe inside station_dfs
# if "station_dfs" in globals():
#     for name, sdf in station_dfs.items():
#         changed = apply_to_df(sdf)
#         # update globals var name if exists
#         safe = name.strip().lower().replace(" ", "_").replace("-", "_")
#         safe = "".join(ch for ch in safe if ch.isalnum() or ch == "_")
#         globals()[f"df_{safe}"] = sdf

#     print("Applied renaming and Lat/Long addition to all dataframes in station_dfs.")
# else:
#     print("station_dfs not found in globals; nothing applied there.")

In [15]:
df.head()

Unnamed: 0,datetime,pm10_Alto-Adige_AB2,pm10_Alto-Adige_BR1,pm10_Alto-Adige_BX1,pm10_Alto-Adige_BZ4,pm10_Alto-Adige_BZ5,pm10_Alto-Adige_LA1,pm10_Alto-Adige_LS1,pm10_Alto-Adige_ME1,pm10_Lombardia_ARPAL_001,...,wind_v_10m_Trentino_402209,wind_v_10m_Trentino_402211,wind_v_10m_Trentino_402212,wind_v_10m_Trentino_402213,wind_v_10m_Veneto_502604,wind_v_10m_Veneto_502608,wind_v_10m_Veneto_502609,wind_v_10m_Veneto_502612,wind_v_10m_Veneto_502701,wind_v_10m_Veneto_502720
0,2014-01-01 00:00:00,9.0,30.0,17.0,39.0,16.0,49.0,28.0,15.0,37.39,...,-0.477478,-0.477478,-0.469666,-1.213806,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
1,2014-01-01 01:00:00,13.0,40.0,20.0,31.0,18.0,49.0,43.0,12.0,46.61,...,-0.483643,-0.483643,-0.459229,-1.221924,-1.493408,-1.589111,-1.736572,-1.589111,-1.559815,-1.237549
2,2014-01-01 02:00:00,13.0,40.0,20.0,31.0,18.0,49.0,70.0,12.0,55.83,...,-0.516739,-0.516739,-0.467911,-1.254044,-1.522598,-1.641739,-1.81752,-1.641739,-1.556778,-1.210098
3,2014-01-01 03:00:00,14.0,43.0,24.0,38.0,20.0,48.0,65.0,15.0,57.69,...,-0.538345,-0.538345,-0.477798,-1.291275,-1.510025,-1.599869,-1.780533,-1.599869,-1.484634,-1.127212
4,2014-01-01 04:00:00,19.0,57.0,45.0,72.0,27.0,42.0,52.0,29.0,59.55,...,-0.541321,-0.541321,-0.460266,-1.318665,-1.474915,-1.527649,-1.729797,-1.527649,-1.346985,-0.98761


In [16]:
# Combine all station dataframes into a single DataFrame with MultiIndex (Data, Stazione_APPA).
# Uses existing `station_dfs` in the notebook.

# ensure expected column exists
_any = next(iter(station_dfs.values()))
if "Data" not in _any.columns or "Stazione_APPA" not in _any.columns:
    raise KeyError("Expected each station dataframe to contain 'Data' and 'Stazione_APPA' columns.")

# concat and set MultiIndex (date, station)
df_stations = pd.concat(list(station_dfs.values()), ignore_index=True)
df_stations.set_index(["Data", "Stazione_APPA"], inplace=True)
df_stations.sort_index(inplace=True)

# expose variable name for convenience
globals()["df_stations"] = df_stations

# display a quick summary
print(df_stations.shape)
df_stations.head()

(675024, 600)


Unnamed: 0_level_0,Unnamed: 1_level_0,PM10_(ug.m-3),Vwind_550hPa,Vwind_850hPa,Vwind_950hPa,blh_mean_daily,Humidity_550hPa,Humidity_850hPa,Humidity_950hPa,Radiaz_Solare_tot_(kJ/m2),Pressione_Atm_(hPa),...,wind_v_10m_Lombardia_ARPAL_023,wind_v_10m_Lombardia_ARPAL_026,wind_v_10m_Lombardia_ARPAL_027,wind_v_10m_Lombardia_ARPAL_028,wind_v_10m_Veneto_502604,wind_v_10m_Veneto_502608,wind_v_10m_Veneto_502609,wind_v_10m_Veneto_502612,wind_v_10m_Veneto_502701,wind_v_10m_Veneto_502720
Data,Stazione_APPA,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-01-01,Borgo Valsugana,2.0,1.119332,-0.578367,-0.846091,11.938321,15.462224,36.544512,70.798897,0.0,88678.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
2014-01-01,Monte Gaza,38.0,0.37747,0.117841,-0.375876,14.344571,16.512354,42.721968,82.085683,0.0,86369.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
2014-01-01,Parco S. Chiara,23.0,0.810575,-0.018797,-1.115622,20.00082,15.01626,33.636798,76.419017,0.0,93343.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
2014-01-01,Piana Rotaliana,33.0,0.775908,0.035321,-0.35374,13.532071,15.499984,33.98216,82.258859,0.0,90865.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
2014-01-01,Riva del Garda,14.0,0.810575,-0.018797,-1.115622,20.00082,15.01626,33.636798,76.419017,0.0,89833.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048


In [17]:
# sort df_stations by station (Stazione_APPA) then by date (Data)
if "df_stations" not in globals():
    raise NameError("df_stations not found in the notebook")

if not isinstance(df_stations.index, pd.MultiIndex):
    raise ValueError("df_stations must have a MultiIndex with levels ['Data', 'Stazione_APPA']")

# sort by station first, then by date
df_stations = df_stations.sort_index(level=["Stazione_APPA", "Data"])
globals()["df_stations"] = df_stations

# show a quick check
df_stations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PM10_(ug.m-3),Vwind_550hPa,Vwind_850hPa,Vwind_950hPa,blh_mean_daily,Humidity_550hPa,Humidity_850hPa,Humidity_950hPa,Radiaz_Solare_tot_(kJ/m2),Pressione_Atm_(hPa),...,wind_v_10m_Lombardia_ARPAL_023,wind_v_10m_Lombardia_ARPAL_026,wind_v_10m_Lombardia_ARPAL_027,wind_v_10m_Lombardia_ARPAL_028,wind_v_10m_Veneto_502604,wind_v_10m_Veneto_502608,wind_v_10m_Veneto_502609,wind_v_10m_Veneto_502612,wind_v_10m_Veneto_502701,wind_v_10m_Veneto_502720
Data,Stazione_APPA,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-01-01 00:00:00,Borgo Valsugana,2.0,1.119332,-0.578367,-0.846091,11.938321,15.462224,36.544512,70.798897,0.0,88678.2,...,-1.255798,-1.852478,-0.162048,1.582092,-1.469666,-1.51947,-1.667908,-1.51947,-1.627869,-1.287048
2014-01-01 01:00:00,Borgo Valsugana,3.0,1.638383,-0.594175,-0.851948,11.81566,15.023877,36.543976,71.95744,0.0,88645.055,...,-1.16333,-1.840088,-0.024658,1.54956,-1.493408,-1.589111,-1.736572,-1.589111,-1.559815,-1.237549
2014-01-01 02:00:00,Borgo Valsugana,3.0,1.709963,-0.516806,-0.872064,11.902979,18.6947,38.139771,73.813622,0.0,88659.5,...,-1.177872,-1.840958,0.195175,1.56041,-1.522598,-1.641739,-1.81752,-1.641739,-1.556778,-1.210098
2014-01-01 03:00:00,Borgo Valsugana,3.0,1.781543,-0.439437,-0.892179,11.767283,22.365523,39.735565,75.669804,0.0,88638.41,...,-1.256119,-1.842056,0.305405,1.557358,-1.510025,-1.599869,-1.780533,-1.599869,-1.484634,-1.127212
2014-01-01 04:00:00,Borgo Valsugana,3.0,1.853123,-0.362067,-0.912295,11.605534,26.036346,41.33136,77.525986,0.0,88614.64,...,-1.342102,-1.821594,0.10321,1.435242,-1.474915,-1.527649,-1.729797,-1.527649,-1.346985,-0.98761


In [18]:
for col in df_stations.columns:
    print(col)

PM10_(ug.m-3)
Vwind_550hPa
Vwind_850hPa
Vwind_950hPa
blh_mean_daily
Humidity_550hPa
Humidity_850hPa
Humidity_950hPa
Radiaz_Solare_tot_(kJ/m2)
Pressione_Atm_(hPa)
Temperatura_(°C)
Temperature_550hPa
Temperature_850hPa
Temperature_950hPa
Precipitazione_(mm)
Uwind_550hPa
Uwind_850hPa
Uwind_950hPa
Vel_Vento_media_(m/s)
Direzione_Vento_media_(°)
pm10_Alto-Adige_AB2
pm10_Alto-Adige_BR1
pm10_Alto-Adige_BX1
pm10_Alto-Adige_BZ4
pm10_Alto-Adige_BZ5
pm10_Alto-Adige_LA1
pm10_Alto-Adige_LS1
pm10_Alto-Adige_ME1
pm10_Lombardia_ARPAL_001
pm10_Lombardia_ARPAL_002
pm10_Lombardia_ARPAL_005
pm10_Lombardia_ARPAL_007
pm10_Lombardia_ARPAL_008
pm10_Lombardia_ARPAL_011
pm10_Lombardia_ARPAL_012
pm10_Lombardia_ARPAL_017
pm10_Lombardia_ARPAL_019
pm10_Lombardia_ARPAL_020
pm10_Lombardia_ARPAL_022
pm10_Lombardia_ARPAL_023
pm10_Lombardia_ARPAL_026
pm10_Lombardia_ARPAL_027
pm10_Lombardia_ARPAL_028
pm10_Veneto_502604
pm10_Veneto_502608
pm10_Veneto_502609
pm10_Veneto_502612
pm10_Veneto_502701
pm10_Veneto_502720
Vwind_55

In [19]:
stations = {
    "_Alto-Adige_AB2": "_A22_sud,_Termeno,_c/o_Maso_Binnenland",
    "_Alto-Adige_BR1": "_Brunico_1,_c/o_parcheggio_di_via_Goethe",
    "_Alto-Adige_BX1": "_Bressanone,_c/o_Villa_Adele_in_Via_Stazione",
    "_Alto-Adige_BZ4": "_Bolzano_4-Via_C.Augusta",
    "_Alto-Adige_BZ5": "_Bolzano_5-P.zza_Adriano",
    "_Alto-Adige_LA1": "_Laces_1-Via_Stazione,_parcheggio_fs",
    "_Alto-Adige_LS1": "_Laives_1,_c/o_campi_sportivi_zona_Galizia",
    "_Alto-Adige_ME1": "_Merano_1,_Via_Trogman",

    "_Lombardia_ARPAL_001": "_Bergamo-via_Garibaldi_PM10",
    "_Lombardia_ARPAL_002": "_Brescia-Broletto_PM10",
    "_Lombardia_ARPAL_005": "_Calusco_d'Adda_PM10",
    "_Lombardia_ARPAL_007": "_Filago-via_Don_Milani_PM10",
    "_Lombardia_ARPAL_008": "_Lallio_PM10",
    "_Lombardia_ARPAL_011": "_Mantova-p.zza_Gramsci_PM10",
    "_Lombardia_ARPAL_012": "_Mantova-S.Agnese_PM10",
    "_Lombardia_ARPAL_017": "_Osio_Sotto_PM10",
    "_Lombardia_ARPAL_019": "_Parona_PM10",
    "_Lombardia_ARPAL_020": "_Pavia-p.zza_Minerva_PM10",
    "_Lombardia_ARPAL_022": "_Sannazzaro_de'_Burgondi-AGIP_PM10",
    "_Lombardia_ARPAL_023": "_Sarezzo-via_Minelli_PM10",
    "_Lombardia_ARPAL_026": "_Valmadrera_PM10",
    "_Lombardia_ARPAL_027": "_Vigevano-via_Valletta_PM10",
    "_Lombardia_ARPAL_028": "_Voghera-via_Pozzoni_PM10",

    "_Veneto_502604": "_Conegliano",
    "_Veneto_502608": "_TV_Lancieri",
    "_Veneto_502609": "_Mansue",
    "_Veneto_502612": "_TV_S_Agnese",
    "_Veneto_502701": "_Bissuola",
    "_Veneto_502720": "_VE_Tagliamento",
}
def rename_columns_by_mapping(df: pd.DataFrame, mapping: dict, inplace: bool = True) -> pd.DataFrame:
    """
    Rename dataframe columns by replacing any occurrence of a mapping key (code string)
    with its mapped value (name string).

    - Searches for both the exact key (e.g. "_Alto-Adige_AB2") and the key without
      a leading underscore (e.g. "Alto-Adige_AB2") to be a bit more robust.
    - Replaces all occurrences inside the column name (preserves other text).
    - Returns a new DataFrame unless inplace=True.

    Example:
      df = rename_columns_by_mapping(df, stations)
    """
    # sort keys longest-first to avoid partial-match issues
    keys = sorted(mapping.keys(), key=len, reverse=True)

    rename_map = {}
    for col in df_stations.columns:
        new_col = col
        for key in keys:
            # try exact key first
            if key in new_col:
                new_col = new_col.replace(key, mapping[key])
            else:
                # also handle case where column contains the key without leading underscore
                key_no_underscore = key.lstrip('_')
                if key_no_underscore in new_col:
                    new_col = new_col.replace(key_no_underscore, mapping[key])
        if new_col != col:
            rename_map[col] = new_col

    if inplace:
        df_stations.rename(columns=rename_map, inplace=True)
        return df_stations
    else:
        return df_stations.rename(columns=rename_map)
    

df_stations = rename_columns_by_mapping(df_stations, stations)

In [20]:
print(df_stations.shape)

for col in df_stations.columns:
    print(col)


(675024, 600)
PM10_(ug.m-3)
Vwind_550hPa
Vwind_850hPa
Vwind_950hPa
blh_mean_daily
Humidity_550hPa
Humidity_850hPa
Humidity_950hPa
Radiaz_Solare_tot_(kJ/m2)
Pressione_Atm_(hPa)
Temperatura_(°C)
Temperature_550hPa
Temperature_850hPa
Temperature_950hPa
Precipitazione_(mm)
Uwind_550hPa
Uwind_850hPa
Uwind_950hPa
Vel_Vento_media_(m/s)
Direzione_Vento_media_(°)
pm10_A22_sud,_Termeno,_c/o_Maso_Binnenland
pm10_Brunico_1,_c/o_parcheggio_di_via_Goethe
pm10_Bressanone,_c/o_Villa_Adele_in_Via_Stazione
pm10_Bolzano_4-Via_C.Augusta
pm10_Bolzano_5-P.zza_Adriano
pm10_Laces_1-Via_Stazione,_parcheggio_fs
pm10_Laives_1,_c/o_campi_sportivi_zona_Galizia
pm10_Merano_1,_Via_Trogman
pm10_Bergamo-via_Garibaldi_PM10
pm10_Brescia-Broletto_PM10
pm10_Calusco_d'Adda_PM10
pm10_Filago-via_Don_Milani_PM10
pm10_Lallio_PM10
pm10_Mantova-p.zza_Gramsci_PM10
pm10_Mantova-S.Agnese_PM10
pm10_Osio_Sotto_PM10
pm10_Parona_PM10
pm10_Pavia-p.zza_Minerva_PM10
pm10_Sannazzaro_de'_Burgondi-AGIP_PM10
pm10_Sarezzo-via_Minelli_PM10
pm10

In [21]:
lista = []

for col in df_stations.columns:
    if "Trentino" in col:
        lista.append(col)

df_stations = df_stations.drop(columns=lista)

In [22]:
# print(df_stations.shape)


# for col in df_stations.columns:
#     print(col)


In [None]:
# df_stations.to_csv("../data/pm10_era5_land_era5_reanalysis_blh_final.csv")

: 