In [1]:
import datetime
import gc
import glob
import os.path

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import xarray as xr

# All relative paths used below (AQ_stations/, AQ_data/, supplementary/) are
# resolved against this project directory.
os.chdir(
    "/mnt/cloud/wwu1/ec_bronze/_nogroup/ae78a1ca-a0e8-4e4e-8992-69c34947db65/UseCase_AIRCON"
)

# NOTE(review): `new_names` is not referenced in the cells visible here.
new_names = ["Longitude", "Latitude", "Start", "SSR"]

# Station metadata table; the country list is derived from it before it is
# narrowed to the join key and the coordinates.
st = pq.read_table("AQ_stations/EEA_stations_meta_table.parquet").to_pandas()

# `Series.unique()` returns a NumPy array for plain object columns and an
# extension array (e.g. Categorical) otherwise; only the latter offers
# `.sort_values()`. `sorted()` handles both and yields a plain list of codes.
countries = sorted(st["Countrycode"].dropna().unique())
# countries = ['AD', 'AL']

# Keep only the station code (join key) and the station coordinates.
st = st[["Air.Quality.Station.EoI.Code", "Longitude", "Latitude"]]

# Years to process: 2015 through 2023 (the upper bound of `range` is exclusive).
years = range(2015, 2024)

In [2]:
def open_ssr(y):
    """Open all solar-radiation files of one year as a single dataset.

    Parameters
    ----------
    y : int or str
        Year inserted into the glob pattern
        ``supplementary/era5_download/*solar*{y}*``. Any file whose name
        contains ``solar`` followed somewhere by the year matches.

    Returns
    -------
    xarray.Dataset
        The dataset returned by ``xr.open_mfdataset`` for the matching files.
        If it has an ``expver`` dimension, that dimension is collapsed.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern for ``y``.
    """
    pattern = f"supplementary/era5_download/*solar*{y}*"
    ssr_files = sorted(glob.glob(pattern))
    if not ssr_files:
        # Fail with the offending pattern instead of the generic
        # "no files to open" error from open_mfdataset.
        raise FileNotFoundError(f"No solar radiation files match '{pattern}'")

    ssr = xr.open_mfdataset(ssr_files, parallel=True)

    # Sort out expver dimension (for recent ERA5T data)
    # https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
    if "expver" in ssr.dims:
        # NaN-skipping sum across the experiment versions. `min_count=1` keeps
        # a cell NaN when *every* version is NaN; a plain nansum would report
        # such missing cells as 0, i.e. as "no radiation".
        ssr = ssr.sum("expver", skipna=True, min_count=1)
        # print("\nFound experimental version of SSR data (recent dates). Reducing dimension.")
    return ssr


def extract_ssr(c, y, ssr):
    """Attach the nearest SSR value to each hourly record of one country and year.

    Reads ``AQ_data/01_hourly/{c}_hourly.parquet`` restricted to year ``y``,
    joins the station coordinates from the module-level ``st`` table, looks up
    ``ssr.ssr`` at the nearest (longitude, latitude, time) grid point for each
    row, and writes the result to
    ``AQ_data/02_hourly_SSR/{c}_hourly_{y}_gaps_SSR.parquet``.

    Nothing is done when the output file already exists or when the input file
    is missing, so an interrupted run can simply be restarted.

    Parameters
    ----------
    c : str
        Country code used in the input and output file names.
    y : int
        Year to extract; rows with ``{y}-01-01 <= Start < {y+1}-01-01`` are kept.
    ssr : xarray.Dataset
        Dataset with an ``ssr`` variable indexed by ``longitude``, ``latitude``
        and ``time`` (as returned by ``open_ssr``).

    Returns
    -------
    None
    """
    out_pq = f"AQ_data/02_hourly_SSR/{c}_hourly_{y}_gaps_SSR.parquet"
    in_pq = f"AQ_data/01_hourly/{c}_hourly.parquet"

    # Skip finished work and countries without hourly input.
    if os.path.isfile(out_pq) or not os.path.isfile(in_pq):
        return

    # Half-open interval [Jan 1 of y, Jan 1 of y+1). The lower bound must be
    # inclusive, otherwise the record starting at exactly 00:00 on Jan 1 is
    # dropped from every year.
    sel = [
        ("Start", ">=", pd.to_datetime(f"{y}-01-01", utc=False)),
        ("Start", "<", pd.to_datetime(f"{y+1}-01-01", utc=False)),
    ]
    aq = pq.read_table(in_pq, filters=sel).to_pandas()
    # Inner join: records of stations absent from `st` are dropped.
    # NOTE(review): assumes one row per station code in `st`; duplicates there
    # would multiply the hourly rows — confirm.
    aq = pd.merge(aq, st, on="Air.Quality.Station.EoI.Code")

    # Indexers sharing the "points" dimension make `sel` pick one value per
    # row instead of the full lon x lat x time cube.
    target_lon = xr.DataArray(aq["Longitude"].values, dims="points")
    target_lat = xr.DataArray(aq["Latitude"].values, dims="points")
    target_time = xr.DataArray(
        aq["Start"].values.astype("datetime64[ns]"), dims="points"
    )

    # Nearest-neighbour lookup in space and time.
    ssr_ex = ssr.ssr.sel(
        longitude=target_lon,
        latitude=target_lat,
        time=target_time,
        method="nearest",
    )
    ssr_df = ssr_ex.to_series().rename("SSR")

    # Row i of `aq` matches point i of the extraction.
    result = aq.merge(ssr_df, left_index=True, right_on="points")
    result.reset_index(drop=True, inplace=True)
    result.to_parquet(out_pq)

    # Release the large intermediates before the next country is processed.
    del aq, ssr_ex, ssr_df, target_lon, target_lat, target_time
    gc.collect()

In [3]:
#%%time

# Smoke test (disabled): run the extraction for a single country (AD) for every year.
#for y in years:
#    ssr = open_ssr(y)
#    extract_ssr("AD", y, ssr)

In [4]:
# Full run: the radiation dataset is opened once per year and shared by all
# countries; one progress line per year lists the country codes handled so far.
for y in years:
    ssr = open_ssr(y)
    try:
        print(f"\n{y}:", sep=" ", end=" ", flush=True)
        for c in countries:
            extract_ssr(c, y, ssr)
            print(c, sep=" ", end=" ", flush=True)
    finally:
        # Release the year's file handles before opening the next year,
        # even if a country fails part-way through.
        ssr.close()


2015: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2016: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2017: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2018: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2019: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2020: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2021: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK MT NL NO PL PT RO RS SE SI SK TR UA XK 
2022: AD AL AT BA BE BG CH CY CZ DE DK EE ES FI FR GB GE GR HR HU IE IS IT LT LU LV ME MK