In [1]:
import pandas as pd
import numpy as np
#import pyarrow as pa
import pyarrow.parquet as pq
import xarray as xr
import glob
import os.path
import datetime
import gc

# Column names kept from the original notebook; not referenced in the cells
# visible in this section.
new_names = ["Longitude","Latitude","DatetimeBegin","SSR"]

# Station metadata; only the country codes are used below.
# NOTE: this relative path is resolved against the working directory at the
# moment the cell runs, i.e. *before* the os.chdir() further down. Re-running
# this cell in the same kernel resolves it against the new directory instead.
st = pq.read_table("../AQ_stations/EEA_stations_meta.parquet").to_pandas()

# Sorted list of distinct country codes.
# Series.unique() returns a NumPy array for plain object/string columns (which
# has no .sort_values()) and a Categorical only for categorical columns, so the
# builtin sorted() is used to work with either dtype. Missing codes are dropped
# because they cannot be ordered against strings.
countries = sorted(st["Countrycode"].dropna().unique())
#countries = ['LU', 'AD']  # small subset for quick test runs
years = range(2015, 2024)  # 2015..2023 inclusive

# All paths used by the following cells are relative to this project directory.
os.chdir("/mnt/cloud/wwu1/ec_bronze/_nogroup/ae78a1ca-a0e8-4e4e-8992-69c34947db65/UseCase_AIRCON")

In [2]:
def extract_ssr(c, y):
    """Attach the ERA5 ``ssr`` value to one country's hourly records for one year.

    Reads the rows of ``AQ_data/01_hourly_gaps/{c}_hourly_2015-2023_gaps.parquet``
    whose ``DatetimeBegin`` falls inside calendar year ``y`` (UTC), looks up the
    nearest grid cell / time step of the ``ssr`` variable in the ERA5 files
    matching ``supplementary/era5_download/*solar*{y}*`` for every row, and
    writes the rows plus a new ``SSR`` column to
    ``AQ_data/02_hourly_SSR/{c}_hourly_{y}_gaps_SSR.parquet``.

    The call is a no-op when the output file already exists or the input file
    is missing, so an interrupted run can simply be restarted.

    Parameters
    ----------
    c : str
        Country code used in the input/output file names.
    y : int
        Calendar year to extract.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If no ERA5 file matches the pattern for year ``y``.
    """
    out_pq = f"AQ_data/02_hourly_SSR/{c}_hourly_{y}_gaps_SSR.parquet"
    in_pq = f"AQ_data/01_hourly_gaps/{c}_hourly_2015-2023_gaps.parquet"

    # Skip work that is already done or cannot be done.
    if os.path.isfile(out_pq) or not os.path.isfile(in_pq):
        return

    # Fail early with a clear message instead of xarray's generic
    # "no files to open" error. Sorting keeps the file order deterministic.
    ssr_file = sorted(glob.glob(f"supplementary/era5_download/*solar*{y}*"))
    if not ssr_file:
        raise FileNotFoundError(
            f"No ERA5 solar radiation file found for {y} in supplementary/era5_download/"
        )

    # Half-open interval [y-01-01, (y+1)-01-01): the lower bound is inclusive so
    # that the record starting exactly at midnight on 1 January is kept; a
    # strict ">" would drop that hour from every yearly file.
    year_start = pd.to_datetime(f"{y}-01-01", utc=True)
    year_end = pd.to_datetime(f"{y+1}-01-01", utc=True)
    sel = [("DatetimeBegin", ">=", year_start), ("DatetimeBegin", "<", year_end)]
    aq = pq.read_table(in_pq, filters=sel).to_pandas()

    # The context manager guarantees the underlying files are closed even if
    # the extraction raises; `del` alone does not release the file handles.
    with xr.open_mfdataset(ssr_file, parallel = True) as ds:
        ssr = ds

        # Sort out expver dimension (for recent ERA5T data)
        # https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
        if 'expver' in ssr.dims:
            ssr = ssr.reduce(np.nansum, 'expver')

        # One (lon, lat, time) triple per record, all sharing the "points"
        # dimension so that .sel() performs point-wise rather than
        # outer-product indexing.
        target_lon = xr.DataArray(aq["Longitude"].values, dims="points")
        target_lat = xr.DataArray(aq["Latitude"].values, dims="points")
        target_time = xr.DataArray(aq["DatetimeBegin"].values.astype("datetime64[ns]"), dims="points")

        # Nearest-neighbour lookup in space and time.
        ssr_ex = ssr.ssr.sel(longitude = target_lon,
                             latitude = target_lat,
                             time = target_time,
                             method = "nearest")
        # Materialise the values while the files are still open.
        ssr_df = ssr_ex.to_series().rename("SSR")

    # Join on the position along "points".
    # NOTE(review): this assumes `aq` carries a default 0..n-1 index after the
    # filtered read — confirm for the input files.
    result = aq.merge(ssr_df, left_index=True, right_on="points")
    result.to_parquet(out_pq)

    # Locals are released on return; the explicit collection keeps memory from
    # building up across the many country/year calls.
    gc.collect()
        


In [3]:
# Run the extraction for every country/year pair. Progress is printed as one
# line per country: the country code first, then each year once its call to
# extract_ssr() has returned.
for c in countries:
    # Leading newline starts a fresh line for this country; the trailing space
    # keeps the years on the same line.
    print(f"\n{c}:", end=" ", flush=True)
    for y in years:
        extract_ssr(c, y)
        print(y, end=" ", flush=True)


AD:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
AL:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
AT:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
BA:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
BE:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
BG:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
CH:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
CY:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
CZ:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
DE:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
DK:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
EE:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
ES:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
FI:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
FR:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
GB:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
GE:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
GI:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
GR:
2015 2016 2017 2018 2019 2020 2021 2022 2023 
HR:
2015 2016 2017 2018 2019 2020 2021 2022 2023 