# Daily Sentinel-5P TROPOMI NO2 data over Europe

This notebook searches and downloads Sentinel-5P data by accessing the STAC catalogue from the [S5P-PAL Data Portal](https://data-portal.s5p-pal.com/apidoc).
Daily mosaics (level 3) are cropped to Europe and warped to match an existing 1x1 km grid in EPSG:3035. Finally, the daily data is aggregated to monthly and annual means and saved as NetCDF files per year.

In [1]:
import os
import glob
import pystac
import pystac_client
import rasterio
import xarray as xr
import requests
from datetime import datetime
import h5netcdf
import dask
from dask.distributed import Client, LocalCluster, progress

# Working directories and naming used throughout the notebook.
# download_dir: raw products as downloaded (each file is deleted again once it has been processed)
# nc_dir:       one warped NetCDF file per day
# daily_dir / monthly_dir / annual_dir: per-year output files written by aggregate_daily_monthly_annual
# var:          prefix of the per-year output file names
download_dir = "../supplementary/sentinel-5p/download"
nc_dir = "../supplementary/sentinel-5p/nc"
daily_dir = "../supplementary/01_daily"
monthly_dir = "../supplementary/02_monthly"
annual_dir = "../supplementary/03_annual"
var = "s5p_no2"
   
# STAC API endpoint of the S5P-PAL level-3 catalogue; `cat` is the client used for all item searches.
cat_url = "https://data-portal.s5p-pal.com/api/s5p-l3"
cat = pystac_client.Client.open(cat_url)

# Reference raster: the daily data is reprojected to match this grid and takes over its x/y coordinates.
# NOTE(review): the `.rio` accessor used later is not imported explicitly in this notebook; presumably
# it is registered as a side effect of loading the "rasterio" engine here — confirm.
clc = xr.open_dataset("../supplementary/static/CLC_reclass_8_1km.tif", engine="rasterio")

In [2]:
# Create a dask.distributed client with default arguments (no scheduler address is given).
client = Client()

In [3]:
def get_daily_items_per_year(y):
    """Return the STAC items of the daily NO2 level-3 products for year ``y``.

    The search runs against the module-level catalogue client ``cat`` and is
    restricted to items with ``l3:quantity='no2'`` and ``l3:period='day'``.
    At most 370 items are requested.
    """
    search = cat.search(
        datetime=f"{y}",
        filter="l3:quantity='no2' and l3:period='day'",
        max_items=370,
    )
    return search.item_collection()

def process_daily_item(item, verbose=False):
    """Download one daily product, warp it onto the reference grid and save it as NetCDF.

    The output is written to ``{nc_dir}/S5P_L3_NO2_<date>.nc``, where ``<date>``
    is the date part of the item's ``start_datetime`` property. Items whose
    output file already exists with a size of at least 50000 bytes are skipped,
    so the function can be called again to resume an interrupted run.

    Args:
        item: STAC item with a ``start_datetime`` property and a ``product``
            asset that carries ``file:local_path`` and ``file:size``.
        verbose: If true, print the name of each processing step.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        AssertionError: If the size of the downloaded file differs from the
            size announced in the asset metadata.
    """
    start_datetime = item.properties['start_datetime']
    d = datetime.strptime(start_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
    out = f"{nc_dir}/S5P_L3_NO2_{d}.nc"

    # Nothing to do when a plausibly complete output file is already on disk.
    if os.path.isfile(out) and os.path.getsize(out) >= 50000:
        return

    print(f"{d} -", sep=' ', end=' ', flush=True)
    product = item.assets["product"]
    extra_fields = product.extra_fields

    download_url = product.href
    # Look the key up outside the f-string: reusing the enclosing quote
    # character inside an f-string is a syntax error before Python 3.12.
    local_name = extra_fields["file:local_path"]
    product_local_path = f"{download_dir}/{local_name}"
    product_size = extra_fields["file:size"]

    if verbose: print("download -", sep=' ', end=' ', flush=True)
    # Stream to disk in chunks instead of holding the whole product in memory,
    # and fail early on HTTP errors rather than saving an error page.
    with requests.get(download_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(product_local_path, "wb") as product_file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                product_file.write(chunk)

    if verbose: print("check size -", sep=' ', end=' ', flush=True)
    file_size = os.path.getsize(product_local_path)
    if file_size != product_size:
        # Raised explicitly (same exception type as before) so that the check
        # is not stripped when Python runs with -O.
        raise AssertionError(
            f"size mismatch for {product_local_path}: "
            f"got {file_size} bytes, expected {product_size}"
        )

    if verbose: print("warp -", sep=' ', end=' ', flush=True)
    # Keep the source file open only while it is needed so that it is closed
    # before it gets deleted below.
    with xr.open_dataset(product_local_path) as ds:
        ds_wgs = ds.rio.write_crs(rasterio.crs.CRS.from_epsg(4326))
        ds_crop = ds_wgs.rio.clip_box(minx=-25, miny=30, maxx=45, maxy=75)
        ds_etrs = ds_crop.rio.reproject_match(clc).assign_coords({
            "x": clc.x,
            "y": clc.y,
            "time": ds_crop.datetime_start,   # assign datetime as index for concatenation later
        })

        if verbose: print("write -", sep=' ', end=' ', flush=True)
        ds_etrs.NO2_column_number_density.rename("NO2_TROPOMI").to_netcdf(out, engine="h5netcdf")

    # The raw download is no longer needed once the warped file is written.
    os.remove(product_local_path)
    if verbose: print("completed.")
    
def aggregate_daily_monthly_annual(var, y):
    """Set up the lazy NetCDF writes for one year of daily data and its means.

    All per-day files ``{nc_dir}/S5P_L3_NO2_{y}*.nc`` are opened as one dataset
    concatenated along ``time``. Three writes are prepared with
    ``compute=False`` (nothing is written until the returned objects are
    computed): the daily stack, its monthly means and its annual mean.

    Args:
        var: Prefix used in the output file names.
        y: Year to process.

    Returns:
        A tuple ``(agg_d, agg_m, agg_y)`` with the results of the three
        ``to_netcdf(..., compute=False)`` calls, or an empty tuple when the
        daily output file already exists with at least 50000 bytes. Only the
        daily file is checked; the monthly and annual files are not.
    """
    out_d = f"{daily_dir}/{var}_daily_{y}.nc"
    out_m = f"{monthly_dir}/{var}_monthly_{y}.nc"
    out_y = f"{annual_dir}/{var}_annual_{y}.nc"

    # Return an empty tuple (not None) so that callers can always iterate
    # over the result, also when there is nothing to do for this year.
    if os.path.isfile(out_d) and os.path.getsize(out_d) >= 50000:
        return ()

    var_d = xr.open_mfdataset(f"{nc_dir}/S5P_L3_NO2_{y}*.nc", concat_dim="time", combine="nested", chunks='auto')

    agg_d = var_d.to_netcdf(out_d, mode="w", engine="h5netcdf", compute=False)
    agg_m = var_d.resample(time="ME").mean(skipna=True).to_netcdf(out_m, mode="w", engine="h5netcdf", compute=False)
    agg_y = var_d.resample(time="YE").mean(skipna=True).to_netcdf(out_y, mode="w", engine="h5netcdf", compute=False)
    return agg_d, agg_m, agg_y


## Download and process daily data

In [5]:
%%time

years = range(2018, 2024)

for y in years:
    # Query the catalogue for the daily products of this year.
    item_list = get_daily_items_per_year(y)
    print(f"year = {y}; N_items (days) = {len(item_list)}")

    # Download and warp one day after the other.
    for item in item_list:
        process_daily_item(item)


year = 2018; N_items (days) = 245
year = 2019; N_items (days) = 365
year = 2020; N_items (days) = 366
year = 2021; N_items (days) = 365
year = 2022; N_items (days) = 365
year = 2023; N_items (days) = 365
CPU times: user 2.29 s, sys: 575 ms, total: 2.86 s
Wall time: 27.5 s


## Reorganize daily data and aggregate to monthly and annual means

In [6]:
years = range(2018, 2024)
lazy_results = []

for y in years:
    tasks = aggregate_daily_monthly_annual(var, y)
    # A year whose daily file already exists yields no tasks (the result may
    # be None or empty); treat that as "nothing to add" instead of failing
    # when iterating over it.
    lazy_results.extend(tasks or ())
    print(y, sep=' ', end=' ', flush=True)

2018 2019 2020 2021 2022 2023 

In [7]:
# Keep only real tasks, then report how many writes are about to be started.
lazy_results = [task for task in lazy_results if task is not None]
print(len(lazy_results))

# Hand the collected writes to dask and show a progress bar for them.
reorg_res = dask.persist(*lazy_results)
progress(reorg_res)

18


VBox()

In [9]:
# List the per-year daily files in sorted order together with their size in MiB.
fls = sorted(glob.glob(f"{daily_dir}/{var}*"))
for f in fls:
    size_mb = round(os.path.getsize(f) / (1024 * 1024), 1)
    print(f"{f}: {size_mb} mb")

../supplementary/01_daily/s5p_no2_daily_2018.nc: 4025.4 mb
../supplementary/01_daily/s5p_no2_daily_2019.nc: 6080.2 mb
../supplementary/01_daily/s5p_no2_daily_2020.nc: 6344.8 mb
../supplementary/01_daily/s5p_no2_daily_2021.nc: 6173.6 mb
../supplementary/01_daily/s5p_no2_daily_2022.nc: 6305.0 mb
../supplementary/01_daily/s5p_no2_daily_2023.nc: 6322.9 mb


In [12]:
# Open one of the per-year daily files (fourth entry of the sorted list) and display it for inspection.
s5p = xr.open_dataset(fls[3])
s5p