# Workflow to create and maintain the validation table for CSLC Cal/Val

This notebook demonstrates how to query the public S3 bucket hosting granules associated with bursts identified for CSLC Cal/Val activities.

Specifically the motivation here to access + record the preqrequiste information from each granule in order to virtually perform Cal/Val analyses.

For reference, the following resources were used to help create this notebook:
https://alexwlchan.net/2017/listing-s3-keys/
https://github.com/boto/boto3/issues/1200

## Load prerequisite modules

In [1]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config

import fsspec

import geopandas as gpd

import h5py

import pandas as pd

from pathlib import Path

import shapely.wkt as wkt

## Static variables that identify S3 paths to data

<div class="alert alert-warning">
Only change IF you know what you are doing (i.e. itentional changes to reflect hypothetical migration of validation data).
</div>

In [2]:
# set S3 path variables
bucket = 'opera-pst-rs-pop1'
prefix = 'products/CSLC_S1'
suffix = 'Z.h5'
s3_path = f's3://{bucket}'
DATA_ROOT = 'science/SENTINEL1'
#DATA_ROOT = 'data'
id_path = f'{DATA_ROOT}/identification'

## Load function to query S3 bucket

In [3]:
def get_matching_s3_keys(bucket, prefix='', suffix='', burstId=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix.
    :param suffix: Only fetch keys that end with this suffix.
    :param burstId: Only fetch keys that match burstId.
    """
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects.
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix) \
                 and burstId in key:
                yield key

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next response, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

    return

## Access or initiate validation table containing links to S3 bucket

In [4]:
# access dataframe from file, if it exists
validation_csv = Path('validation_table.csv')
if validation_csv.is_file():
    df = pd.read_csv(validation_csv)
    validation_bursts_df = gpd.GeoDataFrame(
        df.loc[:, [c for c in df.columns if c != "geometry"]],
        geometry=gpd.GeoSeries.from_wkt(df["geometry"])
        )
else:
    # otherwise, initialize dataframe
    validation_bursts_df = gpd.GeoDataFrame()
    # add placeholder columns
    validation_bursts_df['burst_id'] = None
    validation_bursts_df['date'] = None
    validation_bursts_df['cslc_url'] = None
    validation_bursts_df['cslc_static_url'] = None

## Access premade, static table containing all bursts identified for CSLC Cal/Val activities

In [5]:
# read list of bursts used for validation
validation_bursts = Path('validation_bursts.csv')
if validation_bursts.is_file():
    df = pd.read_csv(validation_bursts)
    burstId_list = df['burst_id'].to_list()
else:
    raise Exception(f'Expected burst record {validation_bursts.absolute()} '
                    'not found. Check working directory.')

## Query server and build up validation table

In [6]:
# query products on S3 bucket
for burstId in burstId_list:
    # adjust burst strings to reflect product name convention
    query_burstId = burstId.upper().replace('_','-')
    print(f'Querying server for burst id {burstId}')
    for key in get_matching_s3_keys(bucket=bucket,
                                    prefix=prefix,
                                    suffix=suffix,
                                    burstId=query_burstId):
        # only proceed if file not already captured in records
        # and only if there is a valid corresponding static layer
        cslc_url = f'{s3_path}/{key}'
        cslc_static_url = Path(cslc_url)
        cslc_static_url = str(Path(str(cslc_static_url.parent) + \
                          '_static_layers/' + \
                          cslc_static_url.name[:-3] + '_static_layers.h5'))
        cslc_static_url = cslc_static_url[:3] + '/' + cslc_static_url[3:]

        if cslc_url not in validation_bursts_df['cslc_url'].values.astype(str) \
            and '_v0.0_202303' not in cslc_static_url:
            # get date
            file_path = Path(key)
            filename = file_path.name
            date = filename.split('_')[-3][:8]

            # check if geometry for burst already in df
            geom_check = validation_bursts_df['burst_id'] == burstId
            idx_geo = next(iter(geom_check.index[geom_check]), False)
            if idx_geo != False:
                geometry = validation_bursts_df.loc[idx_geo]['geometry']
            else:
                # otherwise, read file to access geometry
                s3f = fsspec.open(cslc_url, mode='rb', anon=True,
                                  default_fill_cache=False)
                with h5py.File(s3f.open(),'r') as h5:
                    geometry = h5[f'{id_path}/'
                                  'bounding_polygon'][()].astype(str)
                geometry = wkt.loads(geometry)

            # create dictionary for this file
            file_dict = {'burst_id': burstId,
                         'date': date,
                         'cslc_url': cslc_url,
                         'cslc_static_url': cslc_static_url,
                         'geometry': geometry}
            # append to records
            validation_bursts_df = pd.concat([validation_bursts_df, \
                gpd.GeoDataFrame([file_dict])], ignore_index=True)

    print('Df status', validation_bursts_df)

Querying server for burst id t064_135523_iw2
Df status             burst_id      date   
0    t064_135523_iw2  20141221  \
1    t064_135523_iw2  20150114   
2    t064_135523_iw2  20150207   
3    t064_135523_iw2  20150327   
4    t064_135523_iw2  20150502   
..               ...       ...   
289  t064_135523_iw2  20211027   
290  t064_135523_iw2  20211108   
291  t064_135523_iw2  20211120   
292  t064_135523_iw2  20211202   
293  t064_135523_iw2  20211214   

                                              cslc_url   
0    s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...  \
1    s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
2    s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
3    s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
4    s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
..                                                 ...   
289  s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
290  s3://opera-pst-rs-pop1/products/CSLC_S1/OPERA_...   
291  s3://oper

## Save validation table

In [7]:
# sort by time
validation_bursts_df = validation_bursts_df.sort_values(by=['burst_id', 'date'], ascending=[True, True])
# save table to file
validation_bursts_df.to_csv('validation_table.csv', index=False)