# Workflow to create and maintain the validation table for CSLC Cal/Val

This notebook demonstrates how to query the public S3 bucket hosting granules associated with bursts identified for CSLC Cal/Val activities.

Specifically the motivation here to access + record the preqrequiste information from each granule in order to virtually perform Cal/Val analyses.

For reference, the following resources were used to help create this notebook:
https://alexwlchan.net/2017/listing-s3-keys/
https://github.com/boto/boto3/issues/1200

## Load prerequisite modules

In [1]:
from copy import deepcopy
from pathlib import Path

import boto3
from botocore import UNSIGNED
from botocore.config import Config

import fsspec

import geopandas as gpd

import h5py

import pandas as pd

import shapely.wkt as wkt

## Static variables that identify S3 paths to data

<div class="alert alert-warning">
Only change IF you know what you are doing (i.e. itentional changes to reflect hypothetical migration of validation data).
</div>

In [2]:
# set S3 path variables
bucket = 'opera-provisional-products'
prefix = 'CSLC/pst_adt_common/az_fm_rate'
with_or_withoutfmrate = 'no_az_fm_rate'
prefix = f'{prefix}/{with_or_withoutfmrate}'

s3_path = f's3://{bucket}'

# track specific version and corresponding static layer
version_num = '1.0'
suffix = '.h5'
id_path = f'identification'

# Set name of output containing all burst products for streaming
validation_bursts = Path(f'noazfmrate_validation_bursts_target_v{version_num}.csv')

validation_csv = Path(f'noazfmrate_table_validation_bursts_target_v{version_num}.csv')

## Load function to query S3 bucket

In [3]:
def get_matching_s3_keys(bucket, prefix='', suffix='', burst_id='', version_num=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix.
    :param suffix: Only fetch keys that end with this suffix.
    :param burst_id: Only fetch keys that match burst_id.
    :param burst_id: Only fetch keys that match version num.
    """
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    # Efficiently capture only matches that share relevant filename prefix
    if isinstance(prefix, str):
        kwargs['Prefix'] = f'{prefix}/{burst_id}/20'

    # Max query for `list_object_v2` is only 1000
    # Need to institute while loop to circumvent
    while True:
        # 'Contents' contains information about the listed objects
        # Sort by last modified in order to get most recent static layer
        resp = s3.list_objects_v2(**kwargs)

        try:
            valid_results = resp['Contents']
        except KeyError: 
            break
            
        get_last_modified = lambda valid_results: int(valid_results['LastModified'].strftime('%s'))
        valid_results = [obj['Key'] for obj in sorted(valid_results, key=get_last_modified)]

        # filter by suffix, version number, and remove legacy products
        valid_results = [i for i in valid_results if i.endswith(suffix)]
        for key in valid_results:
            yield key

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next response, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

    return

## Access or initiate validation table containing links to S3 bucket

In [4]:
# access dataframe from file, if it exists
if validation_csv.is_file():
    df = pd.read_csv(validation_csv)
    validation_bursts_df = gpd.GeoDataFrame(
        df.loc[:, [c for c in df.columns if c != "geometry"]],
        geometry=gpd.GeoSeries.from_wkt(df["geometry"])
        )
else:
    # otherwise, initialize dataframe
    validation_bursts_df = gpd.GeoDataFrame()
    # add placeholder columns
    validation_bursts_df['burst_id'] = None
    validation_bursts_df['date'] = None
    validation_bursts_df['cslc_url'] = None
    validation_bursts_df['cslc_static_url'] = None

## Access premade, static table containing all bursts identified for CSLC Cal/Val activities

In [5]:
# check existence of burst file
if not validation_bursts.is_file():
    raise Exception(f'Expected burst record {validation_bursts.absolute()} '
                    'not found. Check working directory.')

## Query server and build up validation table

In [6]:
# query products on S3 bucket
df_val = pd.read_csv(validation_bursts)

# no static layer expected
cslc_static_url = None

# define temp file name and initiate boto3 client
temp_path = 'tmp_local_burst.h5'
s3_tmp = boto3.client('s3', config=Config(signature_version=UNSIGNED))

for df_ind in df_val.index:
    # get index values
    burst_id = df_val['burst_id'][df_ind]
    cr_network = df_val['cr_network'][df_ind]

    # redefine prefix
    iter_prefix = f'{prefix}/{cr_network}'

    print(f'BurstID: {burst_id}')
    
    for key in get_matching_s3_keys(bucket=bucket,
                                    prefix=iter_prefix,
                                    suffix=suffix,
                                    burst_id=burst_id,
                                    version_num=version_num):
        # only proceed if file not already captured in records
        # and only if there is a valid corresponding static layer
        cslc_url = f'{s3_path}/{key}'

        if cslc_url not in validation_bursts_df['cslc_url'].values.astype(str):
            # get date
            date = key.split('/')[-2]

            # check if geometry for burst already in df
            geom_check = validation_bursts_df['burst_id'] == burst_id
            idx_geo = next(iter(geom_check.index[geom_check]), False)
            if idx_geo != False:
                geometry = validation_bursts_df.loc[idx_geo]['geometry']
            else:
                print(f'BurstID: {burst_id}, cslc_url: {cslc_url}')
                # otherwise, read file to access geometry
                # may need to dwnl raster first to circumvent s3 access issues
                #!#if Path(temp_path).exists():
                #!#    Path(temp_path).unlink()
                #!#s3_tmp.download_file(bucket, key, temp_path)
                # otherwise, read file to access geometry
                s3f = fsspec.open(cslc_url, mode='rb', anon=True,
                                  default_fill_cache=False)
                with h5py.File(s3f.open(),'r') as h5:
                    geometry = h5[f'{id_path}/'
                                  'bounding_polygon'][()].astype(str)
                geometry = wkt.loads(geometry)
                # delete temp file
                #!#Path(temp_path).unlink()

            # create dictionary for this file
            file_dict = {'burst_id': burst_id,
                         'date': date,
                         'cslc_url': cslc_url,
                         'cslc_static_url': cslc_static_url,
                         'geometry': geometry}
            # append to records
            validation_bursts_df = pd.concat([validation_bursts_df, \
                gpd.GeoDataFrame([file_dict])], ignore_index=True)

BurstID: t142_303931_iw3
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_fm_rate/Ecuador/t142_303931_iw3/20141015/t142_303931_iw3_20141015.h5
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_fm_rate/Ecuador/t142_303931_iw3/20141108/t142_303931_iw3_20141108.h5
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_fm_rate/Ecuador/t142_303931_iw3/20141202/t142_303931_iw3_20141202.h5
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_fm_rate/Ecuador/t142_303931_iw3/20141226/t142_303931_iw3_20141226.h5
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_fm_rate/Ecuador/t142_303931_iw3/20150119/t142_303931_iw3_20150119.h5
BurstID: t142_303931_iw3, cslc_url: s3://opera-provisional-products/CSLC/pst_adt_common/az_fm_rate/no_az_

## Save validation table

In [7]:
#removing duplcates
validation_bursts_df  = validation_bursts_df.drop_duplicates(['burst_id', 'date'])

# sort by time
validation_bursts_df = validation_bursts_df.sort_values(by=['burst_id', 'date'], ascending=[True, True]).reset_index(drop=True)

# save table to file
validation_bursts_df.to_csv(validation_csv, index=False)