### This notebook demonstrates how to download the OPERA DSWx-HLS validation dataset
In particular, the notebook will do the following things:
- Download Planet data used to validate the DSWx-HLS data product and crop it to the appropriate extent
- For each cropped Planet scene, download the hand-drawn binary water classification
- For each Planet scene, download coincident OPERA DSWx-HLS data

#### NOTES BEFORE RUNNING
1. This notebook expects a co-located '.env' file containing a Planet API key in the format
> PLANET_API_KEY='[key]'

2. This notebook will take a *significant* amount of time to execute (~6-12 hours or more) due to the large volume of data that needs to be downloaded
3. If the notebook fails at the data download stage, re-run the cell. The notebook will resume download from the last successfully downloaded scene

In [None]:
# gis imports
import geopandas as gpd
import rasterio
import rasterio.mask
from rasterio.plot import show
from rasterio.warp import transform_bounds

# planet api imports
from planet import api
from planet.api import downloader
from planet.api.downloader import create

# misc imports
import os
from pathlib import Path
from dotenv import dotenv_values
from tools import addImageCalc
from pathlib import Path

# data science imports
import matplotlib.pyplot as plt
import pandas as pd

# aws imports
import boto3
from botocore.handlers import disable_signing

# pySTAC imports
from pystac_client import Client

# Set for the whole process before any S3 path is read.
# NOTE(review): presumably this lets the GDAL-backed readers behind
# gpd.read_file access the public s3:// bucket without credentials - confirm.
os.environ["AWS_NO_SIGN_REQUEST"] = "YES"

In [None]:
# Chip (site) IDs processed in this notebook.
# These should be chip_ids for which hand classifications were made.

# The validation table is read from a path relative to the notebook's working directory
df = pd.read_csv('../data/validation_table.csv')
# Unique values of the site_name column; each one is passed to main() later in the notebook
chip_ids = df.site_name.unique()
print(chip_ids)

In [None]:
# Preview the first rows of the validation table
df.head()

In [None]:
# Planet API key, read from the co-located .env file
PLANET_API_KEY = dotenv_values()['PLANET_API_KEY']

# boto3 handles for S3: one resource and one low-level client
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# Disable request signing on both the standalone client and the client that
# backs the resource
for _unsigned_client in (s3_client, s3.meta.client):
    _unsigned_client.meta.events.register('choose-signer.s3.*', disable_signing)

In [None]:
# Load the image table from the calval database bucket and preview it
imageTable = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
imageTable.head()

In [None]:
# Load the image_calc table from the calval database bucket and preview it
image_calcs = gpd.read_file('s3://opera-calval-database-dswx/image_calc.geojson')
image_calcs.head()

In [None]:
# NOTE(review): this reads the same image.geojson already loaded as imageTable,
# and df_images is reassigned in the following cell - this read looks redundant.
df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')

In [None]:
# Load the site table and preview it.
# NOTE(review): the result is stored in df_images although it holds site.geojson,
# replacing the image table assigned to that name in the previous cell.
df_images = gpd.read_file('s3://opera-calval-database-dswx/site.geojson')
df_images.head()

In [None]:
# Helper functions

# Given a chip_id, download corresponding Planet imagery
def download_planet_imagery(chip_id):
    """
    Download the PlanetScope imagery associated with a validation chip.

    The chip is looked up by ``site_name`` in the image table at
    s3://opera-calval-database-dswx/image.geojson (rows containing any missing
    value are dropped first) and the first matching ``image_name`` is used as
    the Planet id. The scene's surface-reflectance image and XML metadata are
    then activated and downloaded into ``../data/<planet id>/``. The 8-band
    assets are requested when the item lists ``ortho_analytic_8b_sr``;
    otherwise the 4-band assets are requested.

    If a file matching ``<planet id>_*AnalyticMS*.tif`` already exists in the
    download directory, nothing is downloaded and existing files are left
    untouched.

    Parameters
    ----------
    chip_id : str
        Chip / site identifier, i.e. a value of the ``site_name`` column.

    Returns
    -------
    str
        The Planet image name (``image_name``) associated with the chip.

    Raises
    ------
    IndexError
        If the image table has no row for ``chip_id``.
    """
    df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
    df_images.dropna(inplace=True)

    image_names = df_images[df_images.site_name == chip_id].image_name.values
    if len(image_names) == 0:
        raise IndexError(f"No Planet image found for chip id {chip_id} in the image table")
    PLANET_ID = image_names[0]

    data_dir = Path(f'../data/{PLANET_ID}/')
    data_dir.mkdir(exist_ok=True, parents=True)

    # check if planet data has already been downloaded
    if any(data_dir.glob(f"{PLANET_ID}_*AnalyticMS*.tif")):
        print(f"Planet images for chip id {chip_id} already exist at {data_dir}. Delete the files to re-download")
        return PLANET_ID

    # The API client is only needed when something has to be downloaded
    client = api.ClientV1(api_key=PLANET_API_KEY)
    planet_data_downloader = downloader.create(client)

    ITEM_TYPE = 'PSScene'
    ASSET_TYPES = ['ortho_analytic_8b_sr',
                   'ortho_analytic_8b_xml']

    # fetch the item description to see which assets it offers
    resp = client.get_item(ITEM_TYPE, PLANET_ID).get()
    if 'ortho_analytic_8b_sr' not in resp['assets']:
        # download 4b_sr if 8b_sr is not available
        ASSET_TYPES = ['ortho_analytic_4b_sr', 'ortho_analytic_4b_xml']

    # NOTE(review): the item is repeated once per asset type, as in the original
    # code; confirm against the planet downloader API whether a single item
    # would be sufficient.
    items_to_download = [resp] * len(ASSET_TYPES)

    # activate assets, then download them
    planet_data_downloader.activate(iter(items_to_download), ASSET_TYPES)
    planet_data_downloader.download(iter(items_to_download), ASSET_TYPES, str(data_dir))

    return PLANET_ID
        
# Crop downloaded planet imagery
def crop_planet_imagery(PLANET_ID):
    """
    Crop a downloaded Planet scene to the sub-region used for validation.

    The raw scene is taken from ``../data/<PLANET_ID>/`` (first file matching
    ``<PLANET_ID>*AnalyticMS*.tif``) and the cropped raster is written, LZW
    compressed, to
    ``../data/planet_images_cropped/<PLANET_ID>/cropped_<PLANET_ID>.tif``.

    The crop geometry normally comes from the site table
    (s3://opera-calval-database-dswx/site.geojson), joined to the image table
    on ``site_name``, reprojected to the scene CRS and buffered by 500 CRS
    units with mitred joins. For a fixed list of scenes the geometry is read
    from ``../data/validation_table.csv`` instead and no buffer is applied.

    If a file matching ``cropped_<PLANET_ID>*.tif`` already exists, nothing is
    read from S3 and nothing is written.

    Parameters
    ----------
    PLANET_ID : str
        Planet image name, as returned by ``download_planet_imagery``.

    Returns
    -------
    pathlib.Path
        Directory that holds the cropped image.

    Raises
    ------
    IndexError
        If no downloaded Planet image for ``PLANET_ID`` is found.
    """
    # Scenes for which the cropped geometry specified in
    # s3://opera-calval-database-dswx/site.geojson is incorrect.
    # The correct geometry is present in the ../data/validation_table.csv file
    override_ids = {
        '20210916_010848_94_2407',
        '20210924_133812_95_2420',
        '20210925_072712_16_2254',
        '20211028_144231_39_227b',
        '20211030_142613_41_227b',
    }

    data_dir = Path(f'../data/{PLANET_ID}/')
    data_dir.mkdir(exist_ok=True, parents=True)
    cropped_dir = Path(f'../data/planet_images_cropped/{PLANET_ID}/')

    # proceed with cropping planet image only if it hasn't been done already
    if any(cropped_dir.glob(f"cropped_{PLANET_ID}*.tif")):
        print(f"Cropped image for planet id {PLANET_ID} already exist at {cropped_dir}. Delete the file to re-download")
        return cropped_dir

    planet_images = list(data_dir.glob(f'{PLANET_ID}*AnalyticMS*.tif'))
    if not planet_images:
        raise IndexError(f"No downloaded Planet image for planet id {PLANET_ID} found in {data_dir}")
    planet_image_path = planet_images[0]

    # created only after the raw image is known to exist, so a failed run does
    # not leave an empty output directory behind
    cropped_dir.mkdir(exist_ok=True, parents=True)

    with rasterio.open(planet_image_path) as src:
        planet_crs = src.crs

        if PLANET_ID not in override_ids:
            df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
            df_images.dropna(inplace=True)
            df_site = gpd.read_file('s3://opera-calval-database-dswx/site.geojson')
            df_site.dropna(inplace=True)

            # attach the image attributes (without the image footprint) to each
            # site so that the site geometry can be selected by image name
            col_list = list(df_images.keys())
            col_list.remove('geometry')
            df_chips = pd.merge(df_site, df_images[col_list], on='site_name', how='left')
            df_chip = df_chips[df_chips.image_name == PLANET_ID]

            # 500 meter buffer (assumes the scene CRS is in metres - TODO confirm)
            df_chip_utm = df_chip.to_crs(planet_crs).buffer(500, join_style=2)
        else:
            df = pd.read_csv('../data/validation_table.csv')
            df = gpd.GeoDataFrame(
                df.loc[:, [c for c in df.columns if c != "geometry"]],
                geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
                crs="epsg:4326",
            )
            df_chip_utm = df[df['planet_id'] == PLANET_ID].to_crs(planet_crs)

        out_image, out_transform = rasterio.mask.mask(src, df_chip_utm.geometry, crop=True)
        out_meta = src.meta

    # out_image is indexed (band, row, column)
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform,
                     "compress": "lzw"})

    with rasterio.open(cropped_dir / f'cropped_{PLANET_ID}.tif', "w", **out_meta) as dest:
        dest.write(out_image)

    return cropped_dir

In [None]:
def main(chip_id):
    """
    Run the per-chip pipeline: download the Planet scene for ``chip_id`` and
    then crop it to the validation extent.

    Nothing is returned; the results are the files the two steps write.
    """
    # the download step yields the planet id that the crop step needs
    crop_planet_imagery(download_planet_imagery(chip_id))

In [None]:
# Run the download-and-crop pipeline for every chip, one after another
_ = [main(chip_id) for chip_id in chip_ids]

With data downloaded, create and save a database of all the relevant files needed to expand the validation dataset

In [None]:
# Re-read the validation table; the file-path columns are added to it in the next cell
df = pd.read_csv("../data/validation_table.csv")

In [None]:
# Let's create a table listing, for each Planet scene, the DSWx water (WTR) files, the DSWx confidence (CONF) files, and the hand-classified validation files
def return_dswx_path(planet_id):
    """
    Return the DSWx water-layer files stored for a Planet scene.

    Looks in ``../data/<planet_id>/dswx`` for files matching
    ``OPERA_L3_DSWx*_B01_WTR.tif`` and returns their paths as one
    comma-separated string, sorted so the result is the same on every run.
    An empty string is returned when nothing matches.
    """
    data_path = Path('../data')/planet_id/'dswx'
    # glob yields files in arbitrary order; sort for a reproducible table
    return ",".join(str(f) for f in sorted(data_path.glob("OPERA_L3_DSWx*_B01_WTR.tif")))

def return_conf_path(planet_id):
    """
    Return the DSWx confidence-layer files stored for a Planet scene.

    Looks in ``../data/<planet_id>/dswx`` for files matching
    ``OPERA_L3_DSWx*_B03_CONF.tif`` and returns their paths as one
    comma-separated string, sorted so the result is the same on every run.
    An empty string is returned when nothing matches.
    """
    data_path = Path('../data')/planet_id/'dswx'
    # glob yields files in arbitrary order; sort for a reproducible table
    return ",".join(str(f) for f in sorted(data_path.glob("OPERA_L3_DSWx*_B03_CONF.tif")))

def return_val_path(planet_id):
    """
    Return the hand-classified validation files stored for a Planet scene.

    Looks in ``../data/planet_images_cropped/<planet_id>`` for files matching
    ``site_name-*-classified_planet-*<planet_id>*.tif`` and returns their
    paths as one comma-separated string, sorted so the result is the same on
    every run. An empty string is returned when nothing matches.
    """
    data_path = Path('../data')/'planet_images_cropped'/planet_id
    # glob yields files in arbitrary order; sort for a reproducible table
    return ",".join(
        str(f) for f in sorted(data_path.glob(f"site_name-*-classified_planet-*{planet_id}*.tif"))
    )

# For every scene, record the comma-separated file lists found on disk,
# one new column per kind of file
for _column, _finder in (
    ('dswx_files', return_dswx_path),
    ('conf_files', return_conf_path),
    ('val_files', return_val_path),
):
    df[_column] = df['planet_id'].map(_finder)

# write out table for future use (identifier columns plus the three file
# columns, without the row index)
_output_columns = ['site_name', 'planet_id', 'dswx_files', 'conf_files', 'val_files']
df_new = df[_output_columns]
df_new.to_csv('../data/new_validation_table.csv', index=None)

df_new.head()

### Upon successful end-to-end execution of this notebook
1. The `data` folder should be populated with folders containing individual Planet scenes (GeoTIFF and metadata), along with the associated OPERA DSWx-HLS data and corresponding HLS Fmask data
2. The `data` folder will also contain a folder named `planet_images_cropped`, which will contain folders for the same 52 Planet scenes, each with the cropped Planet imagery and the associated hand-labeled water mask.

For example, for the Planet scene `20210903_150800_60_2458`:
```
    .
    ├── data
    │   ├─ 20210903_150800_60_2458
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_8b_metadata.xml
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_SR_8b.tif
    │   │  ├─ dswx
    │   │  │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B01_WTR.tif
    │   │  │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B03_CONF.tif
    │   │  │  └─ ... 
    │   ├─ ...
    │   ├─ planet_images_cropped
    │   │  ├─ 20210903_150800_60_2458
    │   │  │  ├─ site_name-4_21-classified_planet-20210903_150800_60_2458.tif
    │   │  │  ├─ cropped_20210903_150800_60_2458.tif
    │   │  │  └─ Site-4_21-metadata.json
    │   │  └─ ...
    │   ├─ new_validation_table.csv 
    │   └─ validation_table.csv 
    ├── notebooks
    │   └─ ...
    ├── environment.yml
    └── README.md       
```