#### Download Planet data, crop it, and download the associated hand-drawn classifications and the coincident OPERA DSWx data

This notebook expects a `.env` file, located in the same directory as the notebook, that contains a Planet API key in the format:
> PLANET_API_KEY='[key]'

In [1]:
# gis imports
import geopandas as gpd
import rasterio
import rasterio.mask
from rasterio.plot import show
from rasterio.warp import transform_bounds

# planet api imports
from planet import api
from planet.api import downloader
from planet.api.downloader import create

# misc imports
import os
from pathlib import Path
from dotenv import dotenv_values
from tools import addImageCalc
from pathlib import Path

# data science imports
import matplotlib.pyplot as plt
import pandas as pd

# aws imports
import boto3
from botocore.handlers import disable_signing

# pySTAC imports
from pystac_client import Client

os.environ["AWS_NO_SIGN_REQUEST"] = "YES"

  from planet import api


In [2]:
# Chip IDs that we will test in this notebook.
# These should be chip_ids for which hand classifications were made.

# The `site_name` column of the validation table holds the chip ids (e.g. '4_21')
df = pd.read_csv('../data/validation_table.csv')
chip_ids = df.site_name.unique()
print(chip_ids)

['4_21' '4_11' '1_31' '3_28' '1_37' '1_34' '3_30' '4_8' '1_43' '2_28'
 '4_9' '1_9' '1_38' '1_41' '4_5' '4_22' '4_14' '2_26' '4_42' '4_37' '4_6'
 '3_1' '3_15' '1_32' '2_1' '3_43' '2_15' '2_7' '1_47' '4_26' '3_40' '2_13'
 '1_19' '2_9' '3_8' '4_1' '3_10' '1_18' '2_4' '4_28' '1_15' '3_3' '3_4'
 '3_5' '4_7' '2_38' '1_1' '3_12' '3_32' '1_5' '2_29' '2_8']


In [3]:
df.head()

Unnamed: 0,site_name,planet_id,dswx_id,hls_id,dswx_urls,validation_dataset_url,water_stratum,geometry
0,4_21,20210903_150800_60_2458,OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_2023...,HLS.L30.T18UXG.2021245T154154.v2.0,https://opera-pst-rs-pop1.s3.us-west-2.amazona...,https://opera-calval-database-dswx.s3.us-west-...,3.0,"POLYGON ((-71.870513357149 55.11001696376937, ..."
1,4_11,20210903_152641_60_105c,OPERA_L3_DSWx-HLS_T19UDA_20210902T154911Z_2023...,HLS.S30.T19UDA.2021245T154911.v2.0,https://opera-pst-rs-pop1.s3.us-west-2.amazona...,https://opera-calval-database-dswx.s3.us-west-...,3.0,POLYGON ((-69.17307071901621 54.40592422230064...
2,1_31,20210904_093422_44_1065,OPERA_L3_DSWx-HLS_T33JYG_20210905T082559Z_2023...,HLS.S30.T33JYG.2021248T082559.v2.0,https://opera-pst-rs-pop1.s3.us-west-2.amazona...,https://opera-calval-database-dswx.s3.us-west-...,1.0,POLYGON ((17.282441488515342 -29.9714135761361...
3,3_28,20210906_101112_28_225a,OPERA_L3_DSWx-HLS_T30TYN_20210905T105621Z_2023...,HLS.S30.T30TYN.2021248T105621.v2.0,https://opera-pst-rs-pop1.s3.us-west-2.amazona...,https://opera-calval-database-dswx.s3.us-west-...,2.0,POLYGON ((-0.0438908706972531 43.0523272022019...
4,1_37,20210909_000649_94_222b,OPERA_L3_DSWx-HLS_T54JTM_20210908T003848Z_2023...,HLS.L30.T54JTM.2021251T003848.v2.0,https://opera-pst-rs-pop1.s3.us-west-2.amazona...,https://opera-calval-database-dswx.s3.us-west-...,1.0,POLYGON ((138.25958887036043 -30.3281075679621...


In [4]:
# Planet data downloader client
# The key is read through python-dotenv; indexing fails if no PLANET_API_KEY entry is found
PLANET_API_KEY = dotenv_values()['PLANET_API_KEY']

# setup AWS boto3 client
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
# Register botocore's `disable_signing` handler on the client and on the resource's
# underlying client so that S3 requests are sent unsigned
s3_client.meta.events.register('choose-signer.s3.*', disable_signing)
s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

In [8]:
imageTable = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
imageTable.head()

Unnamed: 0,Strata,cloud_cover,collocated_dswx,datetime,dswx_cloud_cover,image_name,instrument,provider,resolution,site_coverage,site_name,timeDelta_days,geometry
0,3.0,0.0,HLS.S30.T04WDB.2021271T225541.v2.0,2021-09-29 21:28:09,4.0,20210929_212809_16_2442,PSB.SD,planetscope,3.0,90.820013,3_38,0.938606,"POLYGON ((-160.00896 69.30478, -160.21043 69.1..."
1,4.0,0.0,HLS.S30.T33WXP.2021270T104719.v2.0,2021-09-27 09:55:37,16.0,20210927_095537_0f15,PS2,planetscope,3.0,41.216128,4_43,0.039303,"POLYGON ((17.22275 66.09223, 17.16459 66.02168..."
2,3.0,0.0,HLS.L30.T40WEU.2021268T073033.v2.0,2021-09-25 07:27:12,2.0,20210925_072712_16_2254,PSB.SD,planetscope,3.0,72.27253,3_1,0.002335,"POLYGON ((59.00446 65.95517, 58.85023 65.76646..."
3,4.0,0.0,HLS.S30.T14VPQ.2021244T175911.v2.0,2021-09-01 17:52:05,2.0,20210901_175205_71_2426,PSB.SD,planetscope,3.0,100.0,4_34,0.01,"POLYGON ((-96.94381 62.88876, -97.07609 62.707..."
4,3.0,0.0,HLS.S30.T52VFP.2021253T023549.v2.0,2021-09-11 00:51:29,2.0,20210911_005129_82_106a,PS2.SD,planetscope,3.0,100.0,3_30,0.924389,"POLYGON ((130.86458 61.50211, 130.95358 61.357..."


In [9]:
image_calcs = gpd.read_file('s3://opera-calval-database-dswx/image_calc.geojson')
image_calcs.head()

Unnamed: 0,bucket,calc_type,calculated_by,image_calc_name,image_name,notes,oversight_level,previous_name,processing_level,public,reviewed_by,s3_keys,upload_date,version,water_strata,water_stratum,geometry
0,opera-calval-database-dswx,Supervised Classification,Alexander Handwerger,20211021_133031_75_245a_class,20211021_133031_75_245a,,,,Intermediate,True,,data/site/3_4/image/20211021_133031_75_245a/im...,20220802_145632,,,,"POLYGON ((-64.30053 -33.07480, -64.34944 -33.2..."
1,,,,,,,,,,,,,,,,,
2,opera-calval-database-dswx,Manual Edit of Classification,Alexander Handwerger,20211021_133031_75_245a_class_edit,20211021_133031_75_245a,,,20211021_133031_75_245a_class,Intermediate,True,,data/site/3_4/image/20211021_133031_75_245a/im...,20220802_161902,,,,"POLYGON ((-64.30053 -33.07480, -64.34944 -33.2..."
3,opera-calval-database-dswx,Review,Alexander Handwerger,20211021_133031_75_245a_class_edit_review,20211021_133031_75_245a,,Reviewed-Complete,20211021_133031_75_245a_class_edit,Final,True,Matthew Bonnema,data/site/3_4/image/20211021_133031_75_245a/im...,20220803_104213,,,,"POLYGON ((-64.30053 -33.07480, -64.34944 -33.2..."
4,opera-calval-database-dswx,Supervised Classification,Alexander Handwerger,20210924_082025_48_2424_class,20210924_082025_48_2424,,,,Intermediate,True,,data/site/4_37/image/20210924_082025_48_2424/i...,20220805_102655,,,,"POLYGON ((28.20956 -9.14523, 28.17025 -9.34005..."


In [None]:
# NOTE(review): `df_images` is reassigned by the following cell (which loads site.geojson) and the
# helper functions below read image.geojson themselves, so this cell looks redundant — confirm before removing
df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')

In [7]:
# Load site.geojson from the Cal/Val bucket and preview it.
# NOTE(review): despite its name, `df_images` holds the site table here, not the image table
df_images = gpd.read_file('s3://opera-calval-database-dswx/site.geojson')
df_images.head()

Unnamed: 0,inundated_vegetation_crop,inundated_vegetation_orig,percent_inundated_vegetation_crop,percent_inundated_vegetation_orig,percent_total_water_crop,percent_total_water_orig,percent_water_crop,percent_water_orig,site_name,total_pixels_crop,total_pixels_orig,water_pixels_crop,water_pixels_orig,geometry
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3_38,250000,1252157,0.0,0.0,"POLYGON ((-159.66522 69.26009, -159.66385 69.2..."
1,3701.0,8919.0,1.4804,0.712673,33.0248,23.612029,31.5444,22.899356,4_27,250000,1251485,78861.0,286582.0,"POLYGON ((-113.00031 67.04574, -112.99662 67.0..."
2,2307.0,41908.0,0.9228,3.367619,78.0388,48.05527,77.116,44.687651,4_43,250000,1244440,192790.0,556111.0,"POLYGON ((17.81813 65.99817, 17.81319 65.95337..."
3,41560.0,246974.0,16.624,19.844394,19.5324,21.262333,2.9084,1.417939,3_1,250000,1244553,7271.0,17647.0,"POLYGON ((59.78286 65.84086, 59.77802 65.79605..."
4,69078.0,141601.0,27.6312,11.350572,38.01,18.365513,10.3788,7.014941,4_34,250000,1247523,25947.0,87513.0,"POLYGON ((-96.51418 62.72101, -96.51794 62.676..."


In [None]:
# Helper functions

# Given a chip_id, download corresponding Planet imagery
def download_planet_imagery(chip_id):
    """
    Given a chip id (site name), download the associated PlanetScope imagery.

    The Planet image id of the chip is looked up in
    s3://opera-calval-database-dswx/image.geojson and the assets are
    downloaded to ../data/<planet id>/. The 8-band surface reflectance asset
    is requested when the item lists it, otherwise the 4-band one.

    If a file already exists at the download location, this function will not overwrite it

    Parameters
    ----------
    chip_id : str
        Site name of the chip, e.g. '4_21'.

    Returns
    -------
    str
        The Planet image id used for the chip.

    Raises
    ------
    IndexError
        If the image table has no complete row for ``chip_id``.
    """
    df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
    # rows with a missing value in any column are ignored by the lookup
    df_images = df_images.dropna()

    chip_images = df_images[df_images.site_name == chip_id]
    if chip_images.empty:
        raise IndexError(f"No Planet image found for chip id {chip_id} in the image table")

    # when several rows match, the first one decides which Planet image is used
    PLANET_ID = chip_images.image_name.values[0]
    data_dir = Path(f'../data/{PLANET_ID}/')
    data_dir.mkdir(exist_ok=True, parents=True)

    # check if planet data has already been downloaded
    n_planet_images = len(list(data_dir.glob(f"{PLANET_ID}_*AnalyticMS*.tif")))

    if n_planet_images == 0:
        # the API client is only needed when something has to be downloaded
        client = api.ClientV1(api_key=PLANET_API_KEY)
        planet_data_downloader = downloader.create(client)

        ITEM_TYPE = 'PSScene'
        ASSET_TYPES = ['ortho_analytic_8b_sr',
                    'ortho_analytic_8b_xml']

        req = client.get_item(ITEM_TYPE, PLANET_ID)
        resp = req.get()
        if 'ortho_analytic_8b_sr' not in resp['assets']:
            # download 4b_sr if 8b_sr is not available
            ASSET_TYPES = ['ortho_analytic_4b_sr', 'ortho_analytic_4b_xml']

        # the item is listed once per requested asset type
        # NOTE(review): confirm the downloader really needs this repetition
        items_to_download = [resp] * len(ASSET_TYPES)

        # activate assets, then download them
        planet_data_downloader.activate(iter(items_to_download), ASSET_TYPES)
        planet_data_downloader.download(iter(items_to_download), ASSET_TYPES, str(data_dir))
    else:
        print(f"Planet images for chip id {chip_id} already exist at {data_dir}. Delete the files to re-download")

    return PLANET_ID
        
# Crop downloaded planet imagery
def crop_planet_imagery(PLANET_ID):
    """
    For a given site_name / planet_id, validation data was generated over a cropped sub-region. This function reads
    the geometry of the cropped region and writes out the cropped image to a separate file.

    The crop geometry normally comes from s3://opera-calval-database-dswx/site.geojson (joined to image.geojson on
    site_name) and is buffered by 500 units of the Planet image CRS. For a fixed list of Planet ids the geometry is
    taken, unbuffered, from ../data/validation_table.csv instead.

    If a file already exists at the output location, this function will not overwrite it.

    Parameters
    ----------
    PLANET_ID : str
        Planet image id; the source image is expected under ../data/<PLANET_ID>/.

    Returns
    -------
    pathlib.Path
        Directory ../data/planet_images_cropped/<PLANET_ID>/ holding the cropped image.

    Raises
    ------
    IndexError
        If no '*AnalyticMS*.tif' image for ``PLANET_ID`` is found in the data directory.
    ValueError
        If no crop geometry is found for ``PLANET_ID``.
    """
    # Planet ids whose crop geometry is read from ../data/validation_table.csv rather than from site.geojson
    # (the original note states that the site.geojson geometry is incorrect for 20210916_010848_94_2407)
    ids_with_table_geometry = (
        '20210916_010848_94_2407',
        '20210924_133812_95_2420',
        '20210925_072712_16_2254',
        '20211028_144231_39_227b',
        '20211030_142613_41_227b',
    )

    data_dir = Path(f'../data/{PLANET_ID}/')
    data_dir.mkdir(exist_ok=True, parents=True)
    cropped_dir = Path(f'../data/planet_images_cropped/{PLANET_ID}/')
    cropped_file = list(cropped_dir.glob(f"cropped_{PLANET_ID}*.tif"))

    # proceed with cropping planet image only if it hasn't been done already
    if len(cropped_file) == 0:
        cropped_dir.mkdir(exist_ok=True, parents=True)

        planet_image_path = list(data_dir.glob(f'{PLANET_ID}*AnalyticMS*.tif'))[0]
        with rasterio.open(planet_image_path) as ds:
            planet_crs = ds.crs

        if PLANET_ID not in ids_with_table_geometry:
            # the remote tables are only read when they are actually needed
            df_images = gpd.read_file('s3://opera-calval-database-dswx/image.geojson')
            df_images = df_images.dropna()
            df_site = gpd.read_file('s3://opera-calval-database-dswx/site.geojson')
            df_site = df_site.dropna()

            # attach the image attributes to the site geometries; the image geometry is dropped first so the
            # merged table carries the site geometry only
            col_list = list(df_images.keys())
            col_list.remove('geometry')
            df_chips = pd.merge(df_site, df_images[col_list], on='site_name', how='left')

            df_chip = df_chips[df_chips.image_name == PLANET_ID]

            # 500 meter buffer (assumes the Planet CRS is in metres); join_style=2 is shapely's mitre join
            df_chip_utm = df_chip.to_crs(planet_crs).buffer(500, join_style=2)
        else:
            # the geometry column of the validation table is stored as WKT in EPSG:4326
            df = pd.read_csv('../data/validation_table.csv')
            df = gpd.GeoDataFrame(df.loc[:, [c for c in df.columns if c != "geometry"]], geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="epsg:4326")
            df_chip_utm = df[df['planet_id'] == PLANET_ID].to_crs(planet_crs)

        if len(df_chip_utm) == 0:
            raise ValueError(f"No crop geometry found for planet id {PLANET_ID}")

        with rasterio.open(planet_image_path) as src:
            out_image, out_transform = rasterio.mask.mask(src, df_chip_utm.geometry, crop=True)
            out_meta = src.meta

        # out_image is indexed (band, row, column)
        out_meta.update({"driver": "GTiff",
                "height": out_image.shape[1],
                "width": out_image.shape[2],
                "transform": out_transform,
                "compress": "lzw"})

        with rasterio.open(cropped_dir / f'cropped_{PLANET_ID}.tif', "w", **out_meta) as dest:
            dest.write(out_image)
    else:
        print(f"Cropped image for planet id {PLANET_ID} already exist at {cropped_dir}. Delete the file to re-download")

    return cropped_dir

# For a given Planet ID, download hand labeled validation data
def download_validation_data(PLANET_ID, cropped_dir):
    """
    For a given planet_id, download the validation data from the OPERA Cal-Val S3 bucket. The location of the
    validation data is obtained from s3://opera-calval-database-dswx/image_calc.geojson (columns `bucket` and
    `s3_keys`), and the files are saved to ../data/planet_images_cropped/<PLANET_ID>/ together with a
    metadata_<PLANET_ID>_v<version>.geojson file describing the selected record.

    The record with the highest `version` is used; when no record can be selected by version, the record(s) with
    the latest `upload_date` are used instead.

    If a file already exists at the download location, this function will not overwrite it.

    Parameters
    ----------
    PLANET_ID : str
        Planet image id.
    cropped_dir : pathlib.Path
        Directory that is checked for an existing 'classification_<PLANET_ID>*.tif' file.

    Raises
    ------
    ValueError
        If image_calc.geojson has no record for ``PLANET_ID``.
    """
    classification_file = list(cropped_dir.glob(f"classification_{PLANET_ID}*.tif"))
    if len(classification_file) > 0:
        print(f"Validation data for planet id {PLANET_ID} already exist at {cropped_dir}. Delete the files to re-download")
        return

    image_calcs = gpd.read_file('s3://opera-calval-database-dswx/image_calc.geojson')

    download_dir = Path(f'../data/planet_images_cropped/{PLANET_ID}').absolute()
    download_dir.mkdir(exist_ok=True, parents=True)

    search = image_calcs[image_calcs.image_name == PLANET_ID]
    if search.empty:
        raise ValueError(f"No image_calc record found for planet id {PLANET_ID}")

    try:
        # prefer the record with the highest version
        search_iter = search[search.version==search['version'].max()]
        search_iter = search_iter.iloc[[0]]
    except IndexError:
        # no row matched the maximum version (e.g. version is not set): fall back to the latest upload
        search_iter = search[search.upload_date.values==search.upload_date.values.max()]

    imagecalc_row = search_iter
    version = imagecalc_row.version.iloc[0]
    imagecalc_row.to_file(download_dir / f'metadata_{PLANET_ID}_v{version}.geojson', driver='GeoJSON')

    # `s3_keys` is a comma-separated list of object keys; each object is saved under its base name
    bucket = imagecalc_row.bucket.iloc[0]
    for key in imagecalc_row.s3_keys.iloc[0].split(','):
        filename = key.split('/')[-1]
        # s3_client is the boto3 client configured at the top of the notebook
        s3_client.download_file(bucket, key, str(download_dir / filename))

In [None]:
# Helper functions to process DSWx data

# return Fmask url corresponding to HLS tile 
def get_fmask_url(hls_id: str) -> str:
    """
    Look up the Fmask asset URL of an HLS granule.

    The LPCLOUD STAC catalog at https://cmr.earthdata.nasa.gov/stac is searched for `hls_id` in the
    HLSL30.v2.0 and HLSS30.v2.0 collections; exactly one match is expected (enforced with an assert).

    Returns the `href` of the granule's 'Fmask' asset, or an empty string if the granule lists no such asset.
    """
    stac_root = 'https://cmr.earthdata.nasa.gov/stac'
    catalog = Client.open(f'{stac_root}/LPCLOUD/')

    search_hls = catalog.search(
        collections=['HLSL30.v2.0', 'HLSS30.v2.0'],
        ids=hls_id,
    )

    assert search_hls.matched() == 1
    granule = list(search_hls.get_all_items())[0]
    assets = granule.to_dict()['assets']
    return assets.get('Fmask', {'href': ''})['href']

# Download DSWx data
def download_dswx_data(planet_id):
    """
    Download the DSWx layers and the HLS Fmask associated with a Planet image into ../data/<planet_id>/.

    The URLs come from the `dswx_urls` column of ../data/validation_table.csv (a whitespace-separated list of
    which the first and third entries are downloaded) and from `get_fmask_url` for the row's `hls_id`.
    Files are fetched with `wget -q -nc`, so existing files are not downloaded again and a failed download
    does not raise.

    Parameters
    ----------
    planet_id : str
        Planet image id, matched against the `planet_id` column of the validation table.

    Raises
    ------
    IndexError
        If the validation table has no row for ``planet_id`` or lists fewer than three DSWx URLs.
    FileNotFoundError
        If the `wget` executable is not available.
    """
    import subprocess  # local import: only this helper needs it

    save_path = Path('../data') / planet_id
    df = pd.read_csv('../data/validation_table.csv')
    row = df[df['planet_id'] == planet_id]
    if row.empty:
        raise IndexError(f"No entry for planet id {planet_id} in ../data/validation_table.csv")

    dswx_urls = row.dswx_urls.values[0].split()
    dswx_file = dswx_urls[0]
    conf_file = dswx_urls[2]
    fmask_file = get_fmask_url(row.hls_id.values[0])

    for url in (dswx_file, fmask_file, conf_file):
        if not url:
            # get_fmask_url returns '' when the granule has no Fmask asset
            print(f"No download URL available for one of the files of planet id {planet_id}; skipping")
            continue
        # argument list instead of a shell string: URLs and paths are passed to wget verbatim,
        # so characters such as '&', '?' or spaces cannot be misinterpreted by a shell
        subprocess.run(["wget", url, "-q", "-nc", "-P", str(save_path)], check=False)

In [None]:
def main(chip_id):
    """
    Run the complete download workflow for one chip: Planet imagery, cropped image,
    validation data and the overlapping DSWx tile, in that order.
    """
    # Planet scene belonging to the chip
    planet_id = download_planet_imagery(chip_id)

    # crop the scene, then fetch the validation data for it
    download_validation_data(planet_id, crop_planet_imagery(planet_id))

    # DSWx tile overlapping the scene
    download_dswx_data(planet_id)

In [None]:
# Process all chips
# run the workflow chip by chip; `_` only collects the return values of main()
_ = [main(chip_id) for chip_id in chip_ids]

With the data downloaded, create and save a database of all the relevant files needed to expand the validation dataset.

In [None]:
df = pd.read_csv("../data/new_validation_table.csv")

In [None]:
# Let's create a table containing DSWx, Fmask, and RF classification
def _join_matching_paths(directory, pattern):
    """Return the paths in `directory` matching glob `pattern`, sorted and joined with commas ('' if none match)."""
    # sorting makes the result independent of the order in which the file system lists the files
    return ",".join(str(path) for path in sorted(directory.glob(pattern)))

def return_dswx_path(planet_id):
    """Comma-separated paths of the DSWx B01_WTR files stored under ../data/<planet_id>/."""
    return _join_matching_paths(Path('../data') / planet_id, "OPERA_L3_DSWx*_B01_WTR.tif")

def return_fmask_path(planet_id):
    """Comma-separated paths of the HLS Fmask files stored under ../data/<planet_id>/."""
    return _join_matching_paths(Path('../data') / planet_id, "HLS*Fmask.tif")

def return_conf_path(planet_id):
    """Comma-separated paths of the DSWx B03_CONF files stored under ../data/<planet_id>/."""
    return _join_matching_paths(Path('../data') / planet_id, "OPERA_L3_DSWx*_B03_CONF.tif")

def return_classification_path(planet_id):
    """Comma-separated paths of the full_img_rf_classification_* files under ../data/planet_images_cropped/<planet_id>/."""
    return _join_matching_paths(Path('../data') / 'planet_images_cropped' / planet_id, "full_img_rf_classification_*.tif")

def return_val_path(planet_id):
    """Comma-separated paths of the classification_*<planet_id>* files under ../data/planet_images_cropped/<planet_id>/."""
    return _join_matching_paths(Path('../data') / 'planet_images_cropped' / planet_id, f"classification_*{planet_id}*.tif")

# For every planet_id, record the local files of each product type as a comma-separated string
# (an empty string when nothing matches)
df['dswx_files'] = df['planet_id'].map(return_dswx_path)
df['fmask_files'] = df['planet_id'].map(return_fmask_path)
df['conf_files'] = df['planet_id'].map(return_conf_path)
df['rf_classification_files'] = df['planet_id'].map(return_classification_path)
df['val_files'] = df['planet_id'].map(return_val_path)

# write out table for future use
# NOTE(review): `df` was read from ../data/new_validation_table.csv in the cell above, so this overwrites
# that same file and keeps only the columns selected here — confirm this is intended
df_new = df[['site_name', 'planet_id', 'dswx_files', 'fmask_files', 'conf_files', 'val_files', 'rf_classification_files']]
df_new.to_csv('../data/new_validation_table.csv', index=None)

df_new.head()