In [13]:
from cmr import GranuleQuery, CollectionQuery, VariableQuery
import datetime
from shapely.geometry import Polygon
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from osgeo import gdal
import rasterio
import concurrent.futures
import backoff
from rasterio.errors import RasterioIOError
import requests
import boto3
from rasterio.session import AWSSession
import os
import numpy as np
from pathlib import Path

# Linking DSWx from HLS

HLS has wonky datestrings

Automating this url: https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=*T18GYP*20230410T142428Z*&options[granule_ur][pattern]=true

In [43]:
base_url = 'https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&'

sample_hls_id_s2a = 'HLS.S30.T25MER.2023124T123259.v2.0'
sample_hls_id_l8 = 'HLS.L30.T18TWM.2023125T153242.v2.0'

podaac_url = 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/OPERA_L3_DSWX-HLS_PROVISIONAL_V1'
              

def format_hls_dt(dt_hls: str) -> str:
    date, time = dt_hls.split('T')
    year = date[:4]
    year_start = datetime.datetime(int(year), 1, 1)
    day_offset = datetime.timedelta(days=(int(date[4:]) - 1))
    date = year_start + day_offset
    date_str_f = date.strftime('%Y%m%d')
    return f'{date_str_f}T{time}'
    
@backoff.on_exception(backoff.expo,
                      Exception,
                      max_tries=10)
def get_dswx_meta_from_hls_id(hls_id: str) -> str:
    tokens = hls_id.split('.')
    mgrs_tile_id = tokens[2]
    acq_time_hls = tokens[3]
    acq_time = format_hls_dt(acq_time_hls)
    req_url = base_url + f'granule_ur=OPERA_L3_DSWx-HLS_{mgrs_tile_id}*{acq_time}*&options[granule_ur][pattern]=true'
    resp = requests.get(req_url).json()
    hits = resp['hits']
    out = {'dswx_hits': hits, 'dswx_request_url': req_url}
    if hits:
        dswx_id = resp['items'][0]['meta']['native-id']
        out.update({'dswx_id': dswx_id})
        
        file_name = f'{dswx_id}_B01_WTR.tif'
        url = f'{podaac_url}/{file_name}'
        out.update({'dswx_url': url})
    return out

In [44]:
r = get_dswx_meta_from_hls_id(sample_hls_id_s2a)
r

{'dswx_hits': 1,
 'dswx_request_url': 'https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=OPERA_L3_DSWx-HLS_T25MER*20230504T123259*&options[granule_ur][pattern]=true',
 'dswx_id': 'OPERA_L3_DSWx-HLS_T25MER_20230504T123259Z_20230506T164750Z_S2B_30_v1.0',
 'dswx_url': 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/OPERA_L3_DSWX-HLS_PROVISIONAL_V1/OPERA_L3_DSWx-HLS_T25MER_20230504T123259Z_20230506T164750Z_S2B_30_v1.0_B01_WTR.tif'}

In [45]:
r = get_dswx_meta_from_hls_id(sample_hls_id_l8)
r

{'dswx_hits': 1,
 'dswx_request_url': 'https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=OPERA_L3_DSWx-HLS_T18TWM*20230505T153242*&options[granule_ur][pattern]=true',
 'dswx_id': 'OPERA_L3_DSWx-HLS_T18TWM_20230505T153242Z_20230507T081905Z_L8_30_v1.0',
 'dswx_url': 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/OPERA_L3_DSWX-HLS_PROVISIONAL_V1/OPERA_L3_DSWx-HLS_T18TWM_20230505T153242Z_20230507T081905Z_L8_30_v1.0_B01_WTR.tif'}

# Getting Sensors

In [5]:
df_hls = gpd.read_file('hls_metadata_with_tags.geojson')
df_hls.head()

Unnamed: 0,granule_id,time_acquired,time_updated,B04_link,time_acq_str,mgrs_tile_id,accode,add_offset,area_or_point,arop_ave_xshift(meters),...,msi band 03 bandpass adjustment slope and offset,msi band 04 bandpass adjustment slope and offset,msi band 11 bandpass adjustment slope and offset,msi band 12 bandpass adjustment slope and offset,msi band 8a bandpass adjustment slope and offset,processing_baseline,product_uri,spacecraft_name,tile_id,geometry
0,HLS.L30.T18GYN.2023124T142433.v2.0,2023-05-04T14:24:33.166000+00:00,2023-05-06T07:16:52.481998+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T142433Z,T18GYN,Lasrc; Lasrc,0.0,Area,"0, 0",...,,,,,,,,,,"POLYGON ((-72.32396 -47.90997, -71.04347 -47.8..."
1,HLS.L30.T18GYP.2023124T142433.v2.0,2023-05-04T14:24:33.166000+00:00,2023-05-06T07:17:49.271000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T142433Z,T18GYP,Lasrc; Lasrc,0.0,Area,"0, 0",...,,,,,,,,,,"POLYGON ((-72.36920 -47.01149, -70.92768 -46.9..."
2,HLS.L30.T18GXP.2023124T142433.v2.0,2023-05-04T14:24:33.166000+00:00,2023-05-06T07:21:07.122000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T142433Z,T18GXP,Lasrc; Lasrc,0.0,Area,"0, 0",...,,,,,,,,,,"POLYGON ((-73.32739 -47.02950, -72.23989 -47.0..."
3,HLS.L30.T19GCJ.2023124T142433.v2.0,2023-05-04T14:24:33.166000+00:00,2023-05-06T07:30:42.305000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T142433Z,T19GCJ,Lasrc; Lasrc,0.0,Area,"0, 0",...,,,,,,,,,,"POLYGON ((-70.73825 -47.02852, -70.39345 -46.0..."
4,HLS.S30.T19GGM.2023124T140709.v2.0,2023-05-04T14:24:56.969002+00:00,2023-05-06T07:43:05.363000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T19GGM,LaSRC,0.0,Area,0,...,"1.007500, -0.000800","0.976100, 0.001000","1.000000, -0.000300","0.986700, 0.000400","0.996600, 0.000000",5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T19GGM_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-66.49269 -44.31394, -65.22951 -44.2..."


In [6]:
def get_input_sensor_name(row):
    landsat_id = row['landsat_scene_id']
    if isinstance(landsat_id, str):
        sensor_name = 'Landsat-9' if landsat_id[:3] == 'LC9' else 'Landsat-8'
    else:
        sensor_name = row['spacecraft_name']
    return sensor_name

df_hls['input_sensor_name'] = df_hls.aggregate(get_input_sensor_name, axis=1)

In [7]:
df_hls.input_sensor_name.unique()

array(['Landsat-9', 'Sentinel-2B', 'Landsat-8', 'Sentinel-2A'],
      dtype=object)

In [8]:
df_hls_for_dswx = df_hls[~df_hls.input_sensor_name.isin(['Landsat-9'])].reset_index(drop=True)

In [9]:
df_hls.shape[0], df_hls_for_dswx.shape[0]

(67480, 52260)

In [10]:
hls_ids = df_hls_for_dswx.granule_id.tolist()

In [46]:
dswx_data = list(map(get_dswx_meta_from_hls_id, tqdm(hls_ids[:10])))
dswx_data

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.11it/s]


[{'dswx_hits': 1,
  'dswx_request_url': 'https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=OPERA_L3_DSWx-HLS_T19GGM*20230504T140709*&options[granule_ur][pattern]=true',
  'dswx_id': 'OPERA_L3_DSWx-HLS_T19GGM_20230504T140709Z_20230506T163125Z_S2B_30_v1.0',
  'dswx_url': 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/OPERA_L3_DSWX-HLS_PROVISIONAL_V1/OPERA_L3_DSWx-HLS_T19GGM_20230504T140709Z_20230506T163125Z_S2B_30_v1.0_B01_WTR.tif'},
 {'dswx_hits': 1,
  'dswx_request_url': 'https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=OPERA_L3_DSWx-HLS_T19GFM*20230504T140709*&options[granule_ur][pattern]=true',
  'dswx_id': 'OPERA_L3_DSWx-HLS_T19GFM_20230504T140709Z_20230506T163052Z_S2B_30_v1.0',
  'dswx_url': 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/OPERA_L3_DSWX-HLS_PROVISIONAL_V1/OPERA_L3_DSWx-HLS_T19GFM_20230504T140709Z_

In [None]:
n = len(hls_ids)
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    dswx_data = list(tqdm(executor.map(get_dswx_meta_from_hls_id, hls_ids[:]), total=n))

 91%|█████████████████████████████████████████████████████████████████████████████████▊        | 47534/52260 [1:35:56<13:02,  6.04it/s]

In [None]:
df_linked_dswx = pd.DataFrame(dswx_data)
df_linked_dswx.head()

In [36]:
df_final = pd.concat([df_hls_for_dswx, df_linked_dswx], axis=1)
df_final.head()

Unnamed: 0,granule_id,time_acquired,time_updated,B04_link,time_acq_str,mgrs_tile_id,accode,add_offset,area_or_point,arop_ave_xshift(meters),...,processing_baseline,product_uri,spacecraft_name,tile_id,geometry,input_sensor_name,dswx_hits,dswx_request_url,dswx_id,dswx_url
0,HLS.S30.T19GGM.2023124T140709.v2.0,2023-05-04T14:24:56.969002+00:00,2023-05-06T07:43:05.363000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T19GGM,LaSRC,0.0,Area,0,...,5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T19GGM_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-66.49269 -44.31394, -65.22951 -44.2...",Sentinel-2B,1,https://cmr.earthdata.nasa.gov/search/granules...,OPERA_L3_DSWx-HLS_T19GGM_20230504T140709Z_2023...,https://archive.podaac.earthdata.nasa.gov/poda...
1,HLS.S30.T19GFM.2023124T140709.v2.0,2023-05-04T14:25:01.399000+00:00,2023-05-06T08:26:16.794001+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T19GFM,LaSRC,0.0,Area,0,...,5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T19GFM_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-67.74565 -44.33458, -66.36943 -44.3...",Sentinel-2B,1,https://cmr.earthdata.nasa.gov/search/granules...,OPERA_L3_DSWx-HLS_T19GFM_20230504T140709Z_2023...,https://archive.podaac.earthdata.nasa.gov/poda...
2,HLS.S30.T19GEM.2023124T140709.v2.0,2023-05-04T14:25:05.561000+00:00,2023-05-06T08:08:04.328000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T19GEM,LaSRC,0.0,Area,0,...,5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T19GEM_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-69.00025 -44.34147, -67.62300 -44.3...",Sentinel-2B,1,https://cmr.earthdata.nasa.gov/search/granules...,OPERA_L3_DSWx-HLS_T19GEM_20230504T140709Z_2023...,https://archive.podaac.earthdata.nasa.gov/poda...
3,HLS.S30.T20GLR.2023124T140709.v2.0,2023-05-04T14:25:06.873000+00:00,2023-05-06T07:34:01.893000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T20GLR,LaSRC,0.0,Area,0,...,5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T20GLR_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-65.54398 -45.21299, -65.27982 -44.4...",Sentinel-2B,1,https://cmr.earthdata.nasa.gov/search/granules...,OPERA_L3_DSWx-HLS_T20GLR_20230504T140709Z_2023...,https://archive.podaac.earthdata.nasa.gov/poda...
4,HLS.S30.T19GGL.2023124T140709.v2.0,2023-05-04T14:25:11.157000+00:00,2023-05-06T08:40:58.476002+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T140709Z,T19GGL,LaSRC,0.0,Area,0,...,5.09,S2B_MSIL1C_20230504T140709_N0509_R110_T19GGL_2...,Sentinel-2B,S2B_OPER_MSI_L1C_TL_2BPS_20230504T172832_A0321...,"POLYGON ((-66.45347 -45.21293, -65.53571 -45.1...",Sentinel-2B,1,https://cmr.earthdata.nasa.gov/search/granules...,OPERA_L3_DSWx-HLS_T19GGL_20230504T140709Z_2023...,https://archive.podaac.earthdata.nasa.gov/poda...


In [37]:
df_final.to_file('hls_metadata_with_linked_dswx.geojson', driver='GeoJSON')

# Missing HLS Tiles

In [38]:
df_final[df_final.dswx_hits == 0].shape[0], df_final.shape[0]

(117, 52260)

In [39]:
df_final[df_final.dswx_hits == 0].head()

Unnamed: 0,granule_id,time_acquired,time_updated,B04_link,time_acq_str,mgrs_tile_id,accode,add_offset,area_or_point,arop_ave_xshift(meters),...,processing_baseline,product_uri,spacecraft_name,tile_id,geometry,input_sensor_name,dswx_hits,dswx_request_url,dswx_id,dswx_url
3331,HLS.L30.T20XMN.2023124T224909.v2.0,2023-05-04T22:49:09.524000+00:00,2023-05-09T00:18:57.339001+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T224909Z,T20XMN,Lasrc,0.0,Area,0,...,,,,,"POLYGON ((-62.60090 78.84424, -62.54754 78.858...",Landsat-8,0,https://cmr.earthdata.nasa.gov/search/granules...,,
3332,HLS.L30.T20XNN.2023124T224909.v2.0,2023-05-04T22:49:09.524000+00:00,2023-05-09T00:19:37.797001+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T224909Z,T20XNN,Lasrc,0.0,Area,0,...,,,,,"POLYGON ((-62.60090 78.84424, -60.96823 79.262...",Landsat-8,0,https://cmr.earthdata.nasa.gov/search/granules...,,
3333,HLS.L30.T20XNQ.2023124T224909.v2.0,2023-05-04T22:49:09.524000+00:00,2023-05-09T00:20:48.598999+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T224909Z,T20XNQ,Lasrc,0.0,Area,0,...,,,,,"POLYGON ((-57.49116 80.03205, -57.28716 80.072...",Landsat-8,0,https://cmr.earthdata.nasa.gov/search/granules...,,
3334,HLS.L30.T21XVK.2023124T224909.v2.0,2023-05-04T22:49:09.524000+00:00,2023-05-09T00:21:48.098999+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T224909Z,T21XVK,Lasrc,0.0,Area,0,...,,,,,"POLYGON ((-62.18622 80.03720, -57.26458 80.077...",Landsat-8,0,https://cmr.earthdata.nasa.gov/search/granules...,,
3335,HLS.L30.T20XNP.2023124T224909.v2.0,2023-05-04T22:49:09.524000+00:00,2023-05-09T00:21:59.944000+00:00,https://data.lpdaac.earthdatacloud.nasa.gov/lp...,2023124T224909Z,T20XNP,Lasrc,0.0,Area,0,...,,,,,"POLYGON ((-61.31486 79.17701, -57.28901 80.072...",Landsat-8,0,https://cmr.earthdata.nasa.gov/search/granules...,,


In [40]:
df_missing = df_final[df_final.dswx_hits == 0].reset_index(drop=True)

In [41]:
df_missing_f = df_missing[['granule_id', 'mgrs_tile_id', 'time_acquired', 'time_updated','B04_link', 'dswx_hits', 'dswx_id', 'dswx_url', 'input_sensor_name', 'geometry']].copy()

In [42]:
out_dir = Path('out')
out_dir.mkdir(exist_ok=True)

fname = 'hls_tiles_with_missing_dswx_products'
df_missing_f.to_csv(out_dir / f'{fname}.csv', index=False)
df_missing_f.to_file(out_dir / f'{fname}.geojson', driver='GeoJSON')

Note: there are instance when there are multiple hits like this: https://cmr.earthdata.nasa.gov/search/granules.umm_json?short_name=OPERA_L3_DSWX-HLS_PROVISIONAL_V1&granule_ur=*T18XVM*20230509T184921*&options[granule_ur][pattern]=true

This is because there is the time in which the product was processed and acquired. The latter is what we are interested. However, in rare instances we get a tile that was processed at the precise time a previous one was acquired.