In [1]:
import concurrent.futures
import datetime
import os

from cmr import GranuleQuery
from shapely.geometry import Polygon
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from osgeo import gdal
import rasterio

# GDAL configs used to successfully access LP DAAC Cloud Assets via vsicurl
# The cookie path is handed to curl as-is and curl does not expand "~", so resolve
# it to an absolute path in the user's home directory before passing it on.
_GDAL_COOKIE_PATH = os.path.expanduser('~/cookies.txt')

# NOTE(review): this turns off TLS certificate verification for GDAL's HTTP requests.
# Confirm it is really required before using this with real credentials.
gdal.SetConfigOption("GDAL_HTTP_UNSAFESSL", "YES")
# Same file for reading and for writing, so session cookies persist between runs.
gdal.SetConfigOption('GDAL_HTTP_COOKIEFILE', _GDAL_COOKIE_PATH)
gdal.SetConfigOption('GDAL_HTTP_COOKIEJAR', _GDAL_COOKIE_PATH)
# Set exactly once. An earlier 'YES' assignment was immediately overridden by this
# call, so 'FALSE' is the value that was actually in effect.
# NOTE(review): 'EMPTY_DIR' is often suggested for remote GeoTIFFs - confirm before changing.
gdal.SetConfigOption('GDAL_DISABLE_READDIR_ON_OPEN','FALSE')
gdal.SetConfigOption('CPL_VSIL_CURL_ALLOWED_EXTENSIONS','TIF')

# Constants

In [2]:
# CMR collection concept IDs; the suffix after the dash names the provider.
# L30 is the "HLS Landsat Operational Land Imager Surface Reflectance ..." collection
# (see the sample record returned by the query further down).
HLS_L30_CONCEPT_ID = 'C2021957657-LPCLOUD'
# Presumably the Sentinel-2 counterpart of L30 - verify against the CMR collection record.
HLS_S30_CONCEPT_ID = 'C2021957295-LPCLOUD'
# DSWx collection ID.
DSWX_CONCEPT_ID = 'C2617126679-POCLOUD'

# Parameters

In [3]:
# Search window: the seven days leading up to the moment this cell runs.
# Use an explicit UTC clock. The temporal filter is sent to CMR as a UTC ("...Z")
# timestamp, so a naive local-time value would shift the window by the local UTC offset.
now = datetime.datetime.now(datetime.timezone.utc)
delta = datetime.timedelta(days=7)

beg_date = now - delta
end_date = now

# HLS

In [4]:
# Granule search builder from the `cmr` package; the filters are attached in a later cell.
api = GranuleQuery()

In [5]:
# Search both HLS collections (L30 and S30) in a single query.
concept_ids = [HLS_L30_CONCEPT_ID, HLS_S30_CONCEPT_ID]

In [6]:
# Restrict the search to the chosen collections and to the [beg_date, end_date] window.
q = api.concept_id(concept_ids).temporal(beg_date, end_date)

In [7]:
# Show how many granules match the query before fetching their metadata.
q.hits()

66155

In [8]:
# Fetch a single record to inspect its structure (e.g. 'title', 'links', 'polygons').
q.get(1)

[{'producer_granule_id': 'HLS.L30.T23XNL.2023122T171828',
  'time_start': '2023-05-02T17:18:28.106Z',
  'cloud_cover': '1',
  'updated': '2023-05-04T08:17:35.893Z',
  'dataset_id': 'HLS Landsat Operational Land Imager Surface Reflectance and TOA Brightness Daily Global 30m v2.0',
  'data_center': 'LPCLOUD',
  'title': 'HLS.L30.T23XNL.2023122T171828.v2.0',
  'coordinate_system': 'GEODETIC',
  'day_night_flag': 'DAY',
  'time_end': '2023-05-02T18:56:09.487Z',
  'id': 'G2676661937-LPCLOUD',
  'original_format': 'ECHO10',
  'browse_flag': True,
  'polygons': [['80.9257782 -39.0873183 81.0108644 -38.6944185 81.8974237 -38.0067263 81.9569194 -45.0012803 80.9732637 -45.0011418 80.9257782 -39.0873183']],
  'collection_concept_id': 'C2021957657-LPCLOUD',
  'online_access_flag': True,
  'links': [{'rel': 'http://esipfed.org/ns/fedsearch/1.1/data#',
    'title': 'Download HLS.L30.T23XNL.2023122T171828.v2.0.B07.tif',
    'hreflang': 'en-US',
    'href': 'https://data.lpdaac.earthdatacloud.nasa.gov

In [None]:
%%time

# Fetch the metadata records for every granule matching the query.
# NOTE(review): this cell has no execution count and the later formatting cell failed with
# "NameError: name 'hls_metadata' is not defined" - run it before the cells below.
hls_metadata = q.get_all()

## Format

In [10]:
def dateline_fix_x(x_coords):
    """Shift x coordinates onto one side of the dateline when their spread exceeds 180.

    Parameters
    ----------
    x_coords : sequence of numbers
        X (longitude) values of a single ring. Must be non-empty.

    Returns
    -------
    sequence
        The input object itself when ``max - min`` is at most 180. Otherwise a new
        list in which every value that is not negative has had 360 subtracted.
    """
    spread = max(x_coords) - min(x_coords)
    if not spread > 180:
        # Nothing to fix: hand back the caller's own sequence untouched.
        return x_coords

    shifted = []
    for x in x_coords:
        shifted.append(x if x < 0 else x - 360)
    return shifted

def format_hls(item):
    """Flatten one CMR granule record of an HLS product into a flat, tabular dict.

    Parameters
    ----------
    item : dict
        Granule metadata as returned by the CMR query. The keys ``title``,
        ``time_start``, ``updated``, ``links`` and ``polygons`` are read.

    Returns
    -------
    dict
        ``granule_id``, ``time_acquired``, ``time_updated``, ``B11_link``,
        ``time_acq_str``, ``mgrs_tile_id`` and ``geometry`` (a shapely ``Polygon``
        built from (x, y) = (lon, lat) pairs).

    Raises
    ------
    ValueError
        If the record has no ``https://`` link to a ``.B11.tif`` asset.
    """
    out = {}
    granule_id = item['title']
    out['granule_id'] = granule_id
    out['time_acquired'] = pd.to_datetime(item['time_start'])
    out['time_updated'] = pd.to_datetime(item['updated'])

    # B11 is smaller file size - faster to read mask
    # Use an explicit default: a bare next() would raise StopIteration, which a
    # surrounding map()/list() treats as "end of input" and silently drops records.
    b11_link = next((link_data['href'] for link_data in item['links']
                     if all(kw in link_data['href'] for kw in ['.B11.tif', 'https://'])),
                    None)
    if b11_link is None:
        raise ValueError(f"No https link to a '.B11.tif' asset found for granule {granule_id!r}")
    out['B11_link'] = b11_link

    # The polygon arrives as one flat "lat lon lat lon ..." string.
    poly_floats = [float(coord) for coord in item['polygons'][0][0].split(' ')]
    n = len(poly_floats)
    # Swap each (lat, lon) pair into (x, y) = (lon, lat) order.
    poly_coords = [(poly_floats[k + 1], poly_floats[k]) for k in range(0, n, 2)]
    x_coords, y_coords = zip(*poly_coords)
    x_coords = dateline_fix_x(x_coords)
    geometry = Polygon(list(zip(x_coords, y_coords)))

    # Example title: 'HLS.L30.T23XNL.2023122T171828.v2.0'
    #   index 2 -> tile id ('T23XNL'), index 3 -> acquisition timestamp ('2023122T171828')
    id_parts = granule_id.split('.')
    out['time_acq_str'] = id_parts[3] + 'Z'
    out['mgrs_tile_id'] = id_parts[2]
    out['geometry'] = geometry

    return out

In [11]:
# Format every granule record. A comprehension is used instead of list(map(...)) because
# map() treats a StopIteration escaping the mapped function as end-of-input, which would
# silently shorten the result instead of surfacing the error. No list copy is needed either.
hls_metadata_formatted = [format_hls(item) for item in tqdm(hls_metadata)]

NameError: name 'hls_metadata' is not defined

In [None]:
# One row per granule; the 'geometry' key of each record supplies the footprint polygon.
# NOTE(review): no CRS is assigned here - the coordinates look like lon/lat degrees, so
# EPSG:4326 is presumably intended; confirm and set it if downstream tools need it.
df = gpd.GeoDataFrame(hls_metadata_formatted)
df.head()

In [None]:
# Draw only the footprint outlines so overlapping granules stay visible.
df.exterior.plot()

In [None]:
def get_mask_data_hls(url: str) -> dict:
    """Summarise how much of band 1 of a raster is masked out.

    Parameters
    ----------
    url : str
        Location of the raster, passed straight to ``rasterio.open``.

    Returns
    -------
    dict
        ``total_nodata_pixels_hls``: number of pixels whose band-1 mask value is zero.
        ``percent_nodata_hls``: that count as a percentage of all pixels in the band.
    """
    with rasterio.open(url) as src:
        valid_mask = src.read_masks(1).astype(bool)

    # Invert: True now marks the masked-out (nodata) pixels.
    nodata_mask = ~valid_mask
    nodata_count = nodata_mask.sum()
    nodata_percent = nodata_count / nodata_mask.size * 100
    return {
        'total_nodata_pixels_hls': nodata_count,
        'percent_nodata_hls': nodata_percent,
    }

def get_mask_data_from_item(item: dict) -> dict:
    """Compute the nodata statistics for one formatted granule record.

    Reads the record's ``'B11_link'`` entry and delegates to ``get_mask_data_hls``.
    """
    return get_mask_data_hls(item['B11_link'])

In [None]:
# Sequential trial run on the first four records only, to check the remote reads work.
no_data_info = list(map(get_mask_data_from_item, tqdm(hls_metadata_formatted[:4])))
no_data_info

In [None]:
# Compute the nodata statistics for every formatted record (not just a sample), so the
# result lines up one-to-one with hls_metadata_formatted and the progress total is accurate.
# Threads are used because each call spends its time waiting on a remote raster read;
# executor.map yields results in input order.
n = len(hls_metadata_formatted)
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    no_data_info = list(tqdm(executor.map(get_mask_data_from_item, hls_metadata_formatted), total=n))

In [None]:
# Pair each formatted record with its nodata statistics by position and merge the two
# dicts into one row; on a key clash the nodata value wins.
hls_data_formatted_plus_nodata = [
    dict(granule_record, **nodata_record)
    for granule_record, nodata_record in zip(hls_metadata_formatted, no_data_info)
]
df_nodata = gpd.GeoDataFrame(hls_data_formatted_plus_nodata)

In [None]:
# Write the frame that actually carries the nodata columns, matching the output file name.
df_nodata.to_file('hls_metadata_with_nodata.geojson', driver='GeoJSON')