# Generating a Mosaicked Image of Lake Mead

## Outline of steps for analysis

+ Identifying search parameters
    + AOI, time-window
    + Endpoint, Provider, catalog identifier ("short name")
+ Obtaining search results
    + Introspect & examine the results to identify features, bands of interest
    + Wrap results into a DataFrame for easier exploration
+ Exploring & refining search results
    + Identify granules of highest value
    + Filter extraneous granules with minimal contribution
    + Assemble relevant filtered granules into DataFrame
    + Identify kind of output to generate
+ Data-wrangling to produce relevant output
    + Download relevant granules into Xarray DataArray, stacked appropriately
    + Do intermediate computations as necessary
    + Assemble relevant data slices into visualization

---

### Preliminary imports

In [1]:
from warnings import filterwarnings
# Suppress every warning for the rest of the session; this keeps cell output tidy
# but also hides any warnings raised by the libraries used below.
filterwarnings('ignore')
# data wrangling imports
import numpy as np
import pandas as pd
import xarray as xr
# importing rioxarray registers the `.rio` accessor on xarray objects (used in urls_to_stack)
import rioxarray as rio
import rasterio

In [2]:
# Imports for plotting
# hvplot.pandas / hvplot.xarray are imported for their side effect:
# they attach the `.hvplot` plotting accessor to pandas & xarray objects.
import hvplot.pandas
import hvplot.xarray
import geoviews as gv
from geoviews import opts
# Select Bokeh as the rendering backend for GeoViews plots
gv.extension('bokeh')

In [3]:
# STAC imports to retrieve cloud data
import os
from pystac_client import Client
from osgeo import gdal
# GDAL setup for accessing cloud data
# '~' is shell syntax: GDAL passes these paths on unchanged, so a literal '~/...'
# would be looked up relative to the working directory. Resolve the home
# directory explicitly so the cookie file/jar really live in $HOME.
cookie_path = os.path.expanduser('~/.cookies.txt')
gdal.SetConfigOption('GDAL_HTTP_COOKIEFILE', cookie_path)   # cookies are read from here
gdal.SetConfigOption('GDAL_HTTP_COOKIEJAR', cookie_path)    # ...and written back here
# Do not list sibling files when opening a remote dataset
gdal.SetConfigOption('GDAL_DISABLE_READDIR_ON_OPEN','EMPTY_DIR')
# Restrict remote (curl) access to GeoTIFF files
gdal.SetConfigOption('CPL_VSIL_CURL_ALLOWED_EXTENSIONS','TIF, TIFF')

### Convenient utilities

These functions could be placed in module files for more developed research projects. For learning purposes, they are embedded within this notebook.

In [4]:
# simple utility to make a rectangle with given center of width dx & height dy
def make_bbox(pt,dx,dy):
    '''Builds an axis-aligned bounding-box centred on a point.

    pt     : centre of the box, given as (x, y)
    dx, dy : full width & full height of the box

    Returns the tuple (x_lo, y_lo, x_hi, y_hi), where
    x_lo = x-dx/2, y_lo = y-dy/2, x_hi = x+dx/2, y_hi = y+dy/2.
    '''
    half_sizes = (dx/2, dy/2)
    # lower-left corner first, then upper-right corner
    lower_corner = [centre - half for centre, half in zip(pt, half_sizes)]
    upper_corner = [centre + half for centre, half in zip(pt, half_sizes)]
    return tuple(lower_corner + upper_corner)

In [5]:
# simple utility to plot an AOI or bounding-box
def plot_bbox(bbox):
    '''Draws a bounding-box as a GeoViews overlay: a Point at its centre plus the Rectangle.
    + bbox: bounding-box specified as (lon_min, lat_min, lon_max, lat_max)
    Coordinates are assumed to be longitude-latitude.
    '''
    # Centre = midpoint of the two longitudes (even positions) & of the two latitudes (odd positions)
    centre_lon = 0.5*sum(bbox[::2])
    centre_lat = 0.5*sum(bbox[1::2])
    centre_marker = gv.Points([(centre_lon, centre_lat)])
    outline = gv.Rectangles([bbox])
    # Default styling for the two layers (fixed here, but can be over-ridden on the result)
    styling = (
        opts.Points(size=12, alpha=0.25, color='blue'),
        opts.Rectangles(line_width=0, alpha=0.1, color='red'),
    )
    return (centre_marker * outline).opts(*styling)

In [6]:
# utility to extract search results into a Pandas DataFrame
def search_to_dataframe(search):
    '''Constructs Pandas DataFrame from PySTAC Earthdata search results.

    One row is produced per (granule, asset) pair.

    Parameters
    ----------
    search : object whose .items() yields granules. Each granule must expose
        .id (str), .properties (dict) and .assets (dict: name -> object with .href).

    Returns
    -------
    pandas.DataFrame with a default integer index and object-dtype columns:
        + one column per property key found on any granule, in alphabetical
          order (None where a granule lacks that property), followed by
        + 'asset': string identifying an Asset type associated with a granule
        + 'href': data URL for file associated with the Asset in a given row
        + 'tile_id': fourth '_'-separated field of the granule id.

    Raises
    ------
    AssertionError : the search returned no granules, or the granules carry no assets.
    IndexError     : a granule id has fewer than four '_'-separated fields.
    '''
    granules = list(search.items())
    assert granules, "Error: empty list of search results"
    # Sort the union of property names: iterating a set of strings gives an order that
    # can change from one interpreter session to the next, which made column order unstable.
    props = sorted({prop for g in granules for prop in g.properties})
    records = []
    for g in granules:
        tile_id = g.id.split('_')[3]
        prop_values = [g.properties.get(k) for k in props]
        for asset_name, asset in g.assets.items():
            records.append(prop_values + [asset_name, asset.href, tile_id])
    # Build the frame once from all records rather than concatenating one single-row
    # frame per asset. dtype=object leaves the columns untyped, so casting stays a
    # decision of the caller.
    df = pd.DataFrame(records, columns=props + ['asset', 'href', 'tile_id'], dtype=object)
    assert len(df), "Empty DataFrame"
    return df

In [7]:
# utility to process DataFrame of search results & return DataArray of stacked raster images
def urls_to_stack(granule_dataframe):
    '''Processes DataFrame of PySTAC search results (with OPERA tile URLs) &
    returns stacked Xarray DataArray (dimensions time, latitude, & longitude).

    Parameters
    ----------
    granule_dataframe : DataFrame with one raster per row. Each row needs an
        'href' value that rasterio can open; the row's index label becomes the
        'time' coordinate of that raster.

    Returns
    -------
    xarray.DataArray : the per-row rasters concatenated along 'time', with all
        length-1 dimensions squeezed out (so 'band' only survives for
        multi-band files).

    Notes
    -----
    + The 'lon'/'lat' coordinates hold the upper-left corner of each pixel,
      expressed in the raster's own CRS; they are true longitudes/latitudes
      only if the file is stored in a geographic CRS.
    + An empty DataFrame leaves nothing to concatenate, so xr.concat raises an error.
    '''
    stack = []
    for i, row in granule_dataframe.iterrows():
        with rasterio.open(row.href) as ds:
            # extract CRS string (the part after the last ':', e.g. the EPSG code)
            crs = str(ds.crs).split(':')[-1]
            # upper-left corner of the image: bounds are (xmin, ymin, xmax, ymax)
            xmin, _, _, ymax = ds.bounds
            # the x and y resolution of the image is available in image metadata
            x_res = np.abs(ds.transform[0])
            y_res = np.abs(ds.transform[4])
            # read the data
            img = ds.read()
        # Ensure img has three dimensions (bands, y, x)
        if img.ndim == 2:
            img = np.expand_dims(img, axis=0)
        n_bands, n_rows, n_cols = img.shape
        # Derive the coordinates from the pixel counts. A float-stepped
        # np.arange(start, stop, step) can come out one element longer or shorter
        # than the raster, which would make the DataArray constructor fail.
        lon = xmin + x_res * np.arange(n_cols)
        lat = ymax - y_res * np.arange(n_rows)
        bands = np.arange(n_bands)
        da = xr.DataArray(
                            data=img,
                            dims=["band", "lat", "lon"],
                            coords=dict(
                                        lon=(["lon"], lon),
                                        lat=(["lat"], lat),
                                        time=i,
                                        band=bands
                                        ),
                            attrs=dict(
                                        description="OPERA DSWx B01",
                                        units=None,
                                      ),
                         )
        da.rio.write_crs(crs, inplace=True)
        stack.append(da)
    return xr.concat(stack, dim='time').squeeze()

---

## Identifying search parameters

In [8]:
# Centre of the area of interest, as (longitude, latitude)
lake_mead = (-114.754, 36.131)
# Box of width & height 0.1 (same units as the centre coordinates) around that point
AOI = make_bbox(lake_mead, dx=0.1, dy=0.1)
# Time window of the search, written as "start/end"
DATE_RANGE = "2023-03-01/2023-04-15"

In [17]:
# Optionally plot the AOI
# OpenStreetMap tiles provide the background map for the AOI overlay
basemap = gv.tile_sources.OSM(width=500, height=500, padding=0.1)
plot_bbox(AOI) * basemap

In [10]:
# Keyword arguments for the catalog search: spatial & temporal constraints
search_params = {'bbox': AOI, 'datetime': DATE_RANGE}
print(search_params)

{'bbox': (-114.804, 36.081, -114.70400000000001, 36.181), 'datetime': '2023-03-01/2023-04-15'}


---

## Obtaining search results

In [11]:
ENDPOINT = 'https://cmr.earthdata.nasa.gov/stac'
PROVIDER = 'POCLOUD'
COLLECTIONS = ["OPERA_L3_DSWX-HLS_V1_1.0"]
# Add the list of collections to search to the search_params dictionary
search_params['collections'] = COLLECTIONS
print(search_params)

{'bbox': (-114.804, 36.081, -114.70400000000001, 36.181), 'datetime': '2023-03-01/2023-04-15', 'collections': ['OPERA_L3_DSWX-HLS_V1_1.0']}


In [12]:
# Open the provider-specific STAC catalog, then search it with the parameters assembled above
catalog = Client.open(f'{ENDPOINT}/{PROVIDER}/')
search_results = catalog.search(**search_params)

In [24]:
# Flatten the search results into a table with one row per (granule, asset) pair,
# then preview the first rows and the column dtypes
df = search_to_dataframe(search_results)
display(df.head())
df.info()

Unnamed: 0,eo:cloud_cover,end_datetime,start_datetime,datetime,asset,href,tile_id
0,8,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,browse,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
1,8,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,thumbnail_0,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2,8,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,thumbnail_1,s3://podaac-ops-cumulus-protected/OPERA_L3_DSW...,T11SPA
3,8,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,0_B01_WTR,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
4,8,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,2023-04-09T18:14:46.025Z,0_B02_BWTR,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eo:cloud_cover  224 non-null    object
 1   end_datetime    224 non-null    object
 2   start_datetime  224 non-null    object
 3   datetime        224 non-null    object
 4   asset           224 non-null    object
 5   href            224 non-null    object
 6   tile_id         224 non-null    object
dtypes: object(7)
memory usage: 12.4+ KB


Clean DataFrame `df` in ways that make sense (e.g., dropping unneeded columns/rows, casting columns as fixed datatypes, setting the index, etc.).

In [25]:
# Tidy the raw table in one chain: parse the timestamps, drop the start/end columns,
# shorten the cloud-cover column name, cast the columns to fixed dtypes, and
# index the rows by timestamp in chronological order.
df = (
    df.assign(datetime=pd.DatetimeIndex(df.datetime))
      .drop(columns=['start_datetime', 'end_datetime'])
      .rename(columns={'eo:cloud_cover': 'cloud_cover'})
      .astype({
          'cloud_cover': np.float16,
          'asset': pd.StringDtype(),
          'href': pd.StringDtype(),
          'tile_id': pd.StringDtype(),
      })
      .set_index('datetime')
      .sort_index()
)

In [26]:
# Preview the cleaned table & confirm the new index and column dtypes
display(df.head())
df.info()

Unnamed: 0_level_0,cloud_cover,asset,href,tile_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-09 18:14:46.025000+00:00,8.0,browse,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-09 18:14:46.025000+00:00,8.0,thumbnail_0,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-09 18:14:46.025000+00:00,8.0,thumbnail_1,s3://podaac-ops-cumulus-protected/OPERA_L3_DSW...,T11SPA
2023-04-09 18:14:46.025000+00:00,8.0,0_B01_WTR,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-09 18:14:46.025000+00:00,8.0,0_B02_BWTR,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 224 entries, 2023-04-09 18:14:46.025000+00:00 to 2023-04-15 18:34:28.755000+00:00
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cloud_cover  224 non-null    float16
 1   asset        224 non-null    string 
 2   href         224 non-null    string 
 3   tile_id      224 non-null    string 
dtypes: float16(1), string(3)
memory usage: 7.4 KB


---

## Exploring & refining search results

This consists of filtering rows or columns appropriately to narrow the search results down to the raster data files most suitable for analysis and/or visualization. This can mean focusing on certain geographic tiles, specific bands of the data product, certain dates/timestamps, etc.

In [28]:
# Distinct timestamps present in the (datetime) index
df.index.unique()

DatetimeIndex(['2023-04-09 18:14:46.025000+00:00',
               '2023-04-09 18:15:09.916000+00:00',
               '2023-04-10 18:34:09.947000+00:00',
               '2023-04-10 18:34:13.130000+00:00',
               '2023-04-10 18:34:24.485000+00:00',
               '2023-04-10 18:34:27.817000+00:00',
               '2023-04-12 18:24:14.671000+00:00',
               '2023-04-12 18:24:28.477000+00:00',
               '2023-04-13 18:44:08.381000+00:00',
               '2023-04-13 18:44:20.386000+00:00',
               '2023-04-15 18:34:10.879000+00:00',
               '2023-04-15 18:34:14.050000+00:00',
               '2023-04-15 18:34:25.444000+00:00',
               '2023-04-15 18:34:28.755000+00:00'],
              dtype='datetime64[ns, UTC]', name='datetime', freq=None)

In [30]:
# Number of rows available for each asset type
df.asset.value_counts()

asset
browse         16
thumbnail_0    16
thumbnail_1    16
0_B01_WTR      16
0_B02_BWTR     16
0_B03_CONF     16
0_B04_DIAG     16
0_B05_WTR-1    16
0_B06_WTR-2    16
0_B07_LAND     16
0_B08_SHAD     16
0_B09_CLOUD    16
0_B10_DEM      16
metadata       16
Name: count, dtype: Int64

In [29]:
# Summary statistics of cloud cover over all rows (a granule contributes one row per asset)
df.cloud_cover.agg(['min','mean','median','max'])

min        0.0000
mean      12.5625
median     1.0000
max       93.0000
Name: cloud_cover, dtype: float16

In [32]:
# Keep only rows with cloud cover of at most 10 whose asset name contains 'B01_WTR';
# the two columns used for filtering are not needed afterwards, so drop them.
c1 = df.cloud_cover.le(10)
c2 = df.asset.str.contains('B01_WTR')
b01_wtr = df.loc[c1 & c2].drop(columns=['asset', 'cloud_cover'])
b01_wtr

Unnamed: 0_level_0,href,tile_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-09 18:14:46.025000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-09 18:14:46.025000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SQA
2023-04-09 18:15:09.916000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SQV
2023-04-09 18:15:09.916000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPV
2023-04-10 18:34:09.947000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SQA
2023-04-10 18:34:13.130000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-10 18:34:24.485000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SQV
2023-04-10 18:34:27.817000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPV
2023-04-13 18:44:08.381000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPA
2023-04-13 18:44:20.386000+00:00,https://archive.podaac.earthdata.nasa.gov/poda...,T11SPV


---

In [33]:
# Number of selected B01_WTR rasters available for each tile
b01_wtr.tile_id.value_counts()

tile_id
T11SPA    4
T11SPV    4
T11SQA    3
T11SQV    3
Name: count, dtype: Int64

## Data-wrangling to produce relevant output

This can include stacking two-dimensional arrays into a three-dimensional array, mosaicking raster images from adjacent tiles into a single tile, etc.

In [34]:
# Collect the raster URLs of each tile into a list (one list per tile_id)
b01_wtr.groupby('tile_id')['href'].apply(list)

tile_id
T11SPA    [https://archive.podaac.earthdata.nasa.gov/pod...
T11SPV    [https://archive.podaac.earthdata.nasa.gov/pod...
T11SQA    [https://archive.podaac.earthdata.nasa.gov/pod...
T11SQV    [https://archive.podaac.earthdata.nasa.gov/pod...
Name: href, dtype: object

In [35]:
from rasterio.merge import merge
from rasterio.transform import array_bounds

In [36]:
%%time
mosaicked_img, mosaic_transform = merge(list(granules.href))

NameError: name 'granules' is not defined

---