# Data downloads

Download the required datasets that are not included in the repository.

In [1]:
# Uncomment to install earthaccess into the kernel's environment:
# %pip install -q earthaccess

Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import requests
import zipfile
import geopandas as gpd
import earthaccess
from osgeo import gdal
import rioxarray

In [3]:
# Directory that the download cells below write external datasets into.
data_dir = "data/external/"
# Create it up front so the downloads do not fail on a fresh clone where it is missing.
os.makedirs(data_dir, exist_ok=True)

# GRWL
Global River Widths from Landsat (GRWL): a vector dataset of the rivers that are visible in Landsat imagery.

In [11]:
# GRWL summary stats
# https://zenodo.org/records/1297434
url = 'https://zenodo.org/records/1297434/files/GRWL_summaryStats_V01.01.zip?download=1'
out_fn = data_dir + 'GRWL_summaryStats_V01.01.zip'

# Stream the archive to disk in chunks rather than buffering it all in memory.
# raise_for_status() stops the cell on an HTTP error; previously a failed request
# was silently skipped and the unzip step failed with a confusing "file not found".
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(out_fn, 'wb') as zip_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            zip_file.write(chunk)

# unzip
with zipfile.ZipFile(out_fn, "r") as zip_ref:
    zip_ref.extractall(data_dir + 'GRWL_summaryStats_V01')

# remove zipfile
os.remove(out_fn)

# HydroRIVERS
River reach vector file

In [18]:
# HydroRIVERS
# https://www.hydrosheds.org/products/hydrorivers
url = 'https://data.hydrosheds.org/file/HydroRIVERS/HydroRIVERS_v10_as_shp.zip'
out_fn = data_dir + 'HydroRIVERS_v10_as_shp.zip'

# Stream the archive to disk in chunks rather than buffering it all in memory.
# raise_for_status() stops the cell on an HTTP error; previously a failed request
# was silently skipped and the unzip step failed with a confusing "file not found".
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(out_fn, 'wb') as zip_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            zip_file.write(chunk)

# unzip
with zipfile.ZipFile(out_fn, "r") as zip_ref:
    zip_ref.extractall(data_dir)  # it will unzip into a directory

# remove zipfile
os.remove(out_fn)

# Country boundaries
For all of Asia

In [17]:
# asia polygons (for visualization)
# in this case, just load directly into memory and only
# save a simplified version
url = 'https://stacks.stanford.edu/file/druid:yg089df0008/data.zip'
out_fn = data_dir + 'asia_polygons.gpkg'

asia_polygons = gpd.read_file(url)

# Select the Asian rows *before* reprojecting and simplifying, so no work is spent
# on geometries that would be discarded anyway. The saved result is unchanged
# because simplification is applied per geometry.
asia = asia_polygons[asia_polygons['REGION'] == 'ASIA']

# Simplify in a projected CRS (EPSG:32648, units of metres) so the tolerance of
# 1000 is a distance in metres, then return to the source CRS before saving.
asia = asia.to_crs('EPSG:32648')
asia['geometry'] = asia.simplify(1000)
asia = asia.to_crs(asia_polygons.crs)
asia.to_file(out_fn)

# Water classification
JRC Global Surface Water occurrence raster (tile 100E, 20N).

In [4]:
# JRC Global Surface Water occurrence tile (100E, 20N)
url = 'https://storage.googleapis.com/global-surface-water/downloads2021/occurrence/occurrence_100E_20Nv1_4_2021.tif'
out_fn = data_dir + 'occurrence_100E_20Nv1_4_2021.tif'

# Stream the GeoTIFF to disk in chunks rather than buffering it all in memory.
# raise_for_status() makes a failed request stop the cell instead of silently
# leaving no file (or a stale one) on disk.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(out_fn, 'wb') as tif_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            tif_file.write(chunk)




# Tonle Sap Lake
GeoJSON outline of Tonle Sap Lake (TSL), used for plotting.

In [33]:
# Tonle Sap Lake: pull the single lake feature out of the Cambodia water-bodies layer
# from https://data.opendevelopmentmekong.net/dataset/water-bodies-in-cambodia
url = (
    'https://data.opendevelopmentcambodia.net/en/dataset/'
    '3c74812b-0a78-4407-a072-6f91c58de45b/resource/'
    'a76793df-76aa-469a-97bc-ae3a04b6faad/download/water-bodies.geojson'
)
out_fn = 'data/tonle_sap_lake.geojson'

lakes = gpd.read_file(url)

# Boolean mask selecting the feature whose name is exactly 'Boeung Tonle Sap'
is_tonle_sap = lakes['name'] == 'Boeung Tonle Sap'
tsl = lakes.loc[is_tonle_sap]
tsl.to_file(out_fn)

# Lower Mekong Basin
Mekong basin polygon clipped to the lower-basin countries (i.e. with China removed), for plotting

In [61]:
# Lower Mekong Basin (LMB): the Mekong polygon with the China portion removed,
# obtained by clipping it to the remaining countries.
lmb_countries = ['VIETNAM', 'THAILAND', 'CAMBODIA', 'LAOS', 'BURMA']

countries = gpd.read_file(data_dir + 'asia_polygons.gpkg')
mekong = gpd.read_file('data/mekong.geojson')

# Country polygons used as the clip mask
lmb_mask = countries[countries['NAME'].isin(lmb_countries)]
lmb = mekong.clip(lmb_mask)
lmb.to_file('data/lmb.geojson')

## Load HLS dataset over the extent needed
Use this as a template for setting up masks etc

In [10]:
# open HLS dataset using rioxarray
# EarthAccess setup for HLS
# https://github.com/nasa/HLS-Data-Resources/blob/main/python/tutorials/HLS_Tutorial.ipynb

earthaccess.login(persist=True)

# GDAL hands the cookie path to libcurl verbatim and neither expands "~", so resolve
# it to an absolute path in the user's home directory here.
cookie_file = os.path.expanduser('~/cookies.txt')

# GDAL configurations used to successfully access LP DAAC Cloud Assets via vsicurl
gdal_options = {
    'GDAL_HTTP_COOKIEFILE': cookie_file,
    'GDAL_HTTP_COOKIEJAR': cookie_file,
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    'CPL_VSIL_CURL_ALLOWED_EXTENSIONS': 'TIF',
    # NOTE(review): this turns off TLS certificate verification for GDAL's HTTP
    # requests. Kept as in the referenced tutorial; confirm it is still required.
    'GDAL_HTTP_UNSAFESSL': 'YES',
}
for option_name, option_value in gdal_options.items():
    gdal.SetConfigOption(option_name, option_value)

In [11]:
# URLs of the red band (B04) GeoTIFFs for the two HLS S30 tiles used below
# see 00-1_pull_hls_data to see where this url comes from
_hls_root = 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020'
_hls_granule = 'HLS.S30.{tile}.2023325T032029.v2.0'


def _hls_b04_url(tile):
    """Return the B04 asset URL for the given tile id of the granule above."""
    granule = _hls_granule.format(tile=tile)
    return f"{_hls_root}/{granule}/{granule}.B04.tif"


url_pxa = _hls_b04_url('T48PXA')
url_pwv = _hls_b04_url('T48PWV')


In [12]:

# Open the two HLS tiles over vsicurl (be patient, may take a few seconds).
# Because `chunks` is passed, rioxarray opens each raster as a chunked (dask-backed)
# array rather than reading every pixel up front.
chunk_size = dict(band=1, x=512, y=512) # Tiles have 1 band and are divided into 512x512 pixel chunks
# Sometimes a vsi curl error occurs so we need to retry if it does
max_retries = 10


def _open_hls_red(url, chunks=chunk_size, retries=max_retries):
    """Open the single-band HLS raster at `url`, retrying on transient errors.

    Parameters
    ----------
    url : str
        Location of the GeoTIFF to open.
    chunks : dict
        Chunk sizes passed to `rioxarray.open_rasterio`.
    retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    The opened raster with the 'band' dimension squeezed out and its
    'scale_factor' attribute set to 0.0001.

    Raises
    ------
    RuntimeError
        If every attempt fails. The last underlying error is chained as the cause.
    """
    last_error = None
    for _attempt in range(retries):
        try:
            # pull hls (red band)
            band = rioxarray.open_rasterio(url, chunks=chunks, masked=True).squeeze('band')
        except Exception as ex:
            last_error = ex
            print(f"vsi curl error: {ex}. Retrying...")
        else:
            band.attrs['scale_factor'] = 0.0001 # hard coded the scale_factor attribute
            return band
    # Fail loudly: carrying on would leave the variable undefined (or stale from an
    # earlier run) and break the merge step with a far less helpful error.
    raise RuntimeError(
        f"Failed to process {url} after {retries} retries. Please check to see you're authenticated with earthaccess."
    ) from last_error


# Load PXA
hls_red_pxa = _open_hls_red(url_pxa)

# Load PWV
hls_red_pwv = _open_hls_red(url_pwv)

# Only reached when both tiles opened successfully
print("The COGs have been loaded into memory!")

The COGs have been loaded into memory!


In [13]:
# Bounding window used to subset the HLS data.
# Coordinates are in UTM because that is the projection the HLS rasters use
# (per the original note, taken from the UTM-projected rivers layer).
# Order assumed to be (left, bottom, right, top), as passed to the merge `bounds` argument.
win_left, win_bottom = 560172.6343010075, 1450022.6611399346
win_right, win_top = 654025.1724448078, 1542844.933869364
window_bounds = (win_left, win_bottom, win_right, win_top)


In [16]:
# Mosaic the two red-band tiles into one dataset spanning all the sample sites
# and cache it as NetCDF. The merge is slow, so it is skipped when the cached
# file already exists.
hls_fn = data_dir + 'hls_region.nc'
hls_cached = os.path.exists(hls_fn)
if not hls_cached:
    import rioxarray.merge
    # merge_datasets wants xarray Datasets, so wrap each DataArray as a
    # one-variable Dataset named 'red' (PWV first, then PXA)
    red_tiles = [tile.to_dataset(name='red') for tile in (hls_red_pwv, hls_red_pxa)]
    hls_merged = rioxarray.merge.merge_datasets(red_tiles, bounds=window_bounds)
    hls_merged.to_netcdf(hls_fn)