In [None]:
import datetime
import glob
import os
import earthaccess
import numpy as np
import h5py
import xarray as xr
import rasterio
import rioxarray
import py4eos
import pyproj
from shapely.geometry import Polygon
from rasterio.warp import calculate_default_transform, reproject, Resampling
from matplotlib import pyplot

# Log in through earthaccess; the returned object is kept as `auth`
auth = earthaccess.login()

# Directories that hold the raw downloads for each product
LC_DIR = 'data/MCD12Q1'
VNP16_DIR = 'data/VNP16A2GF'
IMERG_DIR = 'data/IMERG'
# Destinations for the derived (processed) products
# NOTE(review): the land-cover file name says "h15v05" but the input granules
#   read later in this notebook are for tile "h18v05" -- confirm which is intended
OUTPUT_LC_FILENAME = 'data/processed/MODIS_MCD12Q1_Type5_cereal_croplands_h15v05_2023.tiff'
OUTPUT_VNP16_DIR = 'data/processed/VNP16_ET_and_PET'
OUTPUT_IMERG_DIR = 'data/processed'
# (start, end) dates passed as the `temporal` argument of every data search
TIME_PERIOD = ('2023-10-01', '2024-09-30')

```python
help(earthaccess.search_data)
```

```
**bounding_box**: a tuple representing spatial bounds in the form
    `(lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)`
```

In [None]:
# Study-area bounds, ordered as the `bounding_box` argument expects:
#   (lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)
bbox = (1.5, 34.0, 8.0, 37.0)

## Getting a land-cover map for our study area

DOI: https://doi.org/10.5067/MODIS/MCD12Q1.061

In [None]:
# Search for MCD12Q1 granules that overlap both the study area and the time period
results = earthaccess.search_data(
    short_name = 'MCD12Q1', bounding_box = tuple(bbox), temporal = TIME_PERIOD)

In [None]:
# Number of granules returned by the land-cover search
len(results)

In [None]:
# Download only when the land-cover directory is still empty, so that
#   re-running the notebook does not fetch the same files again
if not glob.glob(f'{LC_DIR}/*'):
    earthaccess.download(results, LC_DIR)

In [None]:
# Open the downloaded land-cover granule; the directory comes from LC_DIR so
#   that this cell keeps working if the download location is changed
hdf = py4eos.read_file(
    f'{LC_DIR}/MCD12Q1.A2023001.h18v05.061.2024252125305.hdf', platform = 'MODIS')
hdf

In [None]:
# Export the "LC_Type5" layer as a rasterio dataset; with an empty file name
#   and the "MEM" driver nothing is written to a file path
lc_raster = hdf.to_rasterio('LC_Type5', filename = '', driver = 'MEM')

In [None]:
# NOTE: Resampling to 1-km resolution; Resampling.mode assigns each output cell
#   the most frequent land-cover class among the input cells it covers
# See more at: https://rasterio.readthedocs.io/en/stable/topics/resampling.html
lc_map = lc_raster.read(out_shape = (1200, 1200), resampling = Resampling.mode)
# Change the array contents to a binary [0,1] array, showing Croplands=1;
#   cast to uint8 so the array matches the dtype of the rasters it is written to
lc_map = np.where(lc_map == 7, 1, 0).astype(np.uint8)

# The array returned by read() has a leading band axis; plot the first band
pyplot.imshow(lc_map[0], interpolation = 'nearest')

In [None]:
# NOTE: This gives the affine transform of the 1200 x 1200 grid once projected
#   to EPSG:6933; we use it primarily to figure out the new width and height

new_transform, width, height = calculate_default_transform(
    src_crs = lc_raster.crs,
    dst_crs = pyproj.CRS(6933),
    width = 1200,
    height = 1200,
    **lc_raster.bounds._asdict()) # i.e., left, bottom, right, top

In [None]:
# NOTE: The in-memory raster is opened with 'w+' (write, then read it back as
#   the source of the reprojection); the output file only needs 'w'
# NOTE: Both rasters are uint8, which is enough for a binary [0,1] map

# Make sure the output directory exists before creating the file
output_dir = os.path.dirname(OUTPUT_LC_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok = True)

# Write the resized array to a raster dataset; the pixel size is doubled
#   because the grid went from the native size to 1200 x 1200 (1 km)
with rasterio.open(
        '', 'w+', driver = 'MEM', count = 1, width = 1200, height = 1200,
        dtype = np.uint8, crs = lc_raster.crs,
        transform = lc_raster.transform * lc_raster.transform.scale(2)) as resized_raster:
    resized_raster.write(lc_map[0].astype(np.uint8), 1)

    # The output must be tagged with the *destination* CRS (EPSG:6933), the
    #   one that new_transform, width, and height were computed for
    with rasterio.open(
            OUTPUT_LC_FILENAME, 'w', count = 1, width = width, height = height,
            dtype = np.uint8, crs = pyproj.CRS(6933),
            transform = new_transform) as output_raster:
        # Writing the new array to the file; nearest-neighbor keeps the map binary
        reproject(
            source = rasterio.band(resized_raster, 1),
            destination = rasterio.band(output_raster, 1),
            resampling = Resampling.nearest,
            src_nodata = 0,
            dst_nodata = 0)
# Both datasets are closed on leaving the "with" blocks, which flushes the file

## Downloading ET data

Note that we could have used the Hargreaves equation (M2) instead.

In [None]:
# Search for VNP16A2GF granules that overlap both the study area and the time period
results = earthaccess.search_data(
    short_name = 'VNP16A2GF', bounding_box = tuple(bbox), temporal = TIME_PERIOD)

In [None]:
# Check that we're getting N results for N dates; i.e., the number of granules
#   should match the number of dates expected within TIME_PERIOD
#   (assumes a single tile matches the bounding box -- TODO confirm)

len(results)

In [None]:
# Only download when granules are still missing; i.e., when there are fewer
#   local files than search results (this includes an empty directory)
if len(glob.glob(f'{VNP16_DIR}/*')) < len(results):
    earthaccess.download(results, VNP16_DIR)

In [None]:
# Open one downloaded VIIRS granule as an example; the directory comes from
#   VNP16_DIR so that this cell keeps working if the download location is changed
hdf = py4eos.read_file(
    f'{VNP16_DIR}/VNP16A2GF.A2024001.h18v05.002.2025021191652.h5', platform = 'VIIRS')
hdf

In [None]:
# TODO Note scale factors and valid min/max

et0 = hdf.get('ET_500m')
# Values with a magnitude below 32700 are multiplied by 0.1; all others become NaN
et = np.where(np.abs(et0) < 32700, et0 * 0.1, np.nan)

In [None]:
# Show the masked, rescaled ET array without smoothing between pixels
pyplot.imshow(et, interpolation = 'nearest')
pyplot.colorbar()

In [None]:
def reproject_viirs(hdf, field, output_path = '', driver = 'MEM'):
    '''
    Reprojects a VIIRS ET dataset to the global EASE-Grid 2.0.

    The field is first resampled to a 1200 x 1200 (1-km) grid by averaging,
    then projected to EPSG:6933 using bilinear interpolation. The intermediate
    in-memory rasters are closed before returning. The returned dataset is
    left open: the caller should close it when done, which is also what
    flushes the data when writing to a file on disk.

    Parameters
    ----------
    hdf : py4eos.EOSHDF4
        The EOSHDF4 instance connected to the VIIRS ET dataset
    field : str
        The name of the data variable, e.g., "ET_500m"
    output_path : str
        (Optional) The file path, if writing to a file on disk
    driver : str
        (Optional) The driver name, defaults to "MEM"

    Returns
    -------
    rasterio.io.DatasetWriter
        The open, reprojected dataset
    '''
    et_raster = hdf.to_rasterio(
        field, filename = '', driver = 'MEM', nodata = 32766., scale_and_offset = True)
    try:
        # First, resample the ET data to 1-km resolution
        arr = et_raster.read(out_shape = (1200, 1200), resampling = Resampling.average)
        # Mask out-of-range values; cast to match the float32 rasters below
        arr = np.where(np.abs(arr) >= 32700, np.nan, arr).astype(np.float32)
        # We have to re-create the raster dataset, now at 1-km resolution
        with rasterio.open(
                '', 'w+', driver = 'MEM', height = 1200, width = 1200,
                count = 1, dtype = np.float32, crs = et_raster.crs,
                transform = et_raster.transform * et_raster.transform.scale(2) # NOTE: Scaling to 1 km
            ) as et_raster_1km:
            et_raster_1km.write(arr[0], 1)

            # Second, project the data onto a global EASE-Grid 2.0
            new_transform, width, height = calculate_default_transform(
                et_raster_1km.crs, pyproj.CRS(6933), 1200, 1200, *et_raster_1km.bounds)
            et_raster_ease2 = rasterio.open(
                output_path, 'w+', driver = driver, height = height, width = width,
                count = 1, dtype = np.float32, crs = pyproj.CRS(6933),
                transform = new_transform)
            try:
                reproject(
                    source = rasterio.band(et_raster_1km, 1),
                    destination = rasterio.band(et_raster_ease2, 1),
                    resampling = Resampling.bilinear,
                    src_nodata = np.nan, # Necessary so that missing data is interpolated
                    dst_nodata = np.nan)
            except Exception:
                # Don't leak the output handle if the reprojection fails
                et_raster_ease2.close()
                raise
    finally:
        # The source raster is no longer needed once the data have been read
        et_raster.close()
    return et_raster_ease2

In [None]:
# Reproject the example granule's ET field; with the default arguments the
#   result is an in-memory dataset, which is kept open for use in later cells
et_raster_ease2 = reproject_viirs(hdf, 'ET_500m')
# Read the first (only) band as a 2D array and display it
img = et_raster_ease2.read(1)
pyplot.imshow(img, interpolation = 'nearest')
pyplot.colorbar()

In [None]:
from tqdm import tqdm

# Make sure the output directory exists before any file is written
os.makedirs(OUTPUT_VNP16_DIR, exist_ok = True)

file_list = sorted(glob.glob(f'{VNP16_DIR}/*'))

for filename in tqdm(file_list):
    # The second dot-separated part of the file name is the date, e.g., "A2024001"
    #   (year and day-of-year); os.path.basename works with any path separator
    date = datetime.datetime.strptime(os.path.basename(filename).split('.')[1], 'A%Y%j')
    date_str = date.strftime('%Y%m%d')
    output_file_tpl = f'{OUTPUT_VNP16_DIR}/VNP16_%s_mm_8day-1_{date_str}.tiff'
    hdf = py4eos.read_file(filename, platform = 'VIIRS')
    for field, label in (('ET_500m', 'ET'), ('PET_500m', 'PET')):
        # Close each output right away so its contents are flushed to disk and
        #   we don't accumulate open file handles across the loop
        reproject_viirs(hdf, field, output_file_tpl % label, driver = 'GTiff').close()

## TODO: Add ET data processing to Snakemake

In [None]:
# NOTE(review): this repeats the VNP16A2GF search from the "Downloading ET data"
#   section with the same arguments -- confirm whether this cell is still needed
results = earthaccess.search_data(
    short_name = 'VNP16A2GF', bounding_box = tuple(bbox), temporal = TIME_PERIOD)

## Getting precipitation data from IMERG

In [None]:
# TODO Getting the bounds of our VIIRS tile, for clipping other datasets

bb = et_raster_ease2.bounds
# Corners in order: lower-left, upper-left, upper-right, lower-right
bounds = Polygon(list(zip(
    (bb.left, bb.left, bb.right, bb.right),
    (bb.bottom, bb.top, bb.top, bb.bottom))))
bounds

In [None]:
# TODO Turn this into an exercise for participants?

# Search for GPM_3IMERGDF granules in the time period; no bounding box is given
results = earthaccess.search_data(
    temporal = TIME_PERIOD, short_name = 'GPM_3IMERGDF')

In [None]:
# Download only when the IMERG directory is still empty, so that
#   re-running the notebook does not fetch the same files again
if not glob.glob(f'{IMERG_DIR}/*'):
    earthaccess.download(results, IMERG_DIR)

In [None]:
from tqdm import tqdm

stack = []
# Sort the file names so that the files are always stacked in the same order,
#   rather than in whatever order the file system happens to list them
for filename in tqdm(sorted(glob.glob(f'{IMERG_DIR}/*.nc4'))):
    # The source file is closed as soon as the reprojected, clipped copy exists
    with xr.open_dataset(filename) as ds:
        # Keep only precipitation, put the dimensions in (time, lat, lon) order,
        #   declare the data as EPSG:4326, project to EPSG:6933 at a resolution
        #   of 9000, and clip to the bounds of the VIIRS tile
        ds_ease2 = ds[['precipitation']]\
            .transpose('time', 'lat', 'lon')\
            .rio.write_crs(4326)\
            .rio.set_spatial_dims('lon', 'lat')\
            .rio.reproject(pyproj.CRS(6933), resolution = 9000)\
            .rio.clip([bounds])
    stack.append(ds_ease2)

In [None]:
# Join the per-file datasets end-to-end along the "time" dimension
ds_precip = xr.concat(objs = stack, dim = 'time')

## Packaging derived data products

In [None]:
# OUTPUT_IMERG_DIR comes from the configuration cell at the top of the notebook;
#   make sure the directory exists before writing into it
os.makedirs(OUTPUT_IMERG_DIR, exist_ok = True)

# TODO note the units
ds_precip.to_netcdf(f'{OUTPUT_IMERG_DIR}/IMERG_precip_mm_day-1_for_study_area.nc4')

## TODO: Add precipitation data processing to Snakemake