# M3.2 - Analyzing a Global Precipitation Data Cube

*Part of:* [**Open Science for Water Resources**](https://github.com/OpenClimateScience/M3-Open-Science-for-Water-Resources)

In [None]:
import datetime
import earthaccess
import numpy as np
import h5py
import xarray as xr
from matplotlib import pyplot

# Authenticate through earthaccess before searching for or downloading data;
#    the value returned by login() is kept as `auth`
auth = earthaccess.login()

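$$
P = E + R + \Delta S
$$

where $P$ is precipitation, $E$ is evapotranspiration, $R$ is runoff, and $\Delta S$ is the change in water storage.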

![](./assets/water_budget.png)

[*Image courtesy of the USGS*](https://www.usgs.gov/media/images/components-a-simple-water-budget-part-a-watershed)

---

## Calculating basin-scale precipitation

Data source: [GPM IMERG Final monthly precipitation (`GPM_3IMERGM`)](https://dx.doi.org/10.5067/GPM/IMERG/3B-MONTH/07)

In [None]:
import os
import geopandas

# Directory that holds the Yellowstone River shapefiles. It is defined once so
#    that running the notebook on another machine means editing a single line
#    instead of three hard-coded absolute paths.
DATA_DIR = '/home/arthur.endsley/Workspace/NTSG/projects/Y2024_TOPS_Training/data'

# Read the drainage basin, the river course, and the surrounding states.
# NOTE(review): two file names are spelled "WSG84" (not "WGS84"); they are kept
#    verbatim on the assumption that they match the files on disk -- confirm
basin = geopandas.read_file(os.path.join(DATA_DIR, 'YellowstoneRiver_drainage_WSG84.shp'))
river = geopandas.read_file(os.path.join(DATA_DIR, 'YellowstoneRiver_course_WSG84.shp'))
states = geopandas.read_file(os.path.join(DATA_DIR, 'YellowstoneRiver_states_WGS84.shp'))

# Display the basin table as the cell output
basin

In [None]:
# Draw the states first and keep the returned Axes so the other layers share it
ax = states.plot(color = 'darkgray', edgecolor = 'black')

# Overlay the basin outline (no fill) and then the river course on the same Axes
basin.plot(color = 'none', edgecolor = 'darkblue', ax = ax)
river.plot(edgecolor = 'lightblue', ax = ax)

### Downloading IMERG-Final precipitation data

In [None]:
# Search the "GPM_3IMERGM" collection for granules within the given date range
results = earthaccess.search_data(
    temporal = ('2014-01-01', '2018-12-31'),
    short_name = 'GPM_3IMERGM',
)

In [None]:
# Download the granules found by the search into the "data_raw/IMERG-Final" directory
earthaccess.download(results, 'data_raw/IMERG-Final')

### Working with multiple HDF5 files

In [None]:
# Peek inside one monthly file: read the longitude and latitude arrays, then
#    report the three array shapes and the "units" attribute of the
#    precipitation dataset, one item per line
with h5py.File('data_raw/IMERG-Final/3B-MO.MS.MRG.3IMERG.20180701-S000000-E235959.07.V07B.HDF5', 'r') as hdf:
    precip_h5 = hdf['Grid/precipitation']
    longitude = hdf['Grid/lon'][:]
    latitude = hdf['Grid/lat'][:]
    print(
        longitude.shape,
        latitude.shape,
        precip_h5.shape,
        precip_h5.attrs['units'],
        sep = '\n')

In [None]:
import glob

# Gather every downloaded HDF5 file, in sorted order, and report how many there are
file_list = sorted(glob.glob('data_raw/IMERG-Final/*.HDF5'))
len(file_list)

In [None]:
# The fifth "."-separated field of the file path begins with an 8-character
#    date stamp; show it for the first file
file_list[0].split('.')[4][:8]

In [None]:
# Start with the first file in the sorted list
filename = file_list[0]

# Open the "Grid" group of the file. We only want the "precipitation" variable,
#    but as an xarray.Dataset, so the variable name is passed to get() in a list
ds = xr.open_dataset(filename, group = 'Grid')
ds = ds.get(['precipitation'])

# Optional: if the coordinates were missing, they could be defined like this
# date = datetime.datetime.strptime(filename.split('.')[4][0:8], '%Y%m%d') # e.g., "20180101"
# ds = ds.assign_coords({
#     'time': [date], 'x': longitude, 'y': latitude
# })

# Map the precipitation field with longitude on the horizontal axis
ds['precipitation'].plot(x = 'lon', vmax = 2)

### Spatial subsetting of an `xarray` Dataset

In [None]:
# Importing rioxarray is what registers the ".rio" accessor on xarray objects;
#    without this import, ds.rio raises an AttributeError on a fresh kernel
import rioxarray  # noqa: F401
from pyproj import CRS

# Record the coordinate reference system (EPSG:4326) on the dataset, then tell
#    rioxarray which dimensions hold the x and y coordinates
ds = ds.rio.write_crs(CRS.from_epsg(4326))
ds = ds.rio.set_spatial_dims(x_dim = 'lon', y_dim = 'lat')

In [None]:
# Clip the dataset to the basin geometry, passing along the basin's own CRS
ds_clip = ds.rio.clip(
    geometries = basin.geometry.values,
    crs = basin.crs,
    from_disk = True)

# Display the clipped precipitation array
ds_clip['precipitation']

In [None]:
# Plot the first slice of the clipped precipitation, longitude on the horizontal axis
ds_clip['precipitation'][0].plot(x = 'lon')

---

## Creating a data processing pipeline

In [None]:
import os

# Clipped, single-file datasets; concatenated along "time" after the loop
datasets = []

for filename in file_list:
    # Parse the date stamp (e.g., "20180701") from the file's base name, so that
    #    a "." in a directory name cannot shift the field index. `date` is not
    #    used further in this cell.
    basename = os.path.basename(filename)
    date = datetime.datetime.strptime(basename.split('.')[4][0:8], '%Y%m%d')

    # Indexing with a list returns an xarray.Dataset holding only that variable;
    #    unlike get(), it raises a KeyError immediately if "precipitation" is
    #    missing instead of silently returning None
    ds0 = xr.open_dataset(filename, group = 'Grid')[['precipitation']]

    # Define the coordinate reference system (CRS) and the spatial coordinates
    ds0 = ds0.rio.write_crs(CRS.from_epsg(4326))
    ds0 = ds0.rio.set_spatial_dims('lon', 'lat')

    # Clip to the basin geometry and collect the result for concatenation
    ds_clip = ds0.rio.clip(basin.geometry.values, basin.crs, from_disk = True)
    datasets.append(ds_clip)

# Combine the per-file datasets along the "time" dimension and display the result
ds = xr.concat(datasets, dim = 'time')
ds

In [None]:
# Plot the first slice of the combined dataset, longitude on the horizontal axis
ds['precipitation'][0].plot(x = 'lon')

## Calculating total basin-wide precipitation

In [None]:
# Sum precipitation over both spatial dimensions
ds['precipitation'].sum(dim = ['lon', 'lat'])

In [None]:
# Reduce over the spatial dimensions, then pull out the underlying array
basin_sum = ds['precipitation'].sum(dim = ['lon', 'lat'])
precip_series = basin_sum.values

# Plot the resulting series against its index
pyplot.plot(precip_series)

In [None]:
import calendar

# Show the module's list of month lengths, indexed by month number (index 0 is
#    a placeholder). The list does not depend on the year.
calendar.mdays

In [None]:
# Look up the length of each time step's month with calendar.monthrange(),
#    which takes the year into account. calendar.mdays always lists 28 days for
#    February, so it would undercount February in a leap year (e.g., 2016).
years = ds.coords['time.year'].values
months = ds.coords['time.month'].values
days_in_month = np.array([
    calendar.monthrange(int(year), int(month))[1]
    for year, month in zip(years, months)
])
days_in_month

In [None]:
# Scale each month's value by the number of days in that month.
# NOTE(review): if the "units" attribute printed earlier is a per-hour rate,
#    a factor of 24 (hours per day) is also needed to get a monthly total -- confirm
precip_total = precip_series * days_in_month