# Explore SWOT SWORD of Science (SoS) River Discharge data products

In [1]:
import datetime
import netCDF4
from netCDF4 import  chartostring
import pandas as pd
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import matplotlib.pyplot as plt
from pathlib import Path 

## Granule structure

The SWORD of Science (SoS) is a community-driven dataset produced for and from the execution of the Confluence workflow in the cloud which enables quick data access and compute on SWOT data. Data granules contain two files, priors and results. The priors file contains prior information, such as in-situ gage data and model output that is used to generate the discharge products. The results file contains the resulting river discharge data products.

The SoS is organized by continent following [SWOT River Database (SWORD)](https://www.swordexplorer.com/) structure and naming conventions. It is indexed on the same reach and node identifier dimensions found in SWORD. Time series data is stored by cycle and pass on an observation dimension.


More information is available in the SWOT-Confluence GitHub repository:
* [Documentation for priors](https://github.com/SWOT-Confluence/documentation/blob/main/documentation/confluence_data_description-priors.pdf)
* [Documentation for results](https://github.com/SWOT-Confluence/documentation/blob/main/documentation/confluence_data_description-results.pdf)



Results are organized into groups corresponding to modules in the SWOT-Confluence processing software. Modules are described in the [Confluence Module Documentation](https://github.com/SWOT-Confluence/documentation/blob/main/documentation/confluence_module_documentation_v1.0.pdf).

Start by opening the priors and results SoS NetCDF files and investigating the available groups:

In [2]:
# Directory that holds the SoS granule (priors + results NetCDF files).
# Edit this to point at your local copy of the data; a relative path keeps
# the notebook runnable on machines other than the original author's.
SOS_DIR = Path(".")
PP = SOS_DIR  # original name of this directory variable, kept as an alias

# Open the priors file
priors = netCDF4.Dataset(SOS_DIR / "na_sword_v16_SOS_priors.nc", format="NETCDF4")

# Open the results file
results = netCDF4.Dataset(SOS_DIR / "na_sword_v16_SOS_results.nc", format="NETCDF4")


In [4]:
# Position of reach 74268600051 along the reach dimension
reach_id_values = results['reaches/reach_id'][:]
dx = np.where(reach_id_values == 74268600051)[0][0]

# Display the MOMMA "Q" values stored for that reach
momma_q = results['momma']['Q']
momma_q[dx]



array([ 4.56145420e+01,  7.91084288e+01,  7.92677502e+00,  1.95523167e+01,
        3.72200558e+01, -1.00000000e+12,  2.40935614e+01,  3.46569227e+01,
        3.37415283e+00,  3.16012368e+00,  2.23264650e+00])

In [None]:
# Character data stored in validation/algo_names at position dx,
# converted to string form for display
algo_name_chars = results['validation/algo_names'][dx]
chartostring(algo_name_chars)


In [None]:
# List the groups available in the priors file
print("Priors Groups:", priors.groups, sep="\n")

In [None]:
# List the module groups available in the results file
print("Results Groups:", results.groups, sep="\n")

## Plot river reach locations

Information about the spatial location of river reaches is in the reaches and nodes groups including river names. This data is taken directly from [SWOT River Database (SWORD)](https://www.swordexplorer.com/).

In [None]:
# Grab the "reaches" group and print its summary
reaches = results['reaches']
print("Reaches Group")
print(f"{reaches} \n")

# Print the metadata of the "x" variable (labelled Longitude here)
print("Longitude")
print(reaches['x'])

In [None]:
# Unpack the spatial coordinates and river names.
# Reading with [:] pulls the values into in-memory arrays, so later cells
# can index them with arbitrary integer lists using plain NumPy indexing
# rather than netCDF4 Variable indexing.
reaches_group = results.groups['reaches']
reach_lon = reaches_group.variables['x'][:]
reach_lat = reaches_group.variables['y'][:]

river_names = reaches_group.variables['river_name'][:]

# Filter data to only find the Ohio River
idx = np.where(river_names == 'Ohio River')

In [None]:
# Reach identifiers for every reach in the results file, as a plain array
# (any masked entries are replaced by NaN)
reach_id_var = results['reaches']['reach_id']
RIDS = reach_id_var[:].filled(np.nan)
# Hand-picked reach identifier lists.
#
# The notebook originally assigned all three lists to the same name (RL) one
# after another, so only the last assignment ever took effect. Each list now
# has its own name and the active selection is made explicitly at the bottom.

# First, unlabelled list (note that it contains some repeated identifiers)
RL_INITIAL = [
    74261000041, 74265000021, 74266300131, 74266300071, 74266700101,
    74266400031, 74266700051, 74266700141, 74266700201, 74266700171,
    74266400071, 74266400161, 74266700261, 74266400251, 74266800021,
    74266800071, 74266800031, 74266800131, 74267300111, 74266800151,
    74266400511, 74266900041, 74266900131, 74266400451, 74266400451,
    74266400451, 74266900081, 74266400671, 74266400591, 74266900151,
    74266400711, 74266400711, 74266900301, 74266900211, 74266400751,
    74266900491, 74266900511, 74265000121, 74267200031, 74267200121,
    74267200171, 74267200281, 74267300181, 74267300191, 74264700381,
    74267500041, 74267400131, 74267700141, 74264900071, 74269900761,
    74262700531, 74262700551, 74262800311, 74262800351, 74262800371,
    74267600061, 74267600091, 74267600101, 74267600121, 74267600221,
    74267600201, 74267600151, 74267600241, 74267800251, 74267800341,
    74267800201, 74267800051, 74267800151, 74267800141, 74267800091,
    74267800121, 74267800071, 74269400291, 74269400181, 74269400201,
    74269400041, 74269400331, 74269500051, 74269600111, 74269600101,
    74269600061, 74269600051, 74269700031, 74269900551, 74269800211,
    74269800111, 74269900431, 74269800051, 74269900071, 74269900191,
    74269900331, 74269900131, 74269900231, 74269900481, 74269900351,
    74269900271, 74269900811, 74269900521, 74269900701, 74262200241,
    74262200201, 74262200031, 74262200091, 74262200131, 74262200161,
    74264300091, 74262200271, 74262301081, 74262200321, 74262200361,
    74264300121, 74264400101, 74264600021, 74264600291, 74262800181,
    74262800281, 74262800251, 74268900281, 74268300081, 74268400011,
    74269200061, 74268400041, 74268500061, 74268600011, 74268700061,
    74268800021, 74268800051, 74269800381, 74268800131, 74269800481,
    74269800391,
]

# List labelled "momma" in the original cell
RL_MOMMA = [
    74265000121,
    74266400251, 74266400591, 74266700141, 74266800021, 74266800131,
    74266800151, 74267300181, 74267600101, 74267600151, 74267600201,
    74267800071,
    74267800091,
    74267800121,
    74267800141,
    74267800151,
    74268600011,
    74268800021,
    74268900281,
    74269400331,
    74269600051,
    74269800051,
    74269800381,
    74269900271,
    74269900431,
    74269900701,
]

# List labelled "hivdi" in the original cell
RL_HIVDI = [
    74261000041,
    74262200091,
    74262200131,
    74262200161,
    74262200201,
    74262200271,
    74262200361,
    74262301081,
    74262800281,
    74262800311,
    74262800351,
    74264300091,
    74264300121,
    74264600021,
    74264600291,
    74264700381,
    74265000021,
    74265000121,
    74266300071,
    74266300131,
    74266400031,
    74266400251,
    74266400511,
    74266400591,
    74266700051,
    74266700101,
    74266700141,
    74266700171,
    74266700201,
    74266800031,
    74266900151,
    74266900491,
    74266900511,
    74267300191,
    74267400131,
    74267600061,
    74269600061,
]

# Active selection: the "hivdi" list was the last one assigned originally,
# so it is the one the rest of the notebook has always used.
RL = RL_HIVDI
# Map each selected reach identifier to its position in RIDS so it can be
# used to index reach-level variables.
indx = []
for rid in RL:  # `rid` rather than `id`, which would shadow the builtin
    matches = np.where(RIDS == rid)[0]
    if matches.size == 0:
        # Fail with an explicit message instead of "index 0 is out of bounds"
        raise IndexError(f"reach_id {rid} was not found in the results file")
    indx.append(matches[0])


In [None]:
# Use the positions of the hand-picked reaches (indx) as the plotting index.
# This replaces whatever `idx` held before this cell ran.
idx=indx
# Display the selected positions
idx

In [None]:
# Create the figure
fig = plt.figure(figsize=(10,10))

# Add a map axes with coastlines and state borders
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.coastlines()
ax.add_feature(cfeature.STATES, edgecolor='black')

# Add labelled gridlines. Label visibility is controlled through the
# top_labels / left_labels / right_labels attributes (Cartopy 0.18+); the
# older xlabels_top / ylabels_left / ylabels_right names are not honoured
# by recent Cartopy releases.
gl = ax.gridlines(crs=ccrs.PlateCarree(), linewidth=1, color='black', alpha=0.5, linestyle='--', draw_labels=True)
gl.top_labels = False
gl.left_labels = True
gl.right_labels = False
gl.xlines = True

gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER

# Plot the river reach centerpoint locations
ax.scatter(x=reach_lon[idx], y=reach_lat[idx], color='c')

# Add the title
ax.set_title('Ohio River Reach Centerpoint Locations')

# Navigating Reaches and Nodes

The SoS is organized by continent following the conventions set in the [SWOT River Database](https://www.swordexplorer.com/) for the NetCDF file format. Reach identifiers can be found in the "reaches" group and node identifiers can be found in the "nodes" group. The following sections show you how to locate reaches and nodes by river name which allows you to index into the reach and/or node level data.

**How to locate reach and node identifiers by river name**

You can search for a river name using the same convention as used when plotting river reach locations to obtain the reach identifiers for that river. You can then use the reach identifiers to locate the nodes that belong to each reach for that river as the nodes are indexed on a different dimension (num_nodes) than reaches (num_reaches).

In [None]:
# Locate the indexes for the specific river you are interested in
river_names = results['reaches']['river_name'][:]
reach_idx = np.where(river_names == 'Ohio River')

# Locate the reach identifiers for the river name
reach_identifiers = results['reaches']['reach_id'][reach_idx]

# Locate the reach identifiers of interest on the node-level:
# every node records the reach it belongs to, so collect the positions of
# all nodes whose reach_id matches one of the selected reaches
reach_node_identifiers = results['nodes']['reach_id'][:]
node_idx = []
for reach_identifier in reach_identifiers:
    node_idx.extend(np.where(reach_node_identifiers == reach_identifier)[0])

# Locate the node identifiers of interest using the node positions found
# above. The variable is read into memory first and then indexed, so only
# the nodes that belong to the selected reaches are kept.
node_identifiers = results['nodes']['node_id'][:][node_idx]
print(node_identifiers)

In [None]:
# Pull the node-level coordinates for the selected nodes only
nodes_group = results['nodes']
node_lon = nodes_group['x'][node_idx]
node_lat = nodes_group['y'][node_idx]

# Report how many values were selected for each coordinate
for coords in (node_lon, node_lat):
    print(coords.shape)

In [None]:
# Create the figure
fig = plt.figure(figsize=(10,10))

# Add a map axes with coastlines and state borders
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.coastlines()
ax.add_feature(cfeature.STATES, edgecolor='black')

# Add labelled gridlines. Label visibility is controlled through the
# top_labels / left_labels / right_labels attributes (Cartopy 0.18+); the
# older xlabels_top / ylabels_left / ylabels_right names are not honoured
# by recent Cartopy releases.
gl = ax.gridlines(crs=ccrs.PlateCarree(), linewidth=1, color='black', alpha=0.5, linestyle='--', draw_labels=True)
gl.top_labels = False
gl.left_labels = True
gl.right_labels = False
gl.xlines = True

gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER

# Plot the river node centerpoint locations
ax.scatter(x=node_lon, y=node_lat)

# Add the title
ax.set_title('Ohio River Node Centerpoint Locations')

## Plot Discharge Timeseries

The main data of interest in the results files is the timeseries of river discharge (q) estimates produced by each module. The SoS is a global dataset organized by continents and not every reach will have an associated discharge for each module. So it is helpful to filter out missing values in order to isolate and visualize discharge for the various modules.

### How to locate data amongst missing values

You can use the `missing_value` NetCDF variable attribute to locate the value used to indicate missing data. You can then filter on that value to isolate the time steps with discharge estimates. The following example uses the HiVDI algorithm results to demonstrate filtering missing values and plotting discharge.

In [None]:
# Pull the HiVDI discharge values for every reach
hivdi_group = results['hivdi']
hivdi_q = hivdi_group['Q'][:]

# Value the variable declares as its missing-data marker
missing = hivdi_group['Q'].missing_value

# Collect the index of every reach holding at least one non-missing value
data_indexes = []
for i in range(hivdi_q.shape[0]):
    reach_q = hivdi_q[i]
    n_steps = reach_q.shape[0]
    if n_steps > 1:
        # Several time steps: keep the reach if any of them has data
        has_data = np.any(reach_q != missing)
    elif n_steps == 1:
        # Exactly one time step: keep the reach if that value is not missing
        has_data = reach_q != missing
    else:
        # No time steps at all
        has_data = False
    if has_data:
        data_indexes.append(i)

# Display the numeric indexes where discharge data is present
print(data_indexes)

You can now use the data indexes to retrieve location, time, and river name data about the reaches that have discharge data.

In [None]:
# Show the river name of every reach that has discharge data
print("River Names", river_names[data_indexes], sep="\n")

# Positions of all reaches named "Ohio River"
ohio_indexes = np.where(river_names == "Ohio River")
print("\nIndexes for the Ohio River", ohio_indexes, sep="\n")

# Positions that are both on the Ohio River and have discharge data
overlap_indexes = np.intersect1d(data_indexes, ohio_indexes)
print("\nOverlapping indexes for the Ohio River with HiVDI Discharge data", overlap_indexes, sep="\n")

In [None]:
# Display the Ohio River indexes that also have HiVDI discharge data
overlap_indexes

In [None]:
# Select one Ohio River reach from the overlapping indexes.
# OHIO_REACH_POSITION is the position inside overlap_indexes
# (0 would be the first reach with data; change it to look at another reach).
OHIO_REACH_POSITION = 5
if len(overlap_indexes) <= OHIO_REACH_POSITION:
    raise IndexError(
        f"overlap_indexes has only {len(overlap_indexes)} entries; "
        f"position {OHIO_REACH_POSITION} is not available"
    )
data_index = overlap_indexes[OHIO_REACH_POSITION]

# Locate the reach identifier
reach_id = reaches['reach_id'][data_index]
print(f"Ohio reach identifier to plot: {reach_id}")

# Retrieve discharge
ohio_q = hivdi_q[data_index]
print(f"\nDischarge for Ohio reach identfier # {reach_id}")
print(ohio_q)

# Retrieve time (read once and reuse the value for display)
time = results['reaches']['time'][data_index]
print(f"\nTime for Ohio reach identfier # {reach_id}")
print(time)

In [None]:
# Transform time to correct format:
# each value is added, as a number of seconds, to the reference date below
# and rendered as a YYYY-MM-DD string
swot_ts = datetime.datetime(2000,1,1,0,0,0)
missing_time = results['reaches']['time'].missing_value

# Skip time steps flagged with the missing value. float() converts NumPy
# scalars to a type datetime.timedelta always accepts.
ohio_time_str = [
    (swot_ts + datetime.timedelta(seconds=float(t))).strftime('%Y-%m-%d')
    for t in time
    if t != missing_time
]

ohio_time_str

In [None]:
# Filter any missing values out of reach identifier discharge and time.
# A time step is kept only when BOTH its discharge and its time are present,
# so the discharge values, the times and the date labels stay the same length.
ohio_index = np.where((ohio_q != missing) & (time != missing_time))

ohio_q_data = ohio_q[ohio_index]
print(f"Discharge for Ohio reach identfier # {reach_id}")
print(ohio_q_data)

ohio_time_data = time[ohio_index]
print(f"\nTime for Ohio reach identfier # {reach_id}")
print(ohio_time_data)

# Rebuild the date labels from the filtered times so that each label lines up
# one-to-one with a value in ohio_q_data when plotting
ohio_time_str = [
    (swot_ts + datetime.timedelta(seconds=float(t))).strftime('%Y-%m-%d')
    for t in ohio_time_data
]


In [None]:
# Plot HiVDI Discharge for the Ohio River Reach Identifier

# Set up plot: a single axes that fills the figure
fig, ax1 = plt.subplots(figsize=(10,5))

# Plot data as markers joined by a line
ax1.scatter(ohio_time_str, ohio_q_data)
ax1.plot(ohio_time_str, ohio_q_data)

# Define labels and title
ax1.set_ylabel('Discharge')
ax1.set_xlabel('Time')

fig.suptitle(f"Discharge Timeseries from HIVDI for the Ohio River reach identifier: {reach_id}.")


# Plotting integrator results for comparison

The SoS contains results from reach-level Flow Law Parameter Estimation (FLPE) algorithms: HiVDI, neoBAM, MetroMan, MOMMA, SAD, and SIC4DVar, which produce discharge estimates using SWOT observations, SoS priors, and SWORD data. It can be helpful to compare the reach-level FLPEs to the discharge values produced by the Mean Optimization Integrator (MOI). The MOI takes SWOT observation data and reach-level FLPE output and integrates the results. It uses river topology to enforce mass conservation and also defines uncertainty.

In [None]:
# MOI discharge for the HiVDI algorithm at the selected reach, restricted to
# the same time steps that were kept for the HiVDI discharge (ohio_index)
moi_hivdi_var = results["moi"]["hivdi"]["q"]
moi_hivdi_q = moi_hivdi_var[data_index][ohio_index]

print(f"HiVDI MOI Discharge for Ohio reach identfier # {reach_id}")
print(moi_hivdi_q)

In [None]:
# Plot HiVDI discharge alongside MOI discharge

# Two stacked panels that together fill the figure
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,5))

# HiVDI Q
ax1.scatter(ohio_time_str, ohio_q_data)
ax1.plot(ohio_time_str, ohio_q_data)

# MOI Q
ax2.scatter(ohio_time_str, moi_hivdi_q)
ax2.plot(ohio_time_str, moi_hivdi_q)

# Define labels and title
ax1.set_ylabel('Discharge')
ax2.set_ylabel('Discharge')
ax2.set_xlabel('Time')

ax1.set_title("HiVDI Discharge")
ax2.set_title("MOI Discharge for HiVDI")

# The plotted reach is on the Ohio River, so the figure title names it
fig.suptitle(f"Discharge Timeseries from HIVDI for the Ohio River reach identifier: {reach_id}.")
fig.tight_layout()


# Table of Modules (Algorithms) and Discharge variables

The following lists the algorithms alongside their discharge variables and location in the SoS assuming that the SoS is an open file represented by the `results` variable.

| Module (Algorithm)               | Discharge Variable               | Location in the SoS              |
|----------------------------------|----------------------------------|----------------------------------|
| HiVDI                            | Q                                | results["hivdi"]["Q"]            |
| MetroMan                         | allq                             | results["metroman"]["allq"]      |
| MOMMA                            | Q_constrained                    | results["momma"]["Q_constrained"]|
| neoBAM                           | q1, q2, or q3                    | results["neobam"]["q"]["q1"]     |
| SAD                              | Qa                               | results["sad"]["Qa"]             |
| SIC4DVar                         | ?                                | results["sic4dvar"]["?"]         |
| MOI HiVDI                        | q                                | results["moi"]["hivdi"]["q"]     |
| MOI MetroMan                     | q                                | results["moi"]["metroman"]["q"]  |
| MOI MOMMA                        | q                                | results["moi"]["momma"]["q"]     |
| MOI neoBAM                       | q                                | results["moi"]["neobam"]["q"]    |
| MOI SAD                          | q                                | results["moi"]["sad"]["q"]       |
| MOI SIC4DVar                     | q                                | results["moi"]["sic4dvar"]["q"]  |