# Lab 15: Data assimilation with remote sensing

**Purpose:** The purpose of this lab is to familiarize students with using remote sensing/geospatial data to set up a hydrologic model, and with assimilating remote sensing data into that model.

In [None]:
# connect Google Drive so we can use exported data
# (the exported CSVs are read later from /content/drive/MyDrive/hobble_creek_model/)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# install geemap package for visualizing ee results
!pip install geemap filterpy HydroErr &> install.log

In [None]:
# install the SAC-SMA model package (provides sacsma.simulations and sacsma.routing, imported further down)
!pip install git+https://github.com/KMarkert/sacsma.git -q

In [None]:
%matplotlib inline

In [None]:
# import ee api and geemap package
import ee
import math
import geemap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geemap import colormaps as cmaps

In [None]:
# try to initialize an ee session
# if not authenticated then run the auth workflow and initialize again
try:
    ee.Initialize()
except Exception:
    # Catch ordinary errors only (e.g. missing credentials) so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    ee.Authenticate()
    ee.Initialize()

## Forcing data for model


USGS Streamflow data: https://waterdata.usgs.gov/nwis/inventory?agency_code=USGS&site_no=10153100

NRCS SNOTEL data: https://wcc.sc.egov.usda.gov/nwcc/site?sitenum=1223 

In [None]:
# change to your preferred study period
# must be within the time range of available observed data
# (dates are 'YYYY-MM-DD' strings; they are passed to filterDate further down)
START_TIME = '2009-01-01'
END_TIME = '2021-01-01'

In [None]:
# location of the stream gauge; used below to select the basin that contains it
gauge_lat = 40.179
gauge_lon = -111.639

# Earth Engine points take coordinates in (lon, lat) order
gauge_pt = ee.Geometry.Point([gauge_lon, gauge_lat])

In [None]:
# HUC10 watershed boundaries
watersheds = ee.FeatureCollection("USGS/WBD/2017/HUC10")

# keep the first watershed that intersects the gauge location
basins_at_gauge = watersheds.filterBounds(gauge_pt)
hobble_creek = ee.Feature(basins_at_gauge.first())

In [None]:
# interactive map centered on the basin, showing the basin outline and the gauge
Map = geemap.Map()
Map.centerObject(hobble_creek)

gauge_vis = {"color": "yellow"}
Map.addLayer(hobble_creek, {}, "Hobble Creek Basin")
Map.addLayer(gauge_pt, gauge_vis, "Gauge")
Map.addLayerControl()

# last expression of the cell: renders the map widget
Map

In [None]:
# specify band names we want
metBands = ['prcp', 'tmin', 'tmax']

daymet_col = ee.ImageCollection("NASA/ORNL/DAYMET_V4")

# filter the collection by date and select the bands of interest
# filterDate treats the end date as exclusive, so END_TIME is advanced by
# one day to keep images dated END_TIME in the result
met_col = (
    daymet_col
    .filterDate(START_TIME, ee.Date(END_TIME).advance(1, 'day'))
    .select(metBands)
)


In [None]:
def get_timeseries(img):
    """Attach the basin-mean value of every band of ``img`` to the image as properties.

    The mean is taken over the ``hobble_creek`` geometry (defined in an earlier
    cell) at the nominal scale of the image's first band.

    Args:
        img: the ee.Image to summarize.

    Returns:
        ``img`` with one extra property per band holding that band's mean.
    """
    native_scale = img.select([0]).projection().nominalScale()
    basin_means = img.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=hobble_creek.geometry(1e4),
        scale=native_scale,
    )
    return img.set(basin_means)

In [None]:
# get a time series of meteorological data
# (each image gains its basin-mean band values as properties; see get_timeseries)
met_col_timeseries = met_col.map(get_timeseries)

In [None]:
# get the average elevation for the basin
# (reuses get_timeseries, which stores the basin mean of every band as image properties)
elv = get_timeseries(ee.Image("NASA/NASADEM_HGT/001"))

# read the mean of the "elevation" band back from the image properties
elv_avg = elv.get("elevation")

In [None]:
# fetch the basin-average elevation from Earth Engine so the notebook displays it
elv_avg.getInfo()

In [None]:
def img_to_feature(img):
    """Turn an image into a feature located at the basin centroid.

    The feature carries the image's properties plus a formatted "date"
    (YYYY-MM-dd) and the basin-average elevation under "elev".

    Args:
        img: an ee.Image whose properties should be exported.

    Returns:
        The centroid feature with the image properties copied onto it.
    """
    dated_img = img.set("date", img.date().format("YYYY-MM-dd"))
    basin_centroid = hobble_creek.centroid(1e4)
    feature = basin_centroid.set("elev", elv_avg)
    return feature.copyProperties(dated_img)


In [None]:
# convert images to features so the time series can be exported as a table
timeseries_table = met_col_timeseries.map(img_to_feature)

In [None]:
# inspect the first feature to check the exported properties before running the export
timeseries_table.first().getInfo()

In [None]:
# run task for met data
# writes hobble_creek_met.csv into the "hobble_creek_model" Drive folder,
# which is where the model-setup cells read it from
task = ee.batch.Export.table.toDrive(
    collection = timeseries_table,
    description = "meterological_timeseries_export",
    fileNamePrefix= "hobble_creek_met",
    folder = "hobble_creek_model",
    fileFormat = "CSV"
)

# NOTE(review): start() submits a batch task; presumably the CSV only appears
# in Drive once the task has completed — check the task list before reading it
task.start()

## State data for model

In [None]:
# remote-sensing state data: soil moisture and Daymet snow water equivalent,
# both limited to the same window
state_start = "2016-01-01"

soilmoisture = (
    ee.ImageCollection("NASA_USDA/HSL/SMAP10KM_soil_moisture")
    .filterDate(state_start, END_TIME)
)
swe = (
    daymet_col
    .select("swe")
    .filterDate(state_start, END_TIME)
)

In [None]:
# Allowable time difference between matched images: one day in milliseconds.
one_day_millis = 24 * 60 * 60 * 1000

# Create a time filter that matches images whose start times are at most
# one day apart (i.e. one day on either side of the observation).
# A single condition needs no ee.Filter.Or wrapper.
time_filter = ee.Filter.maxDifference(
    difference=one_day_millis,
    leftField='system:time_start',
    rightField='system:time_start',
)

In [None]:
# Define the join.
# this is "saveBest" which will give us the image closest in time to what we want
state_join = ee.Join.saveBest(
  matchKey= 'swe', # this will be the name of the result in the collection
  measureKey= 'timeDiff' # property name under which the match measure is saved
)

In [None]:
# Apply the join.
# soil moisture is the primary collection; each soil moisture image gets the
# best-matching SWE image (per time_filter) saved under the "swe" property
joined_states = ee.ImageCollection(state_join.apply(soilmoisture, swe, time_filter))

In [None]:
def unpack_join(img):
    """Add the joined SWE image (saved under the "swe" property) as bands of ``img``.

    Args:
        img: a joined ee.Image carrying the matched image in its "swe" property.

    Returns:
        ``img`` with the bands of the matched image appended.
    """
    # img.get() returns an untyped object; cast it so addBands receives an ee.Image
    return img.addBands(ee.Image(img.get("swe")))

state_imgs = joined_states.map(unpack_join)

In [None]:
# get the basin-mean time series of the state variables
state_timeseries = state_imgs.map(get_timeseries)

In [None]:
# convert images to a feature collection
state_table = state_timeseries.map(img_to_feature)

In [None]:
# inspect the first feature to check the exported properties before running the export
state_table.first().getInfo()

In [None]:
# run task to export state information
# writes hobble_creek_state.csv into the "hobble_creek_model" Drive folder
# NOTE: `task` is rebound here, so the handle to the previous export is lost
task = ee.batch.Export.table.toDrive(
    collection = state_table,
    description = "state_timeseries_export",
    fileNamePrefix= "hobble_creek_state",
    folder = "hobble_creek_model",
    fileFormat = "CSV"
)

task.start()

In [None]:
# show the first two entries of the task list to check on the exports started above
ee.batch.Task.list()[:2]

## Model setup

In [None]:
from sacsma.simulations import Simulation
import filterpy.kalman as kf

In [None]:
# read in the meteorological forcing data, indexed by date
met_csv_path = "/content/drive/MyDrive/hobble_creek_model/hobble_creek_met.csv"
forcings_df = pd.read_csv(met_csv_path, index_col="date")

# the dates are read from the CSV as text; convert the index to datetimes
forcings_df.index = forcings_df.index.astype('datetime64[ns]')


In [None]:
# read in the state data derived from remote sensing, indexed by date
state_csv_path = "/content/drive/MyDrive/hobble_creek_model/hobble_creek_state.csv"
state_df = pd.read_csv(state_csv_path, index_col="date")

# the dates are read from the CSV as text; convert the index to datetimes
state_df.index = state_df.index.astype('datetime64[ns]')

In [None]:
# read in the observed streamflow data, indexed by its "datetime" column
obs_csv_path = "/content/drive/MyDrive/hobble_creek_model/USGS_10153100_streamflow.csv"
obs = pd.read_csv(obs_csv_path, index_col="datetime")

# the timestamps are read from the CSV as text; convert the index to datetimes
obs.index = obs.index.astype('datetime64[ns]')


In [None]:
# one panel per forcing variable, sharing the time axis
forcing_columns = ["prcp", "tmin", "tmax"]
axs = forcings_df[forcing_columns].plot(subplots=True, figsize=(10, 7))
plt.show()

In [None]:
# split the forcing dataset into the two periods:
# everything before the split date, and everything from the split date on
forcing_split_date = "2016-01-01"
forcings_cal = forcings_df[forcings_df.index < forcing_split_date]
forcings_assim = forcings_df[forcings_df.index >= forcing_split_date]

In [None]:
# split the observed dataset into the same two periods as the forcings
obs_split_date = "2016-01-01"
obs_cal = obs[obs.index < obs_split_date]
obs_assim = obs[obs.index >= obs_split_date]

Next we need to define the model parameters we are going to use. Each part of the model has their own set of parameters: 1) the snow model, 2) the land surface model, and 3) the routing model.

An important concept in modeling is calibration, i.e. tuning the parameters so that the model reproduces the observations. A simplified calibration was completed to get an initial guess at the parameters for our basin.

In [None]:
# snow model parameters
# NOTE(review): values are passed as a plain array, so the order presumably
# matters to the snow model — keep each value on its documented line
snow_pars = np.array([
    1.46863,    # snow correction factor
    1.37133,    # max of the seasonally varying non-rain melt factor
    0.508376,   # min of the seasonally varying non-rain melt factor
    0.0656106,  # average wind function during rain-on-snow events
    1.24992,    # temperature threshold for snow vs rain
    0.206033,   # negative melt factor
    0.165462,   # used to compute an antecedent temperature index
    -0.438637,  # the base temperature used to determine the temperature gradient for non-rain melt computations
    0.531435,   # controls the maximum amount of liquid water that can be retained within the snow cover (decimal fraction)
    0.411856,   # controls the amount of melt per day that occurs at the snow-soil interface [1/day]
])

In [None]:
# land surface model parameters
# NOTE(review): values are passed as a plain array, so the order presumably
# matters to the land surface model — keep each value on its documented line
ls_pars = np.array([
    51.1675,    # Upper zone tension water capacity [mm]
    63.9568,    # Upper zone free water capacity [mm]
    234.478,    # Lower zone tension water capacity [mm]
    200.876,    # Lower zone primary free water capacity [mm]
    61.6775,    # Lower zone supplementary free water capacity [mm]
    0.107242,   # Additional impervious areas (decimal fraction)
    0.351116,   # Upper zone free water lateral depletion rate [1/day]
    0.00537984, # Lower zone primary free water depletion rate [1/day]
    0.0623334,  # Lower zone supplementary free water depletion rate [1/day]
    138.602,    # Percolation demand scale parameter [-]
    2.95144,    # Percolation demand shape parameter [-]
    0.0374366,  # Impervious fraction of the watershed area (decimal fraction)
    0.300625,   # Percolating water split parameter (decimal fraction)
    0.0841843,  # Riparian vegetation area (decimal fraction)
    0.274862,   # The ratio of deep recharge to channel base flow [-]
    0.447683,   # Fraction of lower zone free water not transferrable (decimal fraction)
])

In [None]:
# routing model parameters
# (also passed directly to routing.lohmann when the assimilated runoff is routed)
routing_pars = np.array([
    13.1234,    # Unit Hydrograph shape parameter
    19.7153,    # Unit Hydrograph scale parameter
    3.11245,    # wave velocity in the linearized Saint-Venant equation(m/s)
    1004.67,    # diffusivity in the linearized Saint-Venant equation(m2/s)
])


## Running the model

In [None]:
# instantiate a model run using the forcings from calibration period
# the basin elevation is the same on every row, so take the first row;
# .iloc[0] is an explicit positional lookup (plain [0] on a date-indexed
# Series relies on a deprecated positional fallback)
model_init = Simulation(
    forcings_cal,
    forcings_df["elev"].iloc[0],
    snow_pars,
    ls_pars,
    routing_pars,
)

In [None]:
# execute the model to get discharge
# this runs the snow, land surface, and routing model
# (q is aligned with the calibration-period dates when it is tabulated below)
q = model_init.execute()

In [None]:
# get the date information as numpy datetime64 arrays
# (used as the index / x-axis for the simulated series of each period)
cal_dates = forcings_cal.index.values.astype(np.datetime64)
assim_dates = forcings_assim.index.values.astype(np.datetime64)

In [None]:
# tabulate the simulated discharge against the calibration-period dates
sim_df = pd.DataFrame(
    {"simulated": q},
    index=cal_dates,
)

In [None]:
# put simulated and observed discharge side by side, aligned on date
joined = pd.concat([sim_df, obs_cal["discharge"]], axis=1)

# keep 2010-01-01 up to (not including) 2016-01-01 and drop dates missing either series
in_eval_window = (joined.index >= "2010-01-01") & (joined.index < "2016-01-01")
joined = joined.loc[in_eval_window].dropna()

In [None]:
# observed vs simulated discharge over the evaluation window
ax = joined.plot(figsize=(10, 7))
ax.set_ylabel("Discharge [cms]")
plt.show()

In [None]:
from HydroErr import nse

# score the calibration run: simulated series first, observed series second
nse_cal = nse(joined["simulated"],joined["discharge"])

print(f"NSE: {nse_cal:.4f}")

## One Dimensional Kalman Filter

In [None]:
# extract out soil moisture and swe values from previous run
sm = model_init.sm
we = model_init.we
# NOTE: this rebinds `swe`, which earlier in the notebook named the Daymet SWE
# image collection; the join cells would no longer receive a collection if
# they were re-run after this cell
swe = np.sum(we,axis=0)

# extract out soil moisture and swe values from observed
sm_obs = state_df["ssm"].values
swe_obs = state_df["swe"].values

In [None]:
# calculate variances of the simulated and observed variables
# note: simulated variances are from previous model run
#       and the observed variances are from observed period
# *_P is passed to the Kalman update as the state variance (P)
sm_P = np.var(sm)
swe_P = np.var(swe)

# *_R is passed to the Kalman update as the measurement variance (R)
sm_R = np.var(sm_obs)
swe_R = np.var(swe_obs)

In [None]:
# instantiate another model that we will use to assimilate the data into
# - .iloc[0] is an explicit positional lookup of the (constant) basin elevation;
#   plain [0] on a date-indexed Series relies on a deprecated positional fallback
# - the end-of-calibration states are copied because the assimilation loop
#   overwrites entries of this model's state arrays in place; without a copy
#   those writes could leak back into model_init's arrays
model_assim = Simulation(
    forcings_assim,
    forcings_df["elev"].iloc[0],
    snow_pars,
    ls_pars,
    routing_pars,
    ls_state=np.copy(model_init.ls_state),
    snow_state=np.copy(model_init.snow_state),
)

In [None]:
# get the time step indices to iterate over
# (despite the name, this is a range 0..n-1, not a count)
n_steps = range(model_assim.n)

In [None]:
# manually step through each model iteration
# if there are data from the observations, then we will
# update the state using the Kalman Filter

# the observation dates do not change, so convert them once outside the loop
obs_dates = state_df.index.astype('datetime64[ns]')

for i in n_steps:
    # run the model time step
    model_assim.step(i)

    # positions of the observations that fall on the current model date
    matches = np.where(obs_dates == model_assim.dates[i])[0]
    if matches.size == 0:
        continue
    # always use a scalar position (first match) so z is a single value
    # even if a date appears more than once in the observations
    idx = matches[0]

    # soil moisture: update the first land-surface state with the observation
    # (a missing observation is skipped, otherwise NaN would enter the state)
    if pd.notna(sm_obs[idx]):
        model_assim.ls_state[0], sm_p = kf.update(x=model_assim.ls_state[0], P=sm_P, z=sm_obs[idx], R=sm_R)

    # snow: the observation is compared with the sum of snow_state[0] and
    # snow_state[2]; the updated total is split back using their prior shares
    if pd.notna(swe_obs[idx]):
        we = np.array([model_assim.snow_state[0], model_assim.snow_state[2]])
        we_total = np.sum(we)
        x_swe, swe_p = kf.update(x=we_total, P=swe_P, z=swe_obs[idx], R=swe_R)

        if we_total > 0:
            swe_weights = we / we_total
        else:
            # no snow in the model: split the updated amount evenly
            swe_weights = np.array([0.5, 0.5])

        model_assim.snow_state[0], model_assim.snow_state[2] = x_swe * swe_weights

In [None]:
from sacsma import routing

In [None]:
# extract out the runoff components from the LS model
# (indexed below as a 2-D array: component row, time step column)
assim_runoff = model_assim.runoffs

In [None]:
# run the routing model
# NOTE(review): units of flowlength are not stated here — presumably metres; confirm against sacsma.routing
flowlength = 71634.0
# rows 1 and 2 of the runoff array are routed; the result names suggest
# direct runoff and baseflow — TODO confirm the row order
assim_direct,assim_base = routing.lohmann(assim_runoff[1,:],assim_runoff[2,:], flowlength, routing_pars)
# total discharge is the sum of the two routed components
assim_q = assim_direct + assim_base

In [None]:
# setup an open loop simulation (no assimilation) to compare results
# - .iloc[0] is an explicit positional lookup of the (constant) basin elevation;
#   plain [0] on a date-indexed Series relies on a deprecated positional fallback
# - the initial states are copied so this run does not share state arrays
#   with model_init or with any other run started from them
model_openloop = Simulation(
    forcings_assim,
    forcings_df["elev"].iloc[0],
    snow_pars,
    ls_pars,
    routing_pars,
    ls_state=np.copy(model_init.ls_state),
    snow_state=np.copy(model_init.snow_state),
)

In [None]:
# run the open loop simulation (all time steps, no state updates) to get discharge
openloop_q = model_openloop.execute()

In [None]:
# snow water equivalent: assimilated run vs open loop run vs observed
f, ax = plt.subplots(figsize=(10, 7))

swe_assim = np.sum(model_assim.we, axis=0)
swe_openloop = np.sum(model_openloop.we, axis=0)

ax.plot(assim_dates, swe_assim, label="Assimilation")
ax.plot(assim_dates, swe_openloop, label="Open Loop")
state_df["swe"].plot(ax=ax, label="Observed")

ax.set_ylabel("SWE [mm]")
ax.legend()
plt.show()

In [None]:
# top layer soil moisture: assimilated run vs open loop run vs observed
f, ax = plt.subplots(figsize=(10, 7))

for sm_series, sm_label in (
    (model_assim.sm, "Assimilation"),
    (model_openloop.sm, "Open Loop"),
):
    ax.plot(assim_dates, sm_series, label=sm_label)
state_df["ssm"].plot(ax=ax, label="Observed")

ax.set_ylabel("Soil moisture [mm]")
ax.legend()
plt.show()

In [None]:
# streamflow: assimilated run vs open loop run vs observed
f, ax = plt.subplots(figsize=(10, 7))

for q_series, q_label in (
    (assim_q, "Assimilation"),
    (openloop_q, "Open Loop"),
):
    ax.plot(assim_dates, q_series, label=q_label)
obs_assim["discharge"].plot(ax=ax, label="Observed")

ax.legend()
ax.set_ylabel("Discharge [cms]")
plt.show()

In [None]:
# tabulate both simulated discharge series against the assimilation-period dates
sim_df = pd.DataFrame(
    {"assimilated": assim_q, "openloop": openloop_q},
    index=assim_dates,
)

In [None]:
# put simulated and observed discharge side by side, aligned on date
joined = pd.concat([sim_df, obs_assim["discharge"]], axis=1)

# keep 2016-01-01 onward and drop dates missing any of the series
from_assim_start = joined.index >= "2016-01-01"
joined = joined.loc[from_assim_start].dropna()

In [None]:
# score both runs against the same observed discharge
observed_q = joined["discharge"]
nse_assim = nse(joined["assimilated"], observed_q)
nse_openloop = nse(joined["openloop"], observed_q)

print(f"Assimilated NSE: {nse_assim:.4f}")
print(f"Open Loop NSE: {nse_openloop:.4f}")

As we can see, the data assimilation process improved our results (although not as much as we would like...). In reality the river network is probably heavily regulated, so no amount of calibration or assimilation alone can fully fix the results; information on the interventions would be needed to improve them further. Nevertheless, this illustrates the process of data assimilation for hydrologic modeling.