In [None]:
import dabench as dab
import matplotlib.pyplot as plt
import numpy as np

# 1. Basic stationary observer with random sampling by *count*

The first step is to define a data generator and generate/load data for the observer. Let's use a Lorenz63 generator as a basic starting point.

In [None]:
# Create the Lorenz63 data generator and generate 50 steps of data for the observer to sample
l63 = dab.data.Lorenz63()
l63.generate(n_steps=50)

In [None]:
# Build the observer for the Lorenz63 generator
obs_l63 = dab.observer.Observer(
    l63,                      # generator whose data will be sampled
    random_time_count=20,     # number of timesteps to sample at random
    random_location_count=1,  # number of system locations to sample at random
    error_bias=0.0,           # mean of the Gaussian observation error
    error_sd=0.7,             # standard deviation of the Gaussian observation error
    random_seed=99,           # explicit random seed (99 is also the default)
)

# Draw the observations
obs_vec_l63 = obs_l63.observe()

In [None]:
# Summarize what the observation vector contains
summary_l63 = [
    ('Sampling times: ', obs_vec_l63.times),
    ('Number of observations: ', obs_vec_l63.num_obs),
    ('Number of locations at each timestep: ', obs_vec_l63.obs_dims[0]),
    ('Sampling location indices: ', obs_vec_l63.location_indices[0]),
    ('Observation values: ', obs_vec_l63.values),
]
for label, value in summary_l63:
    print(label, value)

In [None]:
# Let's examine how error is added to observations.
# The observer chose its single sampling location at random, so look the index
# up in the observation vector rather than hard-coding a seed-dependent column.
obs_loc_l63 = obs_vec_l63.location_indices[0, 0]

fig, ax = plt.subplots()
# True trajectory of the observed state variable
ax.plot(l63.times, l63.values[:, obs_loc_l63], alpha=0.9)
# Observations as returned by the observer (error included)
ax.plot(obs_vec_l63.times, obs_vec_l63.values[:, 0], '--', alpha=0.9)
# Removing the stored errors should land the samples back on the true trajectory
obs_values_minus_error = obs_vec_l63.values - obs_vec_l63.errors
ax.plot(obs_vec_l63.times, obs_values_minus_error[:, 0], ':', alpha=0.9)
ax.legend(labels=['Original System', 'Observations with Error', 'Obs Without Error'])
ax.set_xlabel('Time')
ax.set_ylabel('State value')
plt.show()

# 2. Stationary observer with user-specified sampling times and locations

Last time, we let the observer randomly select locations and times to sample. But the observer also allows us to specify the location and time indices we want to observe. You can use this to, for example, sample every other time step or every 5th element in the state vector. It allows for complete customization. Let's explore that using a Lorenz96 generator.

In [None]:
# Create the Lorenz96 generator, run it for 100 steps, and report its dimensions
l96 = dab.data.Lorenz96()
l96.generate(n_steps=100)
for label, value in [('Time dim: ', l96.time_dim),
                     ('System dim: ', l96.system_dim)]:
    print(label, value)

In [None]:
# Let's sample 5 different locations in the system every 5th timestep.
# Bound the range with the generator's own time dimension so the indices stay
# valid if the number of generated steps changes.
time_inds_l96 = np.arange(0, l96.time_dim, 5)
print(time_inds_l96)
# Indices into the flattened state vector
sys_inds_l96 = [5, 10, 20, 25, 35]

In [None]:
# Observer that samples Lorenz96 at exactly the times/locations chosen above
obs_l96 = dab.observer.Observer(
    l96,
    time_indices=time_inds_l96,      # which timesteps to sample
    location_indices=sys_inds_l96,   # which state-vector elements to sample
    error_bias=0.2,
    error_sd=0.5,
)

# Draw the observations
obs_vec_l96 = obs_l96.observe()

In [None]:
# Summarize what the observation vector contains
summary_l96 = [
    ('Sampling times: ', obs_vec_l96.times),
    ('Number of observations: ', obs_vec_l96.num_obs),
    ('Number of locations at each timestep: ', obs_vec_l96.obs_dims[0]),
    ('Sampling location indices: ', obs_vec_l96.location_indices[0]),
    ('Observation values: ', obs_vec_l96.values),
    # The mean error should come out pretty close to the 0.2 bias we specified
    ('Mean Error: ', obs_vec_l96.errors.mean()),
]
for label, value in summary_l96:
    print(label, value)

In [None]:
# Compare the first observed location's samples with the original series
first_loc_l96 = obs_vec_l96.location_indices[0, 0]
truth_l96 = l96.values[:, first_loc_l96]
sampled_l96 = obs_vec_l96.values[:, 0]

fig, ax = plt.subplots()
ax.plot(l96.times, truth_l96, alpha=0.9)
ax.plot(obs_vec_l96.times, sampled_l96, '--', alpha=0.9)
ax.legend(labels=['Original System', 'Observations with Error'])
plt.show()

# 3. Filtering observations by time

In some cases, once we've generated observations, we might want to select all observations within a certain time interval. Let's use the Lorenz96 observations we generated to show an example of this.

In [None]:
# Time window of 2 +/- 0.75
window_center, window_half_width = 2, 0.75
time_start = window_center - window_half_width
time_end = window_center + window_half_width

# Keep only the observations inside the window (bounds included)
obs_vec_l96_filt = obs_vec_l96.filter_times(
    start=time_start,
    end=time_end,
    inclusive=True,
)
print('Observation times: ', obs_vec_l96_filt.times)
print('New number of obs: ', obs_vec_l96_filt.num_obs)

In [None]:
# Overlay the original series, every observation, and the time-filtered subset
first_loc_l96 = obs_vec_l96.location_indices[0, 0]

fig, ax = plt.subplots()
ax.plot(l96.times, l96.values[:, first_loc_l96], alpha=0.9)
ax.plot(obs_vec_l96.times, obs_vec_l96.values[:, 0], '--', alpha=0.9)
ax.plot(obs_vec_l96_filt.times, obs_vec_l96_filt.values[:, 0], '-.', alpha=1.0)
ax.legend(labels=['Original System', 'All Obs', 'Time-range Filtered Obs'])
plt.show()

# 4. Observer with locations in original coordinate dimensions

NOTE: These are currently not working

In the previous example, we specified locations to sample in the flattened, 1D space of the system's state vector. But for many data generators/loaders, the values originally exist in multi-dimensional space (e.g. latitude, longitude, vertical level) before being flattened into a state vector. DataAssimBench's Observer class can take location indices in this original_dim instead. Let's create observations from some ERA5 data downloaded from Amazon Web Services.

In [None]:
# Load the AWS data and report its dimensions
aws = dab.data.AWS()
aws.load()
for label, value in [('Time dim: ', aws.time_dim),
                     ('System dim: ', aws.system_dim),
                     ('Original dim: ', aws.original_dim)]:
    print(label, value)

In [None]:
# Sample one timestep out of every 500
time_inds_aws = np.arange(0, aws.time_dim, 500)

# Sample the four corners and the approximate center of the grid
loc_inds_aws = np.array([
    [0, 0],
    [12, 0],
    [12, 42],
    [0, 42],
    [6, 21],
])

In [None]:
# Observer that samples the AWS data at the chosen times and gridded locations
obs_aws = dab.observer.Observer(
    aws,
    time_indices=time_inds_aws,      # which timesteps to sample
    location_indices=loc_inds_aws,   # which gridded locations to sample
    error_bias=0.0,                  # zero bias and zero spread: no observation error
    error_sd=0.0,
)

# Draw the observations
obs_vec_aws = obs_aws.observe()

In [None]:
# Summarize what the observation vector contains
summary_aws = [
    ('Sampling times: ', obs_vec_aws.times),
    ('Number of observations: ', obs_vec_aws.num_obs),
    ('Number of locations at each timestep: ', obs_vec_aws.obs_dims[0]),
    ('Sampling location indices: ', obs_vec_aws.location_indices[0]),
    ('Observation values: ', obs_vec_aws.values),
    ('Errors: ', obs_vec_aws.errors),
]
for label, value in summary_aws:
    print(label, value)

In [None]:
# Plot the observations at the first sampled location against the original values.
# No error was added, but the sampling is sparse compared to the actual system.
truth_aws = aws.values_gridded[:, 0, 0]
sampled_aws = obs_vec_aws.values[:, 0]

fig, ax = plt.subplots()
ax.plot(aws.times, truth_aws, alpha=0.7)
ax.plot(obs_vec_aws.times, sampled_aws, '--', alpha=1.0)
ax.legend(labels=['Original System', 'Observations'])
plt.show()

In [None]:
# The observation times here are datetimes, so the filter bounds are given as
# np.datetime64 values. Select everything between June 1st and September 1st.
time_start, time_end = np.datetime64('2020-06-01'), np.datetime64('2020-09-01')

# Keep only the observations inside the window (bounds included)
obs_vec_aws_filt = obs_vec_aws.filter_times(
    start=time_start,
    end=time_end,
    inclusive=True,
)
print('Observation times: ', obs_vec_aws_filt.times)
print('New number of obs: ', obs_vec_aws_filt.num_obs)

# 5. Non-Stationary Observer

NOTE: These are currently not working

In most cases, it's simplest to assume the observers are stationary and that we are sampling at the same location at each timestep. However, Observer allows for non-stationary observers as well, sampling different locations over time. We'll create a new set of observations from the AWS data as an example.

In [None]:
# Set up a non-stationary observer that picks times and locations by probability.
# The density keywords carry the "random_" prefix (random_time_density /
# random_location_density), the same names used for density-based sampling
# elsewhere in this notebook.
obs_aws_ns = dab.observer.Observer(
    aws, 
    random_time_density = 0.002, # Probability of sampling each timestep
    random_location_density = 0.05, # Probability of sampling each location
    error_bias = 0.0,
    error_sd = 3.0,
    stationary_observers=False # Sample different locations over time
)

# Making observations
obs_vec_aws_ns = obs_aws_ns.observe()

In [None]:
# Summarize the non-stationary observations; the sampled locations are shown
# for the first and the last timestep.
summary_aws_ns = [
    ('Sampling times: ', obs_vec_aws_ns.times),
    ('Number of observations: ', obs_vec_aws_ns.num_obs),
    ('Number of locations at each timestep: ', obs_vec_aws_ns.obs_dims),
    ('Sampling location indices at first timestep: ', obs_vec_aws_ns.location_indices[0]),
    ('Sampling location indices at last timestep: ', obs_vec_aws_ns.location_indices[-1]),
]
for label, value in summary_aws_ns:
    print(label, value)


# 6. Spectral Models

SQGTurb is a data generator that operates in spectral space, so its state vector stores complex numbers with real and imaginary components. Fortunately, you can transform the data back into real space using an inverse Fourier Transform. The observer will handle this operation for you automatically, so SQGTurb can be used with the observer in the same way as the other data generators. The main difference is that location_indices will have multiple indices per location, since they're specified in the original gridded dimension instead of the flattened state vector.

In [None]:
# Create the SQGTurb generator, run it for 50 steps, and report its dimensions
sqgturb = dab.data.SQGTurb()
sqgturb.generate(n_steps=50)
for label, value in [('Complex state vector length: ', sqgturb.system_dim),
                     ('Original gridded dimension in real space: ', sqgturb.original_dim)]:
    print(label, value)

In [None]:
# Stationary observer on SQGTurb with count-based random sampling
obs_sqg = dab.observer.Observer(
    sqgturb,
    random_time_count=50,        # number of timesteps to sample at random
    random_location_count=5,     # number of locations to sample at random
    error_bias=0.0,
    error_sd=100.,
    stationary_observers=True,
)

# Draw the observations
obs_vec_sqg = obs_sqg.observe()

In [None]:
# Summarize what the observation vector contains
summary_sqg = [
    ('Sampling times: ', obs_vec_sqg.times),
    ('Number of observations: ', obs_vec_sqg.num_obs),
    ('Number of locations at each timestep: ', obs_vec_sqg.obs_dims[0]),
    ('Sampling location indices: ', obs_vec_sqg.location_indices[0]),
]
for label, value in summary_sqg:
    print(label, value)

In [None]:
# Let's get the indices of the third sampled location (position 2, counting from 0):
print(obs_vec_sqg.location_indices[0, 2])

In [None]:
# Visualize the third sampled location (column 2 of the observation values).
# The locations were chosen at random, so read this location's gridded index
# from the observation vector instead of hard-coding a seed-dependent value.
# NOTE(review): assumes location_indices[0, 2] holds one integer per gridded
# dimension of values_gridded (after the time axis) -- confirm.
loc_sqg = tuple(int(i) for i in obs_vec_sqg.location_indices[0, 2])

fig, ax = plt.subplots()
# slice(None) keeps every timestep; the remaining entries select the grid point
ax.plot(sqgturb.times, sqgturb.values_gridded[(slice(None), *loc_sqg)], alpha=0.9)
ax.plot(obs_vec_sqg.times, obs_vec_sqg.values[:, 2], '--', alpha=0.9)
ax.legend(labels=['Original System', 'Observations'])
ax.set_xlabel('Time')
ax.set_ylabel('Value at sampled location')
plt.show()

# 7. Basic stationary observer with random sampling *by probability*

If you'd prefer, you can also specify a probability that each timestep or location will be sampled using random_time_density and random_location_density. For example, if you specify random_time_density = 0.5, approximately 50% of time steps will be sampled (with the probability of each time step being selected for sampling following a Bernoulli distribution with p = random_time_density). All of the examples above can be modified to use this method instead, although the exact number of times and locations sampled will vary.

NOTE: If used with stationary_observers=False, random_location_density will sample a DIFFERENT number of locations at each time step. For example, with system_dim=10 and random_location_density=0.5, it might sample 5 locations at the first timestep, 6 in the next, then 5 again, then 3, etc. It randomly selects locations at each timestep.

In [None]:
# Re-create the Lorenz63 generator and generate 50 steps of data (same calls as in section 1)
l63 = dab.data.Lorenz63()
l63.generate(n_steps=50)

In [None]:
# Observer that samples Lorenz63 by probability instead of by count
obs_l63_p = dab.observer.Observer(
    l63,
    random_time_density=0.5,      # probability of picking each time step
    random_location_density=0.3,  # probability of picking each location in l63.system_dim
    error_bias=0.1,
    error_sd=1.33,
)

# Draw the observations
obs_vec_l63_p = obs_l63_p.observe()

In [None]:
# Summarize what the observation vector contains
summary_l63_p = [
    # 28 out of 50 timesteps are sampled
    ('Sampling times: ', obs_vec_l63_p.times),
    ('Number of observations: ', obs_vec_l63_p.num_obs),
    # In this case, 2 values (out of a total system_dim of 3) are observed at each timestep.
    ('Number of locations at each timestep: ', obs_vec_l63_p.obs_dims[0]),
    ('Sampling location indices: ', obs_vec_l63_p.location_indices[0]),
    ('Observation values: ', obs_vec_l63_p.values),
    ('Errors: ', obs_vec_l63_p.errors),
]
for label, value in summary_l63_p:
    print(label, value)