This notebook is intended for exploring why certain data are identified as gaps by pyglider's find_gaps.

This notebook was written to inform https://github.com/c-proof/pyglider/issues/225

In [None]:
import os
import dbdreader
import numpy as np
import pandas as pd
import xarray as xr

from esdglider import gcp, glider
import pyglider.utils as pgutils

# Deployment to examine. The alternatives are kept commented out so that they
# can be swapped in by hand.
# deployment_info = {
#     "deployment_name": "calanus-20241019",
#     "mode": "delayed",
# }
# deployment_info = {
#     "deployment_name": "amlr03-20231128",
#     "mode": "delayed",
# }
deployment_info = {
    "deployment_name": "amlr08-20220513",
    "mode": "delayed",
}
# Threshold handed to find_gaps as its `maxgap` argument
maxgap = 60

# Standard names and paths
deployment_name = deployment_info["deployment_name"]
mode = deployment_info["mode"]
bucket_name = 'amlr-gliders-deployments-dev'
deployments_path = f"/home/sam_woodman_noaa_gov/{bucket_name}"
config_path = "/home/sam_woodman_noaa_gov/glider-lab/deployment-configs"

# Reuse `bucket_name` so the mounted bucket and the mount point (which is built
# from the same variable) cannot drift apart.
gcp.gcs_mount_bucket(bucket_name, deployments_path, ro=False)
# Build the file name from `deployment_name` instead of nesting double quotes
# inside a double-quoted f-string, which is a SyntaxError before Python 3.12.
deployment_info["deploymentyaml"] = os.path.join(
    config_path,
    f"{deployment_name}.yml",
)

paths = glider.get_path_glider(deployment_info, deployments_path)
print(paths["binarydir"])

Read in data with dbdreader, mirroring binary_to_timeseries

In [None]:
# Open the deployment's binary files. The glob is the same string as before,
# written without the redundant nested f-string expression.
dbd = dbdreader.MultiDBD(
    pattern=f'{paths["binarydir"]}/*.[D|E|d|e][Bb][Dd]',
    cacheDir=paths["cacdir"]
)

# `thenames` and `sensors` are parallel lists: thenames[i] is the output name
# for sensors[i]. The time-base sensor (sci_water_temp) comes first in
# `sensors`, so 'temperature' must come first here too; otherwise looking up
# latitude, longitude, conductivity or temperature by name selects the wrong
# sensor and the wrong entry of `data`.
thenames = [
    'temperature', 'latitude', 'longitude', 'conductivity', 'pressure',
    'chlorophyll', 'cdom', 'backscatter_700',
    'oxygen_concentration', 'oxygen_saturation',
    'heading', 'pitch', 'roll', 'waypoint_latitude', 'waypoint_longitude',
    'water_velocity_eastward', 'water_velocity_northward'
]
sensors = [
    'sci_water_temp', 'm_lat', 'm_lon', 'sci_water_cond', 'sci_water_pressure',
    'sci_flbbcd_chlor_units', 'sci_flbbcd_cdom_units', 'sci_flbbcd_bb_units',
    'sci_oxy4_oxygen', 'sci_oxy4_saturation',
    'm_heading', 'm_pitch', 'm_roll', 'c_wpt_lat', 'c_wpt_lon',
    'm_final_water_vx', 'm_final_water_vy'
]
assert len(thenames) == len(sensors), "thenames and sensors must stay parallel"


# get the data, with the first sensor as the time source that
# all other variables are synced to:
data = list(dbd.get_sync(*sensors))
# get the time; what remains in `data` is indexed like `sensors`:
time = data.pop(0)

Extract the info for the 'selected' sensor

In [None]:
# Variable to inspect; must be exactly one of the entries of `thenames`
name = "chlorophyll"
# Exact lookup: the previous substring test (`name in s`) would also match
# longer names, e.g. "latitude" is contained in "waypoint_latitude".
# Raises ValueError if `name` is not in `thenames`.
nn = thenames.index(name)
print(nn)
print(sensors[nn])

In [None]:
# Values of the selected sensor taken from the get_sync result
val = data[nn]
# Fetch the same sensor on its own with dbd.get. Later cells treat the first
# returned array as the sensor's sample times and the second as its values.
# NOTE(review): `_` is also IPython's "last output" variable and later cells
# read it; a descriptive name would be safer — confirm before renaming.
_t, _ = dbd.get(sensors[nn])
# Run pyglider's find_gaps with the sensor's own times, the common timebase and maxgap
tg_ind = pgutils.find_gaps(_t, time, maxgap)
# The next two lines look like the follow-up step in pyglider (presumably copied
# from binary_to_timeseries — TODO confirm); here the flags are only counted.
# val[tg_ind] = np.nan
# _log.debug('%s values changed to nan by maxgap', np.count_nonzero(tg_ind))
np.count_nonzero(tg_ind)

Useful displays

In [None]:
# Convert both time vectors (timebase and the sensor's own times) from
# seconds to datetime64[ns] so they display as timestamps
timens, _tns = (
    (seconds * 1e9).astype('datetime64[ns]') for seconds in (time, _t)
)
# One row per timebase point: timestamp, selected value and find_gaps flag
df = pd.DataFrame({"time": timens, "val": val, "gap": tg_ind})
df

In [None]:
# The sensor's own samples, in the order returned by dbd.get
sens_df = pd.DataFrame({"time": _tns, "val": _})
display(sens_df)

# Same samples ordered by time (missing times last) with a fresh 0..n-1 index
sens_df_sort = sens_df.sort_values(
    by="time", na_position="last", ignore_index=True
)
sens_df_sort

Run find_gaps experiments

In [None]:
# Inline copy of a find_gaps body (kept as flat statements rather than a
# function) so that every intermediate array can be inspected afterwards.
# def find_gaps(sample_time, timebase, maxgap):
sample_time = _t
timebase = time

# Line added for this experiment: sort the sample times first, since the
# searchsorted call below expects an ascending array.
sample_time = np.sort(sample_time) #smw new line

# figure out which sample each time in time base belongs to:
# (side='right' places equal values after the matching sample; the clip keeps
# the result a valid index into `dt`)
time_index = np.searchsorted(sample_time, timebase, side='right')
time_index = np.clip(time_index, 0, len(sample_time) - 1)

# figure out the space between sample pairs
# (the first sample has no predecessor, so its spacing is set to 0)
dt = np.concatenate(([0], np.diff(sample_time)))
# get the gap size for each timebase data point:
ddt = dt[time_index]

# get the indices of timebase that are too large and account for the
# degenerate case when a timebase point falls directly on a sample time.
# (True = spacing exceeds maxgap AND the timebase point is not itself a sample time)
index = ~np.logical_or((ddt <= maxgap), (np.isin(timebase, sample_time)))
print(np.count_nonzero(index))

# Sanity check
# Adds two columns to the existing `df` in place, then prints the positions
# where the flags computed here differ from the `gap` column already in `df`.
df["time_index"] = time_index
df["index_gap"] = index
print(np.where(df.gap != df.index_gap)) #np.count_nonzero(df.gap != df.index_gap)
df

In [None]:
# Current `sample_time` (as timestamps and as raw seconds) next to `dt`,
# the spacing to the previous sample
d = pd.DataFrame(dict(
    sample_time=(sample_time * 1e9).astype('datetime64[ns]'),
    sample_time_s=sample_time,
    dt_gap=dt,
))
d

Explore the now-gaps

In [None]:
# Timebase timestamps selected by the `tg_ind` mask
gapped = timens[tg_ind]
print(gapped)

Screenshots for pyglider issue

In [None]:
# Spacing between consecutive samples in their original (unsorted) order; the
# first sample has no predecessor, so it gets 0.
# Dedicated names are used here so that the `dt` and `df` built by earlier
# cells are not overwritten with different-shaped objects, which would make
# those cells fail or mispair data when re-run.
dt_unsorted = np.concatenate(([0], np.diff(_t)))

sens_diff_df = pd.DataFrame({
    "sample_time": (_t * 1e9).astype('datetime64[ns]'),
    "sensor_value": _,
    "time_diff": dt_unsorted})
# Row window shown in the issue screenshot (presumably specific to the
# amlr08-20220513 deployment — adjust for other deployments)
sens_diff_df.iloc[54445:54460]

In [None]:
# Flags from pyglider's find_gaps on the sample times as read
tg_ind = pgutils.find_gaps(_t, time, maxgap)

# Show the freshly computed find_gaps flags ("gap") beside the flags from the
# sorted-sample-time experiment ("index_gap"), using the same column names as
# the earlier `df`. Previously `tg_ind` was computed here but never used, and
# the "gap" column silently showed `index` from the experiment cell.
df_gap = pd.DataFrame({
    "timebase": (time * 1e9).astype('datetime64[ns]'),
    "gap": tg_ind,
    "index_gap": index,
})
# Only the rows from the timestamp of interest onward
df_gap[df_gap["timebase"] >= np.datetime64("2022-05-14 14:24:50")]

In [None]:
# Self-contained reproducer (imports repeated so the cell can be copied out on
# its own). Note that it reassigns `sample_time`, `timebase`, `maxgap` and `df`.
import numpy as np
import pandas as pd
import pyglider.utils as pgutils

# Evenly spaced sample times, and a timebase that starts one second later
t0 = 1652486400  # number of seconds for "2022-05-14 00:00:00"
sample_time = t0 + np.arange(0, 33, 4)
timebase = (t0 + 1) + np.arange(0, 27, 3)
maxgap = 5

# Zero out one sample time (0 seconds is the equivalent of 1970-01-01)
sample_time[4] = 0

# find_gaps on these sample times - this output is incorrect
out1 = pgutils.find_gaps(sample_time, timebase, maxgap)

# Collect inputs and output in a dataframe for display
df = pd.DataFrame(data=dict(
    sample_time=(sample_time * 1e9).astype('datetime64[ns]'),
    timebase=(timebase * 1e9).astype('datetime64[ns]'),
    find_gaps_out=out1,
))
df