# 2 - Process and Merge Datasets
Next, we want to process the ADCP and Science Datasets so that we can merge them into a single unified dataset, derive the individual profile dives, and run the vendor-suggested Quality Control algorithms.

In [12]:
import os, sys, re, ast
import pandas as pd
import numpy as np
import xarray as xr

# For Dev, append the module to the working path
sys.path.append('/home/jovyan/WHOIGit/ooicgsn_glider_dvl/ooicgsn_glider_dvl/')

In [13]:
# For this example we will be using CP05MOAS-GL379 D00008, which was deployed at the Pioneer NES array from 2019-09-27 to 2019-11-19.
refdes = 'CP05MOAS-GL379-01-ADCPAM000'
eng_refdes = 'CP05MOAS-GL379-00-ENG000000'
deployment = 8

In [14]:
dep_str = f"{deployment:04d}"  # always 4 digits, e.g., '0008'
glider_name = '-'.join(refdes.split('-')[:2]) # shorten ref des to get glider name

In [None]:
# Load the ADCP Data
adcp = xr.open_dataset(f"../data/raw/{refdes}.deployment{dep_str}.adcp_data.nc").load()

# Load the Science Data
gdac = xr.open_dataset(f"../data/raw/{glider_name}.deployment{dep_str}.science.nc").load()

#### Merge the Datasets
Merging the glider ADCP and Science datasets is a multistep process. The Science dataset contains all of the different science sensors, along with the glider's built-in sensors and navigation, on a common timestamp. However, the different science sensors all sample at different rates. Thus, the science parameters in the dataset are sparse and filled with NaNs. In order to merge the Science and ADCP data streams, we need to do the following:
1. Split the separate science and glider parameters into different datasets based on their source sensor
2. Drop the NaNs from the each grouped science parameters set
3. Linearly interpolate each grouped parameters set to the ADCP time
4. Merge the interpolated, grouped parameter sets to the ADCP dataset

In [None]:
from merge import merge_datasets, split_data, sensor_variables

In [5]:
merged = merge_datasets(adcp, gdac)
merged

#### Idenfity Individual Profiles/Dives
The ADCP on OOI gliders is only active during the dive portion of the deployment. The algorithm to separate the profilers looks for where the absolute difference between two depths exceeds 2 meters and the sampling interval is greater than 4 seconds (the ADCP samples at 2 seconds).

In [6]:
# Next, identify the individual profiles
from profiles import add_profiles

In [7]:
merged = add_profiles(merged)
merged

### Vendor QA/QC 

First, utilize the TRDI Data QA-QC Model rev12-1 thresholds to derive a first-pass QC flag. We assume that the Explorer DVL 600kHz is comparable to the WH600kHz (Wide-Band) for setting the thresholds.
I think the config settings are: 2.0 meter depth cell size, 0.1 ping interval time, 10 pings per ensemble

| Parameter | Good | Suspect | Bad |
| --------- | ---- | ------- | --- |
| Error Velocity Threshold | <=6.3 cm/s | >6.3 cm/s, <=12.6 cm/s | >=12.6 cm/s |
| Correlation Magnitude | >=115 | <115, >=63 | <63 |
| Percent Good | >=63% | <63%, >=50% | <50% |
| BIT Errors | 0 | >=1 | N/A |

In [8]:
import qc

In [9]:
adcp['error_seawater_velocity'].min()

In [10]:
# Run the individual tests (make sure inputs and units match)
percent_good = qc.percent_good_qc(merged, 63, 50)
corr_mag = qc.correlation_magnitude_qc(merged, 115, 63)
error_vel = qc.error_velocity_qc(merged, 12.6/100, 6.3/100)

# Merge the test results
qc_flag = qc.merge_qc([percent_good, corr_mag, error_vel])

In [11]:
qc_flag

array([[3, 3, 3, ..., 4, 4, 4],
       [3, 3, 3, ..., 4, 4, 4],
       [3, 3, 4, ..., 4, 4, 4],
       ...,
       [4, 4, 4, ..., 4, 4, 4],
       [4, 4, 4, ..., 4, 4, 4],
       [4, 4, 4, ..., 4, 4, 4]], shape=(326719, 30))

In [12]:
# Now add the qc_flags to the 
merged['vendor_qc_flag'] = (['time','bin'], qc_flag)
merged['vendor_qc_flag'].attrs = {
            'long_name': 'TRDI QC Summary Flag',
            'standard_name': 'aggregate_quality_flag',
            'comment': ('Summary of the TRDI QC tests as a QARTOD style summary flag, where '
                        'the values are 1 == pass, 2 == not evaluated, 3 == suspect or of high interest, '
                        '4 == fail, and 9 == missing.'),
            'flag_values': np.array([1, 2, 3, 4, 9]).astype(np.int32),
            'flag_meanings': 'pass not_evaluated suspect_or_of_high_interest fail missing'
        }

#### Add Waypoints
The next step is to add the waypoint data into the merged dataset. The waypoint data is included in the glider engineering data and is not in either the science or adcp datasets. The waypoint data is necessary to calculate the net water-column velocity, which is acheived by comparing the expected waypoint with the actual surfacing location.

In [13]:
glider = pd.read_csv("../data/raw/deployment0001_CP15MOAS-GL388-00-ENG000000-recovered_host-glider_eng_recovered.csv")
glider.head()

Unnamed: 0,time,obs,c_air_pump,c_ballast_pumped,c_battpos,c_battroll,c_bsipar_on,c_de_oil_vol,c_dvl_on,c_flbbcd_on,...,m_water_vx,m_water_vy,m_why_started,m_x_lmc,m_y_lmc,port_timestamp,preferred_timestamp,x_last_wpt_lat,x_last_wpt_lon,x_system_clock_adjusted
0,2024-11-20 16:08:00.272670,"('0',)",1,260.0,0.7,0.0,-1.0,260.0,-1.0,-1.0,...,0.0,0.0,64,0.0,0.0,0.0,internal_timestamp,39.833332,-70.666664,0.0
1,2024-11-20 16:08:53.539060,"('1',)",1,260.0,0.7,0.0,-1.0,260.0,-1.0,-1.0,...,0.0,0.0,64,0.0,0.0,0.0,internal_timestamp,39.833332,-70.666664,0.0
2,2024-11-20 16:09:25.671230,"('2',)",1,,0.7,0.0,-1.0,260.0,-1.0,-1.0,...,,,-99,,,0.0,internal_timestamp,,,
3,2024-11-20 16:09:30.097960,"('3',)",1,,0.7,0.0,-1.0,260.0,-1.0,-1.0,...,,,-99,,,0.0,internal_timestamp,,,
4,2024-11-20 16:09:34.552520,"('4',)",1,,0.7,0.0,-1.0,260.0,-1.0,-1.0,...,,,-99,,,0.0,internal_timestamp,,,


In [20]:
waypoints['time'].map(pd.to_datetime)

0        2024-11-20 16:08:00.272670
11       2024-11-20 16:10:06.030000
12781    2024-11-22 10:49:07.920720
16065    2024-11-22 22:56:30.683040
18527    2024-11-23 07:07:12.111630
                    ...            
560207   2025-02-13 03:10:44.016880
563856   2025-02-13 16:42:01.682430
565184   2025-02-13 21:22:38.177090
569441   2025-02-14 14:00:59.678440
570933   2025-02-14 19:52:38.572600
Name: time, Length: 164, dtype: datetime64[ns]

In [19]:
waypoints['time']

0        2024-11-20 16:08:00.272670
11       2024-11-20 16:10:06.030000
12781    2024-11-22 10:49:07.920720
16065    2024-11-22 22:56:30.683040
18527    2024-11-23 07:07:12.111630
                    ...            
560207   2025-02-13 03:10:44.016880
563856   2025-02-13 16:42:01.682430
565184   2025-02-13 21:22:38.177090
569441   2025-02-14 14:00:59.678440
570933   2025-02-14 19:52:38.572600
Name: time, Length: 164, dtype: datetime64[ns]

In [14]:
# Get the waypoints
waypoints = glider[['time','c_wpt_lat','c_wpt_lon']].dropna()
waypoints['time'] = waypoints['time'].map(pd.to_datetime)

# Take the difference and find where the either the latitude or longitude change
d_wpt = waypoints.diff()
mask = (d_wpt['c_wpt_lat'] == 0) & (d_wpt['c_wpt_lon'] == 0)
waypoints = waypoints[~mask]

# Now add waypoints to the time base of the merged dataset
# Need to create empty arrays to hold the data
wpt_lat = np.zeros(merged['time'].shape)
wpt_lon = np.zeros(merged['time'].shape)
wpt_time = merged.time.values

# Iterate through the waypoints and find the appropriate times that they
# apply to
for n, (t, lat, lon) in enumerate(waypoints.itertuples(index=False)):
    if n == 0:
        # At the start of the deployment, only have first time
        idx, = np.where(wpt_time <= t)
        # Now add in the waypoint lat
        wpt_lat[idx] = lat
        wpt_lon[idx] = lon
    else:
        t0 = waypoints['time'].iloc[n-1]
        idx, = np.where((wpt_time > t0) & (wpt_time <= t))
        wpt_lat[idx] = lat
        wpt_lon[idx] = lon

In [15]:
# Add the waypoints to the merged dataset
merged['waypoint_lat'] = (['time'], wpt_lat)
merged['waypoint_lat'].attrs = {
    'long_name': 'Glider Waypoint Latitude',
    'comment': ('The target waypoint latitude for the glider.'),
    'units': 'degrees'
    }

merged['waypoint_lon'] = (['time'], wpt_lon)
merged['waypoint_lon'].attrs = {
    'long_name': 'Glider Waypoint Longitude',
    'comment': ('The target waypoint longitude for the glider.'),
    'units': 'degrees'
    }
    

In [21]:
# Save the results
merged.to_netcdf("../data/processed/CP15MOAS-GL388.deployment0001.merged.nc", format='netcdf4', engine='h5netcdf')