From dbcdd18d65d5498a2fdc065616e0585803c5a895 Mon Sep 17 00:00:00 2001
From: Kyle Wilcox <kyle@axiomdatascience.com>
Date: Mon, 25 Apr 2022 16:58:10 -0400
Subject: [PATCH 1/3] Move the "extras" data to be in their own profile netCDF
 file

This assumes that the "extras" data occurs before or after a profile
and for now I think that is OK. This reduces the complexity of the
code quite a bit. We just make the "extras" data look like a profile
dataset; if the "extras" have no depths, the depths are set to zero so
that the data is captured as surface measurements.

This also changes some of the extras configuration kwarg names so we
can control the creation of the ASCII pseudogram and the inclusion
of that data into the netCDF profile files, and the image creation.
---
 gutils/filters.py                             | 17 +---
 gutils/nc.py                                  | 67 ++++++++++-----
 gutils/slocum/__init__.py                     | 86 ++++++++++++-------
 gutils/templates/slocum_dac.json              | 27 +-----
 .../slocum/ecometrics/config/deployment.json  | 10 +++
 .../slocum/ecometrics2/config/deployment.json | 19 ++--
 .../slocum/ecometrics3/config/deployment.json | 19 ++--
 .../slocum/ecometrics4/config/deployment.json | 19 ++--
 8 files changed, 141 insertions(+), 123 deletions(-)

diff --git a/gutils/filters.py b/gutils/filters.py
index 3c16343..386bdb8 100644
--- a/gutils/filters.py
+++ b/gutils/filters.py
@@ -2,7 +2,6 @@
 # coding=utf-8
 import os
 import pandas as pd
-import numpy as np
 
 from gutils.yo import assign_profiles
 
@@ -177,23 +176,9 @@ def process_dataset(file,
                     tolerance=pd.Timedelta(minutes=10)
                 ).set_index(extras.index)
                 extras['profile'] = merge.profile.ffill()
-
-                # To have consistent netCDF files, empty "extras" variables need to exist
-                # in for each valid profile that was calculated above into "filtered".
-                profile_list = set(filtered['profile'].unique())
-                extras_list = set(extras['profile'].unique().astype('int32'))
-                profiles_to_add = profile_list.difference(extras_list)
-                if profiles_to_add:
-                    first_t_in_profiles = filtered.groupby(by=["profile"]).min()['t']
-                    for profile_to_add in profiles_to_add:
-                        empty_df = pd.DataFrame([[np.nan] * len(extras.columns)], columns=extras.columns)
-                        empty_df['profile'] = profile_to_add
-                        empty_df['pseudogram_time'] = first_t_in_profiles[profile_to_add]
-                        empty_df.set_index('pseudogram_time', inplace=True)
-                        extras = pd.concat([extras, empty_df], sort=True)
-
             except BaseException as e:
                 L.error(f"Could not merge 'extras' data, skipping: {e}")
+                extras = pd.DataFrame()
 
     except ValueError as e:
         L.exception('{} - Skipping'.format(e))
diff --git a/gutils/nc.py b/gutils/nc.py
index 1f33c67..eae90c4 100644
--- a/gutils/nc.py
+++ b/gutils/nc.py
@@ -14,6 +14,7 @@
 from datetime import datetime
 from collections import OrderedDict
 
+import numpy as np
 import pandas as pd
 import netCDF4 as nc4
 from compliance_checker.runner import ComplianceChecker, CheckSuite
@@ -191,11 +192,7 @@ def get_creation_attributes(profile):
     }
 
 
-def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH,
-                          extras_df=None):
-
-    if extras_df is None:
-        extras_df = pd.DataFrame()
+def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH):
 
     try:
         # Path to hold file while we create it
@@ -281,9 +278,6 @@ def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=Pro
                 reduce_dims=True,
                 mode='a') as ncd:
 
-            # Set an extras data
-            set_extra_data(ncd, extras_df)
-
             # We only want to apply metadata from the `attrs` map if the variable is already in
             # the netCDF file or it is a scalar variable (no shape defined). This avoids
             # creating measured variables that were not measured in this profile.
@@ -359,21 +353,22 @@ def create_netcdf(attrs, data, output_path, mode, profile_id_type=ProfileIdTypes
     # Create NetCDF Files for Each Profile
     written_files = []
 
-    for df in [data, extras_df]:
+    reserved_columns = [
+        'trajectory',
+        'profile',
+        't',
+        'x',
+        'y',
+        'z',
+        'u_orig',
+        'v_orig'
+    ]
 
+    for df in [data, extras_df]:
         # Optionally, remove any variables from the dataframe that do not have metadata assigned
         if subset is True:
             all_columns = set(df.columns)
-            reserved_columns = [
-                'trajectory',
-                'profile',
-                't',
-                'x',
-                'y',
-                'z',
-                'u_orig',
-                'v_orig'
-            ]
+
             removable_columns = all_columns - set(reserved_columns)
             orphans = removable_columns - set(attrs.get('variables', {}).keys())
             L.debug(
@@ -393,11 +388,39 @@ def create_netcdf(attrs, data, output_path, mode, profile_id_type=ProfileIdTypes
 
         profile_extras = pd.DataFrame()
         if not extras_df.empty:
-            profile_extras = extras_df.loc[extras_df.profile == pi]
+
+            # Write the extras dimension to a new profile file
+            profile_extras = extras_df.loc[extras_df.profile == pi].copy()
+            if profile_extras.empty:
+                continue
+
+            # Standardize the columns of the "extras" from the matched profile
+            profile_extras.loc[:, 't'] = profile_extras.index
+            profile_extras = profile_extras.reset_index(drop=True)
+            profile_extras.loc[:, 'x'] = profile.x.dropna().iloc[0]
+            profile_extras.loc[:, 'y'] = profile.y.dropna().iloc[0]
+
+            # Fill in extras with empty data
+            for c in profile:
+                if c not in profile_extras:
+                    profile_extras.loc[:, c] = np.nan
+                    profile_extras.loc[:, c] = profile_extras[c].astype(profile[c].dtype)
+
+            # Fill in regular profile with empty data
+            for c in profile_extras:
+                if c not in profile:
+                    profile.loc[:, c] = np.nan
+                    profile.loc[:, c] = profile[c].astype(profile_extras[c].dtype)
+
+            try:
+                cr = create_profile_netcdf(attrs, profile_extras, output_path, mode, profile_id_type)
+                written.append(cr)
+            except BaseException:
+                L.exception('Error creating extra netCDF profile {}. Skipping.'.format(pi))
+                continue
 
         try:
-            cr = create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type,
-                                       extras_df=profile_extras)
+            cr = create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type)
             written.append(cr)
         except BaseException:
             L.exception('Error creating netCDF for profile {}. Skipping.'.format(pi))
diff --git a/gutils/slocum/__init__.py b/gutils/slocum/__init__.py
index 8371988..8501f14 100644
--- a/gutils/slocum/__init__.py
+++ b/gutils/slocum/__init__.py
@@ -76,14 +76,28 @@ def extras(self, data, **kwargs):
             using an extras time dimension.
         """
 
-        ECOMETRICS_SENSORS = [ 'sci_echodroid_aggindex', 'sci_echodroid_ctrmass', 'sci_echodroid_eqarea', 'sci_echodroid_inertia', 'sci_echodroid_propocc', 'sci_echodroid_sa', 'sci_echodroid_sv']
-        PSEUDOGRAM_VARS = ['pseudogram_time', 'pseudogram_depth', 'pseudogram_sv']
+        ECOMETRICS_SENSORS = [
+            'sci_echodroid_aggindex',
+            'sci_echodroid_ctrmass',
+            'sci_echodroid_eqarea',
+            'sci_echodroid_inertia',
+            'sci_echodroid_propocc',
+            'sci_echodroid_sa',
+            'sci_echodroid_sv',
+        ]
+
+        PSEUDOGRAM_VARS = [
+            'pseudogram_time',
+            'pseudogram_depth',
+            'pseudogram_sv',
+        ]
 
         # Default extra settings
         pseudograms_attrs = kwargs.get('pseudograms', {})
-        enable_pseudograms = pseudograms_attrs.get('enable', False)
+        enable_nc = pseudograms_attrs.get('enable_nc', False)
+        enable_ascii = pseudograms_attrs.get('enable_ascii', False)
 
-        if enable_pseudograms:
+        if enable_nc and enable_ascii:
 
             # Two possible outcomes:
             #     (1) If the pseudogram exists, align ecometrics data along
@@ -123,13 +137,18 @@ def extras(self, data, **kwargs):
             # Create ECOMETRICS variable placeholders
             if have_pseudogram:
                 # ecometrics data is inserted into time data as provided by the pseudogram
+                size = len(self._extras['pseudogram_time'])
                 for sensor in ECOMETRICS_SENSORS:
-                    self._extras[sensor] = np.full((len(self._extras['pseudogram_time'])), np.nan)
+                    self._extras[sensor] = np.full(size, np.nan)
+
             else:
                 # with a missing pseudogram, we can use a shorter list of times
                 # we have to create placeholders for PSEUDOGRAM and ECOMETRICS variables
-                for sensor in PSEUDOGRAM_VARS + ECOMETRICS_SENSORS:
-                    self._extras[sensor] = np.full((len(ecometricsData)), np.nan)
+                size = len(ecometricsData)
+                for sensor in ECOMETRICS_SENSORS:
+                    self._extras[sensor] = np.full(size, np.nan)
+                for sensor in PSEUDOGRAM_VARS:
+                    self._extras[sensor] = np.full(size, np.nan)
 
             if have_pseudogram:
                 for _, row in ecometricsData.iterrows():
@@ -172,15 +191,24 @@ def extras(self, data, **kwargs):
                 self._extras.pseudogram_time, unit='s', origin='unix'
             )
 
-            if have_pseudogram:
-                self._extras = self._extras.sort_values([
-                    'pseudogram_time',
-                    'pseudogram_depth'
-                ])
-            else:
-                self._extras = self._extras.sort_values(['pseudogram_time'])
-
-            self._extras.set_index("pseudogram_time", inplace=True)
+            if not self._extras['pseudogram_depth'].any():
+                self._extras.loc[:, 'pseudogram_depth'] = 0.0
+
+            self._extras = self._extras.sort_values([
+                'pseudogram_time',
+                'pseudogram_depth'
+            ])
+
+            # Return a "standardized" dataframe with "t" as the index
+            # and a column named "z".
+            self._extras.rename(
+                columns={
+                    "pseudogram_time": "t",
+                    "pseudogram_depth": "z"
+                },
+                inplace=True
+            )
+            self._extras.set_index("t", inplace=True)
 
         return self._extras, data
 
@@ -518,8 +546,8 @@ def convert(self):
         ]
 
         pseudograms_attrs = self.extra_kwargs.get('pseudograms', {})
-        have_pseudograms = pseudograms_attrs.get('enable', False)
-        if have_pseudograms:
+        enable_ascii = pseudograms_attrs.get('enable_ascii', False)
+        if enable_ascii:
             # Perform pseudograms if this ASCII file matches the deployment
             # name of things we know to have the data. There needs to be a
             # better way to figure this out, but we don't have any understanding
@@ -534,25 +562,19 @@ def convert(self):
             # https://github.com/smerckel/dbdreader
 
             # Defaults
-            create_images = pseudograms_attrs.get('create_images', False)
+            enable_image = pseudograms_attrs.get('enable_image', False)
             echosounderRange = pseudograms_attrs.get('echosounderRange', 60.0)
             echosounderDirection = pseudograms_attrs.get('echosounderDirection', 'down')
             if echosounderDirection == 'up':
                 echosounderRange = - (echosounderRange)
 
-            if create_images:
-                pargs = pargs + [
-                    '-y', sys.executable,
-                    '-g',  # Makes the pseudogram ASCII
-                    '-i',  # Makes the pseudogram images. This is slow!
-                    '-r', f"{echosounderRange}"
-                ]
-            else:
-                pargs = pargs + [
-                    '-y', sys.executable,
-                    '-g',  # Makes the pseudogram ASCII
-                    '-r', f"{echosounderRange}"
-                ]
+            pargs = pargs + [
+                '-y', sys.executable,
+                '-g',  # Makes the pseudogram ASCII
+                '-r', f"{echosounderRange}"
+            ]
+            if enable_image:
+                pargs.append('-i')  # Makes the pseudogram images. This is slow!
 
         pargs.append(self.tmpdir)
         pargs.append(self.destination_directory)
diff --git a/gutils/templates/slocum_dac.json b/gutils/templates/slocum_dac.json
index aaca414..8d9dedd 100644
--- a/gutils/templates/slocum_dac.json
+++ b/gutils/templates/slocum_dac.json
@@ -1005,32 +1005,8 @@
         "_FillValue": {"type": "float", "data": -9999.9}
       }
     },
-    "pseudogram_time": {
-      "type": "double",
-      "attributes": {
-        "long_name": "Pseudogram Time",
-        "ioos_category": "Other",
-        "standard_name": "pseudogram_time",
-        "platform": "platform",
-        "observation_type": "measured",
-        "_FillValue": {"type": "double", "data": -1}
-      }
-    },
-    "pseudogram_depth": {
-      "type": "double",
-      "attributes": {
-        "units": "m",
-        "long_name": "Pseudogram Depth",
-        "valid_min": 0.0,
-        "valid_max": 2000.0,
-        "ioos_category": "Other",
-        "standard_name": "pseudogram_depth",
-        "platform": "platform",
-        "observation_type": "measured",
-        "_FillValue": {"type": "double", "data": -9999.9}
-      }
-    },
     "pseudogram_sv": {
+      "shape": ["time"],
       "type": "double",
       "attributes": {
         "units": "db",
@@ -1041,7 +1017,6 @@
         "standard_name": "pseudogram_sv",
         "platform": "platform",
         "observation_type": "measured",
-        "coordinates": "pseudogram_time pseudogram_depth",
         "_FillValue": {"type": "double", "data": -9999.9}
       }
     }
diff --git a/gutils/tests/resources/slocum/ecometrics/config/deployment.json b/gutils/tests/resources/slocum/ecometrics/config/deployment.json
index 2b45675..40ab93e 100644
--- a/gutils/tests/resources/slocum/ecometrics/config/deployment.json
+++ b/gutils/tests/resources/slocum/ecometrics/config/deployment.json
@@ -1,6 +1,16 @@
 {
     "glider": "ecometrics",
     "trajectory_date": "20220212T0000",
+    "extra_kwargs": {
+        "pseudograms": {
+            "enable_nc": false,
+            "enable_ascii": false,
+            "enable_image": false,
+            "echosounderRange": 60.0,
+            "echosounderDirection": "up",
+            "echosounderRangeUnits": "meters"
+        }
+    },
     "attributes": {
         "acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
         "comment": "",
diff --git a/gutils/tests/resources/slocum/ecometrics2/config/deployment.json b/gutils/tests/resources/slocum/ecometrics2/config/deployment.json
index 5d2baab..5797229 100644
--- a/gutils/tests/resources/slocum/ecometrics2/config/deployment.json
+++ b/gutils/tests/resources/slocum/ecometrics2/config/deployment.json
@@ -1,6 +1,16 @@
 {
     "glider": "ecometrics",
     "trajectory_date": "20220212T0000",
+    "extra_kwargs": {
+        "pseudograms": {
+            "enable_nc": true,
+            "enable_ascii": true,
+            "enable_image": false,
+            "echosounderRange": 60.0,
+            "echosounderDirection": "down",
+            "echosounderRangeUnits": "meters"
+        }
+    },
     "attributes": {
         "acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
         "comment": "",
@@ -36,15 +46,6 @@
         "title": "G507 Slocum Glider Dataset (Feb 2022)",
         "wmo_id": 4802989
     },
-    "extra_kwargs": {
-        "pseudograms": {
-            "enable": true,
-            "create_images": false,
-            "echosounderRange": 60.0,
-            "echosounderDirection": "down",
-            "echosounderRangeUnits": "meters"
-        }
-    },
     "variables": {
         "platform": {
             "attributes": {
diff --git a/gutils/tests/resources/slocum/ecometrics3/config/deployment.json b/gutils/tests/resources/slocum/ecometrics3/config/deployment.json
index 2a15c89..17354aa 100644
--- a/gutils/tests/resources/slocum/ecometrics3/config/deployment.json
+++ b/gutils/tests/resources/slocum/ecometrics3/config/deployment.json
@@ -8,6 +8,16 @@
         "filter_points": 5,
         "filter_distance": 1
     },
+    "extra_kwargs": {
+        "pseudograms": {
+            "enable_nc": false,
+            "enable_ascii": false,
+            "enable_image": false,
+            "echosounderRange": 60.0,
+            "echosounderDirection": "up",
+            "echosounderRangeUnits": "meters"
+        }
+    },
     "attributes": {
         "acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
         "comment": "",
@@ -43,15 +53,6 @@
         "title": "G507 Slocum Glider Dataset (Feb 2022)",
         "wmo_id": 4802989
     },
-    "extra_kwargs": {
-        "pseudograms": {
-            "enable": false,
-            "create_images": true,
-            "echosounderRange": 60.0,
-            "echosounderDirection": "up",
-            "echosounderRangeUnits": "meters"
-        }
-    },
     "variables": {
         "platform": {
             "attributes": {
diff --git a/gutils/tests/resources/slocum/ecometrics4/config/deployment.json b/gutils/tests/resources/slocum/ecometrics4/config/deployment.json
index b6df20d..98d4e41 100644
--- a/gutils/tests/resources/slocum/ecometrics4/config/deployment.json
+++ b/gutils/tests/resources/slocum/ecometrics4/config/deployment.json
@@ -8,6 +8,16 @@
         "filter_points": 5,
         "filter_distance": 1
     },
+    "extra_kwargs": {
+        "pseudograms": {
+            "enable_nc": false,
+            "enable_ascii": true,
+            "enable_image": false,
+            "echosounderRange": 60.0,
+            "echosounderDirection": "up",
+            "echosounderRangeUnits": "meters"
+        }
+    },
     "attributes": {
         "acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
         "comment": "",
@@ -43,15 +53,6 @@
         "title": "G507 Slocum Glider Dataset (Feb 2022)",
         "wmo_id": 4802989
     },
-    "extra_kwargs": {
-        "pseudograms": {
-            "enable": true,
-            "create_images": false,
-            "echosounderRange": 60.0,
-            "echosounderDirection": "up",
-            "echosounderRangeUnits": "meters"
-        }
-    },
     "variables": {
         "platform": {
             "attributes": {

From 18268cc16d4c3826a689e9d0762ca81baa408706 Mon Sep 17 00:00:00 2001
From: Kyle Wilcox <kyle@axiomdatascience.com>
Date: Mon, 25 Apr 2022 21:57:01 -0400
Subject: [PATCH 2/3] Fix pseudogram test

---
 gutils/tests/test_slocum.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gutils/tests/test_slocum.py b/gutils/tests/test_slocum.py
index 2c80648..e754a83 100644
--- a/gutils/tests/test_slocum.py
+++ b/gutils/tests/test_slocum.py
@@ -393,17 +393,19 @@ def test_pseudogram(self):
 
         output_files = sorted(os.listdir(self.netcdf_path))
         output_files = [ os.path.join(self.netcdf_path, o) for o in output_files ]
-        assert len(output_files) == 17
+        assert len(output_files) == 33
 
         # First profile
         with nc4.Dataset(output_files[0]) as ncd:
             assert ncd.variables['profile_id'].ndim == 0
+            # first time in the first profile
             assert ncd.variables['profile_id'][0] == 1639020410
 
         # Last profile
         with nc4.Dataset(output_files[-1]) as ncd:
             assert ncd.variables['profile_id'].ndim == 0
-            assert ncd.variables['profile_id'][0] == 1639069272
+            # first time in the last echodroid profile
+            assert ncd.variables['profile_id'][0] == 1639070632
 
         # Check netCDF file for compliance
         ds = namedtuple('Arguments', ['file'])

From 49af4a01f6f1ed7b13768873accf1c683c51f3b1 Mon Sep 17 00:00:00 2001
From: Kyle Wilcox <kyle@axiomdatascience.com>
Date: Mon, 25 Apr 2022 22:26:40 -0400
Subject: [PATCH 3/3] Counting is not my thing tonight

---
 gutils/tests/test_slocum.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gutils/tests/test_slocum.py b/gutils/tests/test_slocum.py
index e754a83..742b0e2 100644
--- a/gutils/tests/test_slocum.py
+++ b/gutils/tests/test_slocum.py
@@ -393,7 +393,7 @@ def test_pseudogram(self):
 
         output_files = sorted(os.listdir(self.netcdf_path))
         output_files = [ os.path.join(self.netcdf_path, o) for o in output_files ]
-        assert len(output_files) == 33
+        assert len(output_files) == 32
 
         # First profile
         with nc4.Dataset(output_files[0]) as ncd: