Merge pull request #18 from SECOORA/extras-as-profile-files
Move the "extras" data to be in their own profile netCDF file
kwilcox committed Apr 26, 2022
2 parents ebbe59d + 49af4a0 commit f32d479
Showing 9 changed files with 145 additions and 125 deletions.
17 changes: 1 addition & 16 deletions gutils/filters.py
@@ -2,7 +2,6 @@
# coding=utf-8
import os
import pandas as pd
import numpy as np

from gutils.yo import assign_profiles

@@ -177,23 +176,9 @@ def process_dataset(file,
tolerance=pd.Timedelta(minutes=10)
).set_index(extras.index)
extras['profile'] = merge.profile.ffill()

# To have consistent netCDF files, empty "extras" variables need to exist
# for each valid profile calculated above in "filtered".
profile_list = set(filtered['profile'].unique())
extras_list = set(extras['profile'].unique().astype('int32'))
profiles_to_add = profile_list.difference(extras_list)
if profiles_to_add:
first_t_in_profiles = filtered.groupby(by=["profile"]).min()['t']
for profile_to_add in profiles_to_add:
empty_df = pd.DataFrame([[np.nan] * len(extras.columns)], columns=extras.columns)
empty_df['profile'] = profile_to_add
empty_df['pseudogram_time'] = first_t_in_profiles[profile_to_add]
empty_df.set_index('pseudogram_time', inplace=True)
extras = pd.concat([extras, empty_df], sort=True)

except BaseException as e:
L.error(f"Could not merge 'extras' data, skipping: {e}")
extras = pd.DataFrame()

except ValueError as e:
L.exception('{} - Skipping'.format(e))
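For context, a minimal sketch (not part of the diff) of the nearest-timestamp alignment that survives in process_dataset; the pd.merge_asof call itself is truncated in the hunk above, so the direction argument and the example column values are assumptions:

import pandas as pd

# Profiles already assigned by assign_profiles(): one row per sample time.
filtered = pd.DataFrame({
    't': pd.to_datetime(['2022-02-12 00:00', '2022-02-12 00:30']),
    'profile': [0, 1],
})

# "Extras" rows (e.g. pseudogram samples) indexed by their own time axis.
extras = pd.DataFrame(
    {'pseudogram_sv': [-70.0, -65.0]},
    index=pd.to_datetime(['2022-02-12 00:02', '2022-02-12 00:31']),
)

# Align each extras row to the nearest profile time within 10 minutes,
# then forward-fill the resulting profile numbers.
merge = pd.merge_asof(
    extras,
    filtered[['t', 'profile']],
    left_index=True,
    right_on='t',
    direction='nearest',            # assumption: truncated in the hunk above
    tolerance=pd.Timedelta(minutes=10),
).set_index(extras.index)
extras['profile'] = merge.profile.ffill()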
67 changes: 45 additions & 22 deletions gutils/nc.py
@@ -14,6 +14,7 @@
from datetime import datetime
from collections import OrderedDict

import numpy as np
import pandas as pd
import netCDF4 as nc4
from compliance_checker.runner import ComplianceChecker, CheckSuite
@@ -191,11 +192,7 @@ def get_creation_attributes(profile):
}


def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH,
extras_df=None):

if extras_df is None:
extras_df = pd.DataFrame()
def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH):

try:
# Path to hold file while we create it
@@ -281,9 +278,6 @@ def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=Pro
reduce_dims=True,
mode='a') as ncd:

# Set the extras data
set_extra_data(ncd, extras_df)

# We only want to apply metadata from the `attrs` map if the variable is already in
# the netCDF file or it is a scalar variable (no shape defined). This avoids
# creating measured variables that were not measured in this profile.
@@ -359,21 +353,22 @@ def create_netcdf(attrs, data, output_path, mode, profile_id_type=ProfileIdTypes
# Create NetCDF Files for Each Profile
written_files = []

for df in [data, extras_df]:
reserved_columns = [
'trajectory',
'profile',
't',
'x',
'y',
'z',
'u_orig',
'v_orig'
]

for df in [data, extras_df]:
# Optionally, remove any variables from the dataframe that do not have metadata assigned
if subset is True:
all_columns = set(df.columns)
reserved_columns = [
'trajectory',
'profile',
't',
'x',
'y',
'z',
'u_orig',
'v_orig'
]

removable_columns = all_columns - set(reserved_columns)
orphans = removable_columns - set(attrs.get('variables', {}).keys())
L.debug(
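A minimal, hypothetical sketch of the metadata-subset step above: any column that is neither reserved nor described in the attrs "variables" map is treated as an orphan and dropped before profiles are written (the drop itself is truncated in this hunk, so the last line is an assumption):

import pandas as pd

attrs = {'variables': {'temperature': {}}}
df = pd.DataFrame(columns=['t', 'x', 'y', 'z', 'profile', 'temperature', 'sci_extra_sensor'])

reserved_columns = [
    'trajectory', 'profile', 't', 'x', 'y', 'z', 'u_orig', 'v_orig'
]
removable_columns = set(df.columns) - set(reserved_columns)
orphans = removable_columns - set(attrs.get('variables', {}).keys())
df = df.drop(columns=list(orphans))  # keeps the reserved columns plus "temperature"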
@@ -393,11 +388,39 @@

profile_extras = pd.DataFrame()
if not extras_df.empty:
profile_extras = extras_df.loc[extras_df.profile == pi]

# Write the extras dimension to a new profile file
profile_extras = extras_df.loc[extras_df.profile == pi].copy()
if profile_extras.empty:
continue

# Standardize the columns of the "extras" from the matched profile
profile_extras.loc[:, 't'] = profile_extras.index
profile_extras = profile_extras.reset_index(drop=True)
profile_extras.loc[:, 'x'] = profile.x.dropna().iloc[0]
profile_extras.loc[:, 'y'] = profile.y.dropna().iloc[0]

# Fill in extras with empty data
for c in profile:
if c not in profile_extras:
profile_extras.loc[:, c] = np.nan
profile_extras.loc[:, c] = profile_extras[c].astype(profile[c].dtype)

# Fill in regular profile with empty data
for c in profile_extras:
if c not in profile:
profile.loc[:, c] = np.nan
profile.loc[:, c] = profile[c].astype(profile_extras[c].dtype)

try:
cr = create_profile_netcdf(attrs, profile_extras, output_path, mode, profile_id_type)
written.append(cr)
except BaseException:
L.exception('Error creating extra netCDF profile {}. Skipping.'.format(pi))
continue

try:
cr = create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type,
extras_df=profile_extras)
cr = create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type)
written.append(cr)
except BaseException:
L.exception('Error creating netCDF for profile {}. Skipping.'.format(pi))
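To summarize the new flow above: each profile now produces two netCDF files, one for the regular data and one for the "extras", and both frames are padded so they share a schema. A hypothetical helper capturing that padding (float-compatible dtypes assumed, as with the measured columns here):

import numpy as np
import pandas as pd

def align_columns(profile: pd.DataFrame, profile_extras: pd.DataFrame):
    # Pad the extras frame with any column only present in the profile frame,
    # then cast it to the profile's dtype so the two files stay consistent.
    for c in profile.columns:
        if c not in profile_extras.columns:
            profile_extras[c] = np.nan
            profile_extras[c] = profile_extras[c].astype(profile[c].dtype)
    # And the reverse: pad the profile frame with extras-only columns.
    for c in profile_extras.columns:
        if c not in profile.columns:
            profile[c] = np.nan
            profile[c] = profile[c].astype(profile_extras[c].dtype)
    return profile, profile_extras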
86 changes: 54 additions & 32 deletions gutils/slocum/__init__.py
@@ -76,14 +76,28 @@ def extras(self, data, **kwargs):
using an extras time dimension.
"""

ECOMETRICS_SENSORS = [ 'sci_echodroid_aggindex', 'sci_echodroid_ctrmass', 'sci_echodroid_eqarea', 'sci_echodroid_inertia', 'sci_echodroid_propocc', 'sci_echodroid_sa', 'sci_echodroid_sv']
PSEUDOGRAM_VARS = ['pseudogram_time', 'pseudogram_depth', 'pseudogram_sv']
ECOMETRICS_SENSORS = [
'sci_echodroid_aggindex',
'sci_echodroid_ctrmass',
'sci_echodroid_eqarea',
'sci_echodroid_inertia',
'sci_echodroid_propocc',
'sci_echodroid_sa',
'sci_echodroid_sv',
]

PSEUDOGRAM_VARS = [
'pseudogram_time',
'pseudogram_depth',
'pseudogram_sv',
]

# Default extra settings
pseudograms_attrs = kwargs.get('pseudograms', {})
enable_pseudograms = pseudograms_attrs.get('enable', False)
enable_nc = pseudograms_attrs.get('enable_nc', False)
enable_ascii = pseudograms_attrs.get('enable_ascii', False)

if enable_pseudograms:
if enable_nc and enable_ascii:

# Two possible outcomes:
# (1) If the pseudogram exists, align ecometrics data along
@@ -123,13 +137,18 @@ def extras(self, data, **kwargs):
# Create ECOMETRICS variable placeholders
if have_pseudogram:
# ecometrics data is inserted into time data as provided by the pseudogram
size = len(self._extras['pseudogram_time'])
for sensor in ECOMETRICS_SENSORS:
self._extras[sensor] = np.full((len(self._extras['pseudogram_time'])), np.nan)
self._extras[sensor] = np.full(size, np.nan)

else:
# with a missing pseudogram, we can use a shorter list of times
# we have to create placeholders for PSEUDOGRAM and ECOMETRICS variables
for sensor in PSEUDOGRAM_VARS + ECOMETRICS_SENSORS:
self._extras[sensor] = np.full((len(ecometricsData)), np.nan)
size = len(ecometricsData)
for sensor in ECOMETRICS_SENSORS:
self._extras[sensor] = np.full(size, np.nan)
for sensor in PSEUDOGRAM_VARS:
self._extras[sensor] = np.full(size, np.nan)

if have_pseudogram:
for _, row in ecometricsData.iterrows():
@@ -172,15 +191,24 @@ def extras(self, data, **kwargs):
self._extras.pseudogram_time, unit='s', origin='unix'
)

if have_pseudogram:
self._extras = self._extras.sort_values([
'pseudogram_time',
'pseudogram_depth'
])
else:
self._extras = self._extras.sort_values(['pseudogram_time'])

self._extras.set_index("pseudogram_time", inplace=True)
if not self._extras['pseudogram_depth'].any():
self._extras.loc[:, 'pseudogram_depth'] = 0.0

self._extras = self._extras.sort_values([
'pseudogram_time',
'pseudogram_depth'
])

# Return a "standardized" dataframe with "t" as the index
# and a column named "z".
self._extras.rename(
columns={
"pseudogram_time": "t",
"pseudogram_depth": "z"
},
inplace=True
)
self._extras.set_index("t", inplace=True)

return self._extras, data
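For illustration, a small hypothetical example of the "standardized" frame returned above: pseudogram_time/pseudogram_depth are renamed to "t"/"z", "t" becomes the index, and the ecometrics sensors ride along as ordinary columns (placeholder NaNs when only the pseudogram is present):

import numpy as np
import pandas as pd

extras = pd.DataFrame({
    'pseudogram_time': pd.to_datetime([1644624000, 1644624001], unit='s', origin='unix'),
    'pseudogram_depth': [5.0, 10.0],
    'pseudogram_sv': [-70.0, -65.0],
    'sci_echodroid_sv': [np.nan, np.nan],   # ecometrics placeholder
})

extras = extras.sort_values(['pseudogram_time', 'pseudogram_depth'])
extras = extras.rename(columns={'pseudogram_time': 't', 'pseudogram_depth': 'z'})
extras = extras.set_index('t')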

@@ -518,8 +546,8 @@ def convert(self):
]

pseudograms_attrs = self.extra_kwargs.get('pseudograms', {})
have_pseudograms = pseudograms_attrs.get('enable', False)
if have_pseudograms:
enable_ascii = pseudograms_attrs.get('enable_ascii', False)
if enable_ascii:
# Perform pseudograms if this ASCII file matches the deployment
# name of things we know to have the data. There needs to be a
# better way to figure this out, but we don't have any understanding
@@ -534,25 +562,19 @@
# https://github.com/smerckel/dbdreader

# Defaults
create_images = pseudograms_attrs.get('create_images', False)
enable_image = pseudograms_attrs.get('enable_image', False)
echosounderRange = pseudograms_attrs.get('echosounderRange', 60.0)
echosounderDirection = pseudograms_attrs.get('echosounderDirection', 'down')
if echosounderDirection == 'up':
echosounderRange = - (echosounderRange)

if create_images:
pargs = pargs + [
'-y', sys.executable,
'-g', # Makes the pseudogram ASCII
'-i', # Makes the pseudogram images. This is slow!
'-r', f"{echosounderRange}"
]
else:
pargs = pargs + [
'-y', sys.executable,
'-g', # Makes the pseudogram ASCII
'-r', f"{echosounderRange}"
]
pargs = pargs + [
'-y', sys.executable,
'-g', # Makes the pseudogram ASCII
'-r', f"{echosounderRange}"
]
if enable_image:
pargs.append('-i') # Makes the pseudogram images. This is slow!

pargs.append(self.tmpdir)
pargs.append(self.destination_directory)
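The same argument assembly as above, condensed into a standalone sketch; the base command and the other entries in pargs are truncated in this hunk, so only the flags shown here are taken from the diff:

import sys

pseudograms_attrs = {
    'enable_ascii': True,
    'enable_image': False,
    'echosounderRange': 60.0,
    'echosounderDirection': 'up',
}

echosounderRange = pseudograms_attrs.get('echosounderRange', 60.0)
if pseudograms_attrs.get('echosounderDirection', 'down') == 'up':
    echosounderRange = -echosounderRange   # an upward-looking echosounder uses a negative range

pargs = []                                 # base command omitted here
pargs += [
    '-y', sys.executable,
    '-g',                                  # makes the pseudogram ASCII
    '-r', f'{echosounderRange}',
]
if pseudograms_attrs.get('enable_image', False):
    pargs.append('-i')                     # makes the pseudogram images (slow)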
27 changes: 1 addition & 26 deletions gutils/templates/slocum_dac.json
@@ -1005,32 +1005,8 @@
"_FillValue": {"type": "float", "data": -9999.9}
}
},
"pseudogram_time": {
"type": "double",
"attributes": {
"long_name": "Pseudogram Time",
"ioos_category": "Other",
"standard_name": "pseudogram_time",
"platform": "platform",
"observation_type": "measured",
"_FillValue": {"type": "double", "data": -1}
}
},
"pseudogram_depth": {
"type": "double",
"attributes": {
"units": "m",
"long_name": "Pseudogram Depth",
"valid_min": 0.0,
"valid_max": 2000.0,
"ioos_category": "Other",
"standard_name": "pseudogram_depth",
"platform": "platform",
"observation_type": "measured",
"_FillValue": {"type": "double", "data": -9999.9}
}
},
"pseudogram_sv": {
"shape": ["time"],
"type": "double",
"attributes": {
"units": "db",
@@ -1041,7 +1017,6 @@
"standard_name": "pseudogram_sv",
"platform": "platform",
"observation_type": "measured",
"coordinates": "pseudogram_time pseudogram_depth",
"_FillValue": {"type": "double", "data": -9999.9}
}
}
10 changes: 10 additions & 0 deletions gutils/tests/resources/slocum/ecometrics/config/deployment.json
@@ -1,6 +1,16 @@
{
"glider": "ecometrics",
"trajectory_date": "20220212T0000",
"extra_kwargs": {
"pseudograms": {
"enable_nc": false,
"enable_ascii": false,
"enable_image": false,
"echosounderRange": 60.0,
"echosounderDirection": "up",
"echosounderRangeUnits": "meters"
}
},
"attributes": {
"acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
"comment": "",
19 changes: 10 additions & 9 deletions gutils/tests/resources/slocum/ecometrics2/config/deployment.json
@@ -1,6 +1,16 @@
{
"glider": "ecometrics",
"trajectory_date": "20220212T0000",
"extra_kwargs": {
"pseudograms": {
"enable_nc": true,
"enable_ascii": true,
"enable_image": false,
"echosounderRange": 60.0,
"echosounderDirection": "down",
"echosounderRangeUnits": "meters"
}
},
"attributes": {
"acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
"comment": "",
@@ -36,15 +46,6 @@
"title": "G507 Slocum Glider Dataset (Feb 2022)",
"wmo_id": 4802989
},
"extra_kwargs": {
"pseudograms": {
"enable": true,
"create_images": false,
"echosounderRange": 60.0,
"echosounderDirection": "down",
"echosounderRangeUnits": "meters"
}
},
"variables": {
"platform": {
"attributes": {
19 changes: 10 additions & 9 deletions gutils/tests/resources/slocum/ecometrics3/config/deployment.json
@@ -8,6 +8,16 @@
"filter_points": 5,
"filter_distance": 1
},
"extra_kwargs": {
"pseudograms": {
"enable_nc": false,
"enable_ascii": false,
"enable_image": false,
"echosounderRange": 60.0,
"echosounderDirection": "up",
"echosounderRangeUnits": "meters"
}
},
"attributes": {
"acknowledgement": "This work was supported by funding from NOAA/IOOS/AOOS.",
"comment": "",
@@ -43,15 +53,6 @@
"title": "G507 Slocum Glider Dataset (Feb 2022)",
"wmo_id": 4802989
},
"extra_kwargs": {
"pseudograms": {
"enable": false,
"create_images": true,
"echosounderRange": 60.0,
"echosounderDirection": "up",
"echosounderRangeUnits": "meters"
}
},
"variables": {
"platform": {
"attributes": {
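Taken together, the three deployment.json changes standardize the relocated extra_kwargs block on the new flag names. A hedged sketch of how that block is read back by the code in this PR (paths and plumbing simplified):

import json

with open('config/deployment.json') as f:          # path from the test resources above
    deployment = json.load(f)

pseudograms_attrs = deployment.get('extra_kwargs', {}).get('pseudograms', {})
enable_nc = pseudograms_attrs.get('enable_nc', False)        # gates the extras profile netCDF path
enable_ascii = pseudograms_attrs.get('enable_ascii', False)  # gates the pseudogram ASCII step in convert()
enable_image = pseudograms_attrs.get('enable_image', False)  # gates the (slow) pseudogram images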