Bug fix for __read_csv() method to now read RISMA network data of "header & files" format again (#67)

* PEP8 and typing

reformatted code according to the PEP8 style guide and added missing type hints

* changed __read_csv

use a regular function instead of a lambda function inside __read_csv

* pep8

* updated __read_csv

__read_csv had problems with reading faulty sensor files. This is now fixed by introducing **kwargs as additional arguments

* small correction and pep8

* fix: all files of format 'header values' can now be parsed correctly by pandas

kept the **kwargs in the __read_csv method and added the exception 'pd.errors.ParserError as text_exception' (see the sketch after this list)

* preparation for sphinx

started preparation for a Sphinx documentation

* deleted sphinx files

* test

* removed comments

* added short file describing the original problem and the fix, added test data

* minor revision

* minor revision

* minor revision

* minor revision

* minor revision

* minor revision

* new branch for USDA-ARS debugging

* pep8 style check now done by yapf

* small adaptations

* made the required changes for the pull request

* yapf pep8

* replaced tuple[] with typing.Tuple[]

* added line to CHANGELOG.rst

* added line to CHANGELOG.rst at the correct position this time

* Removed the now ignored directory ".vscode"

* Deleted "RISMA_test_data" directory
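The fix described in the bullets above boils down to retrying with whitespace-delimited parsing when pandas raises a ParserError. A minimal sketch of the pattern — the function name and simplified arguments are illustrative, not the actual method; see the src/ismn/filehandlers.py diff below for the real implementation, which also handles names, usecols, skiprows, and zipped archives:

import pandas as pd

def read_sensor_file(path, **kwargs):
    # First attempt: parse with the caller-supplied arguments.
    try:
        return pd.read_csv(path, parse_dates=[[0, 1]], engine="c", **kwargs)
    except pd.errors.ParserError:
        # Fallback for faulty files: treat any run of whitespace as a
        # single delimiter instead of a fixed separator.
        return pd.read_csv(
            path, delim_whitespace=True, parse_dates=[[0, 1]], engine="c")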
nfb2021 authored May 30, 2023
1 parent 08ce411 commit bcb9c37
Showing 9 changed files with 113 additions and 81 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -53,4 +53,7 @@ MANIFEST
 **/python_metadata/*
 networks/*
 .artifacts/*
-.coverage*
+.coverage*
+
+# VSC config file
+.vscode/*
2 changes: 1 addition & 1 deletion CHANGELOG.rst
@@ -5,7 +5,7 @@ Changelog
 Unreleased changes in master branch
 ===================================
 
--
+- Fixed bug with parsing sensor files for RISMA network in "header & files" format
 
 Version 1.3.3
 =============
6 changes: 2 additions & 4 deletions src/ismn/base.py
@@ -127,10 +127,8 @@ def clean_subpath(self, subpath) -> Union[Path, PurePosixPath]:
             subpath = PurePosixPath(subpath)
         else:
             if not (self.path / Path(subpath)).exists():
-                raise ValueError(
-                    f"Subpath {subpath} does not exist"
-                    f" in archive {self.path}"
-                )
+                raise ValueError(f"Subpath {subpath} does not exist"
+                                 f" in archive {self.path}")
 
         return subpath
5 changes: 3 additions & 2 deletions src/ismn/components.py
@@ -686,8 +686,9 @@ def __repr__(self, indent: str = ""):
             for net in self.networks.values()
         ])
 
-    def __getitem__(self, item: Union[int, str, list]) -> \
-            Union["NetworkCollection", Network]:
+    def __getitem__(
+            self, item: Union[int, str,
+                              list]) -> Union["NetworkCollection", Network]:
         # shortcut to access networks directly
         if isinstance(item, (int, str)):
            if isinstance(item, int):
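The reformatted __getitem__ above keeps its behavior: indexing with an int or str yields a single Network, while a list appears to yield a NetworkCollection (inferred from the Union return annotation, not stated in this diff). A hypothetical usage sketch, network names illustrative:

# assuming `coll` is an ismn NetworkCollection
net_a = coll[0]                       # int -> Network
net_b = coll["RISMA"]                 # str -> Network
subset = coll[["RISMA", "USDA-ARS"]]  # list -> NetworkCollection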
15 changes: 7 additions & 8 deletions src/ismn/const.py
@@ -108,16 +108,15 @@ class DepthError(ValueError):
     "quantity_source_name",
 ]
 
-lc_num_vars = ['lc_2000', 'lc_2005', 'lc_2010']
-lc_str_vars = ['lc_insitu']
+lc_num_vars = ["lc_2000", "lc_2005", "lc_2010"]
+lc_str_vars = ["lc_insitu"]
 LC_VARS = [*lc_num_vars, *lc_str_vars]
-CLIM_VARS = ['climate_KG', 'climate_insitu']
+CLIM_VARS = ["climate_KG", "climate_insitu"]
 
-CSV_META_TEMPLATE_SURF_VAR = OrderedDict(
-    [(c, np.nan) for c in lc_num_vars] +
-    [(c, 'unknown') for c in lc_str_vars] +
-    [(c, 'unknown') for c in CLIM_VARS]
-)
+CSV_META_TEMPLATE_SURF_VAR = OrderedDict([(c, np.nan) for c in lc_num_vars] +
+                                         [(c, "unknown")
+                                          for c in lc_str_vars] +
+                                         [(c, "unknown") for c in CLIM_VARS])
 CSV_META_TEMPLATE_GROUND_VAR = OrderedDict([
     ("saturation", np.nan),
     ("clay_fraction", np.nan),
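The reformatted OrderedDict above is content-identical to the old one; for reference, it evaluates to the following (a standalone, runnable sketch built only from the lines in this hunk):

from collections import OrderedDict
import numpy as np

lc_num_vars = ["lc_2000", "lc_2005", "lc_2010"]
lc_str_vars = ["lc_insitu"]
CLIM_VARS = ["climate_KG", "climate_insitu"]

CSV_META_TEMPLATE_SURF_VAR = OrderedDict([(c, np.nan) for c in lc_num_vars] +
                                         [(c, "unknown") for c in lc_str_vars] +
                                         [(c, "unknown") for c in CLIM_VARS])
# OrderedDict([('lc_2000', nan), ('lc_2005', nan), ('lc_2010', nan),
#              ('lc_insitu', 'unknown'), ('climate_KG', 'unknown'),
#              ('climate_insitu', 'unknown')])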
75 changes: 38 additions & 37 deletions src/ismn/custom.py
@@ -82,21 +82,21 @@ class CustomStationMetadataCsv(CustomMetaReader):
 
     def __init__(self, station_meta_csv, fill_values=None, **kwargs):
         """
-        Parameters
-        ----------
-        station_meta_csv: str
-            Path to the csv file with the above described content
-        fill_values: dict, optional (default: None)
-            Values to use for a certain custom metadata variable, if no
-            match is found.
-        kwargs:
-            Additional kwargs as passed to :func:`pandas.read_csv`
-            To use a different separator than the default semicolon, use `sep`
+        Parameters
+        ----------
+        station_meta_csv: str
+            Path to the csv file with the above described content
+        fill_values: dict, optional (default: None)
+            Values to use for a certain custom metadata variable, if no
+            match is found.
+        kwargs:
+            Additional kwargs as passed to :func:`pandas.read_csv`
+            To use a different separator than the default semicolon, use `sep`
         """
-        if 'sep' in kwargs:
-            sep = kwargs.pop('sep')
+        if "sep" in kwargs:
+            sep = kwargs.pop("sep")
         else:
-            sep = ';'
+            sep = ";"
 
         self.fill_values = dict() if fill_values is None else fill_values
         self.df = pd.read_csv(station_meta_csv, sep=sep, **kwargs)

@@ -108,9 +108,8 @@ def _empty_var(self, varnames) -> list:
         """
         vars = []
         for var in varnames:
-            if var in self.fill_values.keys() and \
-                    not (var.endswith('_depth_from') or
-                         var.endswith('_depth_to')):
+            if var in self.fill_values.keys() and not (
+                    var.endswith("_depth_from") or var.endswith("_depth_to")):
                 vars.append(MetaVar(var, self.fill_values[var]))
         return vars
 
@@ -122,15 +121,15 @@ def _row2var(row: dict) -> list:
         vars = []
 
         for k, v in row.items():
-            if k.endswith('_depth_from') or k.endswith('_depth_to'):
+            if k.endswith("_depth_from") or k.endswith("_depth_to"):
                 continue
 
-            if f'{k}_depth_from' in row:
-                depth_from = row[f'{k}_depth_from']
+            if f"{k}_depth_from" in row:
+                depth_from = row[f"{k}_depth_from"]
             else:
                 depth_from = None
-            if f'{k}_depth_to' in row:
-                depth_to = row[f'{k}_depth_to']
+            if f"{k}_depth_to" in row:
+                depth_to = row[f"{k}_depth_to"]
             else:
                 depth_to = None

@@ -166,20 +165,20 @@ def read_metadata(self, meta: MetaData):
         """
 
-        cond = (self.df['network'] == meta['network'].val) & \
-               (self.df['station'] == meta['station'].val)
+        cond = (self.df["network"] == meta["network"].val) & (
+            self.df["station"] == meta["station"].val)
 
-        df = self.df[cond].set_index(['network', 'station'])
+        df = self.df[cond].set_index(["network", "station"])
 
         # drop potential duplicates, keep first
-        df = df[~df.index.duplicated(keep='first')]
+        df = df[~df.index.duplicated(keep="first")]
 
         vars = []
 
         if df.empty and (self.fill_values is not None):
             vars += self._empty_var(df.columns.values)
         else:
-            for row in df.to_dict('records'):
+            for row in df.to_dict("records"):
                 vars += self._row2var(row)
 
         return MetaData(vars)

@@ -215,25 +214,27 @@ def read_metadata(self, meta: MetaData):
         meta: Metadata
             Additional depth-dependent metadata at the location
         """
-        cond = (self.df['network'] == meta['network'].val) & \
-               (self.df['station'] == meta['station'].val) & \
-               (self.df['instrument'] == meta['instrument'].val) & \
-               (self.df['variable'] == meta['variable'].val) & \
-               (self.df['depth_from'] == meta['instrument'].depth[0]) & \
-               (self.df['depth_to'] == meta['instrument'].depth[1])
-
-        df = self.df[cond].set_index(
-            ['network', 'station', 'instrument', 'variable', 'depth_from', 'depth_to'])
+        cond = ((self.df["network"] == meta["network"].val)
+                & (self.df["station"] == meta["station"].val)
+                & (self.df["instrument"] == meta["instrument"].val)
+                & (self.df["variable"] == meta["variable"].val)
+                & (self.df["depth_from"] == meta["instrument"].depth[0])
+                & (self.df["depth_to"] == meta["instrument"].depth[1]))
+
+        df = self.df[cond].set_index([
+            "network", "station", "instrument", "variable", "depth_from",
+            "depth_to"
+        ])
 
         # drop potential duplicates, keep first
-        df = df[~df.index.duplicated(keep='first')]
+        df = df[~df.index.duplicated(keep="first")]
 
         vars = []
 
         if df.empty and (self.fill_values is not None):
             vars += self._empty_var(df.columns.values)
         else:
-            for row in df.to_dict('records'):
+            for row in df.to_dict("records"):
                 vars += self._row2var(row)
 
         return MetaData(vars)
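As the docstring above notes, extra kwargs are forwarded to pandas.read_csv, with `sep` overriding the default semicolon. A hypothetical usage sketch — the file name, variable name, and fill value are illustrative:

from ismn.custom import CustomStationMetadataCsv

meta_reader = CustomStationMetadataCsv(
    "station_meta.csv",            # csv with network;station;... columns
    fill_values={"myvar": -9999},  # used when no station match is found
    sep=",",                       # forwarded to pandas.read_csv; default ";"
)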
10 changes: 5 additions & 5 deletions src/ismn/filecollection.py
@@ -31,6 +31,7 @@
 from multiprocessing import Pool, cpu_count
 from operator import itemgetter
 import time
+from typing import Tuple
 
 from ismn.base import IsmnRoot
 from ismn.const import *

@@ -43,7 +44,7 @@ def _read_station_dir(
     stat_dir: Union[Path, str],
     temp_root: Path,
     custom_meta_reader: list,
-) -> (dict, list):
+) -> Tuple[dict, list]:
     """
     Parallelizable function to read metadata for files in station dir
     """

@@ -125,8 +126,7 @@ def _load_metadata_df(meta_csv_file: Union[str, Path]) -> pd.DataFrame:
         index_col=0,
         header=[0, 1],
         low_memory=False,
-        engine="c",
-    )
+        engine="c")
 
     # parse date cols as datetime
     for col in ["timerange_from", "timerange_to"]:

@@ -333,7 +333,7 @@ def from_metadata_df(cls, data_root, metadata_df, temp_root=gettempdir()):
         columns = np.array(list(metadata_df.columns))
 
         for i, row in enumerate(metadata_df.values):
-            #this_nw = row.loc['network', 'val']
+            # this_nw = row.loc['network', 'val']
             vars = np.unique(columns[:-2][:, 0])
             vals = row[:-2].reshape(-1, 3)

@@ -394,7 +394,7 @@ def from_metadata_csv(cls,
             metadata_df = _load_metadata_df(meta_csv_file)
 
         if network is not None:
-            metadata_df = metadata_df[np.isin(metadata_df['network'].values,
+            metadata_df = metadata_df[np.isin(metadata_df["network"].values,
                                               network)]
 
         metadata_df.index = range(len(metadata_df.index))
61 changes: 46 additions & 15 deletions src/ismn/filehandlers.py
@@ -22,6 +22,9 @@
 
 import os
 import pandas as pd
+import warnings
+
+warnings.simplefilter(action="ignore", category=UserWarning)
 from ismn.base import IsmnRoot
 from ismn.components import *
 from ismn import const

@@ -30,7 +33,7 @@
 
 from tempfile import gettempdir, TemporaryDirectory
 from pathlib import Path
-
+from typing import Tuple
 
 class IsmnFile(object):
     """

@@ -203,7 +206,6 @@ def __read_field(data: pd.DataFrame,
         field_vars = []
 
         if fieldname in data.index:
-
             froms = np.atleast_1d(data.loc[fieldname]["depth_from[m]"])
             tos = np.atleast_1d(data.loc[fieldname]["depth_to[m]"])
             vals = np.atleast_1d(data.loc[fieldname]["value"])

@@ -274,8 +276,8 @@ def read_metadata(self) -> MetaData:
         for key in lc_dict.keys():
             if key in lc["quantity_source_name"].values:
                 if key != "insitu":
-                    lc_dict[key] = np.int32(lc.loc[lc["quantity_source_name"] ==
-                                                   key]["value"].values[0])
+                    lc_dict[key] = np.int32(lc.loc[lc["quantity_source_name"]
+                                                   == key]["value"].values[0])
                 else:
                     lc_dict[key] = lc.loc[lc["quantity_source_name"] ==
                                           key]["value"].values[0]

@@ -377,7 +379,7 @@ def __init__(self,
         self.metadata = self.read_metadata(best_meta_for_sensor=True)
 
     @staticmethod
-    def __read_lines(filename: Path) -> (list, list, list):
+    def __read_lines(filename: Path) -> Tuple[list, list, list]:
         """
         Read fist and last line from file as list, skips empty lines.
         """

@@ -595,9 +597,16 @@ def __read_format_header_values(self) -> pd.DataFrame:
                 varname + "_orig_flag",
             ]
 
-        return self.__read_csv(names, skiprows=1)
+        return self.__read_csv(
+            names=names,
+            usecols=[0, 1, 2, 3, 4],
+            skiprows=1,
+            sep=" ",
+            low_memory=False,
+            delim_whitespace=False,
+        )
 
-    def __read_csv(self, names=None, usecols=None, skiprows=0):
+    def __read_csv(self, names=None, usecols=None, skiprows=0, **kwargs):
         """
         Read data from csv.

@@ -615,24 +624,46 @@ def __read_csv(self, names=None, usecols=None, skiprows=0):
         data : pd.DataFrame
             Time series.
         """
-        readf = lambda f: pd.read_csv(
+
+        def readf(
             f,
-            skiprows=skiprows,
-            usecols=usecols,
             names=names,
-            delim_whitespace=True,
+            usecols=usecols,
+            skiprows=skiprows,
             parse_dates=[[0, 1]],
             engine="c",
-        )
-        if self.root.zip:
+            delim_whitespace=None,
+            sep=None,
+            low_memory=None,
+        ):
+            try:
+                return pd.read_csv(
+                    filepath_or_buffer=f,
+                    skiprows=skiprows,
+                    usecols=usecols,
+                    names=names,
+                    parse_dates=parse_dates,
+                    engine=engine,
+                )
+            except pd.errors.ParserError as text_exception:
+                return pd.read_csv(
+                    filepath_or_buffer=f,
+                    skiprows=skiprows,
+                    usecols=usecols,
+                    names=names,
+                    delim_whitespace=True,
+                    parse_dates=parse_dates,
+                    engine="c",
+                )
+
+        if self.root.zip:
             with TemporaryDirectory(
                     prefix="ismn", dir=self.temp_root) as tempdir:
                 filename = self.root.extract_file(self.file_path, tempdir)
-                data = readf(filename)
+                data = readf(filename, **kwargs)
 
         else:
-            data = readf(self.root.path / self.file_path)
+            data = readf(self.root.path / self.file_path, **kwargs)
 
         data.set_index("date_time", inplace=True)
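The rationale for the new try/except in __read_csv, restated: with a fixed single-space separator, a row with irregular spacing yields a different field count and pandas raises a ParserError, while delim_whitespace=True collapses whitespace runs into one delimiter. A self-contained sketch with illustrative data:

import io
import pandas as pd

# Second row contains a double space, so sep=" " sees 6 fields instead of 5.
raw = "2021/01/01 00:00 0.30 G M\n2021/01/01 01:00  0.31 G M\n"

try:
    df = pd.read_csv(io.StringIO(raw), sep=" ", header=None)
except pd.errors.ParserError:
    # Retry, treating any run of whitespace as a single delimiter.
    df = pd.read_csv(io.StringIO(raw), delim_whitespace=True, header=None)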