Bug fix for __read_csv() method to now read RISMA network data of "header & files" format again (#67)

* PEP8 and typing

reformatted code according to the PEP8 style guide and added missing type hints

* changed __read_csv

use a regular function instead of a lambda function inside __read_csv

* pep8

* updated __read_csv

__read_csv had problems with reading faulty sensor files. This is now fixed by introducing **kwargs as additional arguments

* small correction and pep8

* fix: all files of format 'header values' can now be parsed correctly by pandas

kept the **kwargs in the __read_csv method and added the exception 'pd.errors.ParserError as text_exception' (see the sketch after this list)

* preparation for sphinx

started preparation for a Sphinx documentation

* deleted sphinx files

* test

* removed comments

* added short file describing the original problem and the fix, added test data

* minor revision

* minor revision

* minor revision

* minor revision

* minor revision

* minor revision

* new branch for USDA-ARS debugging

* pep8 style check now done by yapf

* small adaptations

* made the required changes for the pull request

* yapf pep8

* replaced tuple[] with typing.Tuple[]

* added line to CHANGELOG.rst

* added line to CHANGELOG.rst at the correct position this time

* Removed the now ignored directory ".vscode"

* Deleted "RISMA_test_data" directory
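The fix described in the bullets above boils down to retrying with whitespace-delimited parsing when pandas raises a ParserError. A minimal sketch of the pattern — the function name and simplified arguments are illustrative, not the actual method; see the src/ismn/filehandlers.py diff below for the real implementation, which also handles names, usecols, skiprows, and zipped archives:

import pandas as pd

def read_sensor_file(path, **kwargs):
    # First attempt: parse with the caller-supplied arguments.
    try:
        return pd.read_csv(path, parse_dates=[[0, 1]], engine="c", **kwargs)
    except pd.errors.ParserError:
        # Fallback for faulty files: treat any run of whitespace as a
        # single delimiter instead of a fixed separator.
        return pd.read_csv(
            path, delim_whitespace=True, parse_dates=[[0, 1]], engine="c")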
nfb2021 authored May 30, 2023
1 parent 08ce411 commit bcb9c37
Showing 9 changed files with 113 additions and 81 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -53,4 +53,7 @@ MANIFEST
 **/python_metadata/*
 networks/*
 .artifacts/*
-.coverage*
+.coverage*
+
+# VSC config file
+.vscode/*
2 changes: 1 addition & 1 deletion CHANGELOG.rst
@@ -5,7 +5,7 @@ Changelog
 Unreleased changes in master branch
 ===================================
 
--
+- Fixed bug with parsing sensor files for RISMA network in "header & files" format
 
 Version 1.3.3
 =============
6 changes: 2 additions & 4 deletions src/ismn/base.py
@@ -127,10 +127,8 @@ def clean_subpath(self, subpath) -> Union[Path, PurePosixPath]:
             subpath = PurePosixPath(subpath)
         else:
             if not (self.path / Path(subpath)).exists():
-                raise ValueError(
-                    f"Subpath {subpath} does not exist"
-                    f" in archive {self.path}"
-                )
+                raise ValueError(f"Subpath {subpath} does not exist"
+                                 f" in archive {self.path}")
 
         return subpath
5 changes: 3 additions & 2 deletions src/ismn/components.py
@@ -686,8 +686,9 @@ def __repr__(self, indent: str = ""):
             for net in self.networks.values()
         ])
 
-    def __getitem__(self, item: Union[int, str, list]) -> \
-            Union["NetworkCollection", Network]:
+    def __getitem__(
+            self, item: Union[int, str,
+                              list]) -> Union["NetworkCollection", Network]:
         # shortcut to access networks directly
         if isinstance(item, (int, str)):
            if isinstance(item, int):
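The reformatted __getitem__ above keeps its behavior: indexing with an int or str yields a single Network, while a list appears to yield a NetworkCollection (inferred from the Union return annotation, not stated in this diff). A hypothetical usage sketch, network names illustrative:

# assuming `coll` is an ismn NetworkCollection
net_a = coll[0]                       # int -> Network
net_b = coll["RISMA"]                 # str -> Network
subset = coll[["RISMA", "USDA-ARS"]]  # list -> NetworkCollection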
15 changes: 7 additions & 8 deletions src/ismn/const.py
@@ -108,16 +108,15 @@ class DepthError(ValueError):
     "quantity_source_name",
 ]
 
-lc_num_vars = ['lc_2000', 'lc_2005', 'lc_2010']
-lc_str_vars = ['lc_insitu']
+lc_num_vars = ["lc_2000", "lc_2005", "lc_2010"]
+lc_str_vars = ["lc_insitu"]
 LC_VARS = [*lc_num_vars, *lc_str_vars]
-CLIM_VARS = ['climate_KG', 'climate_insitu']
+CLIM_VARS = ["climate_KG", "climate_insitu"]
 
-CSV_META_TEMPLATE_SURF_VAR = OrderedDict(
-    [(c, np.nan) for c in lc_num_vars] +
-    [(c, 'unknown') for c in lc_str_vars] +
-    [(c, 'unknown') for c in CLIM_VARS]
-)
+CSV_META_TEMPLATE_SURF_VAR = OrderedDict([(c, np.nan) for c in lc_num_vars] +
+                                         [(c, "unknown")
+                                          for c in lc_str_vars] +
+                                         [(c, "unknown") for c in CLIM_VARS])
 CSV_META_TEMPLATE_GROUND_VAR = OrderedDict([
     ("saturation", np.nan),
     ("clay_fraction", np.nan),
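The reformatted OrderedDict above is content-identical to the old one; for reference, it evaluates to the following (a standalone, runnable sketch built only from the lines in this hunk):

from collections import OrderedDict
import numpy as np

lc_num_vars = ["lc_2000", "lc_2005", "lc_2010"]
lc_str_vars = ["lc_insitu"]
CLIM_VARS = ["climate_KG", "climate_insitu"]

CSV_META_TEMPLATE_SURF_VAR = OrderedDict([(c, np.nan) for c in lc_num_vars] +
                                         [(c, "unknown") for c in lc_str_vars] +
                                         [(c, "unknown") for c in CLIM_VARS])
# OrderedDict([('lc_2000', nan), ('lc_2005', nan), ('lc_2010', nan),
#              ('lc_insitu', 'unknown'), ('climate_KG', 'unknown'),
#              ('climate_insitu', 'unknown')])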
75 changes: 38 additions & 37 deletions src/ismn/custom.py
@@ -82,21 +82,21 @@ class CustomStationMetadataCsv(CustomMetaReader):
 
     def __init__(self, station_meta_csv, fill_values=None, **kwargs):
         """
-        Parameters
-        ----------
-        station_meta_csv: str
-            Path to the csv file with the above described content
-        fill_values: dict, optional (default: None)
-            Values to use for a certain custom metadata variable, if no
-            match is found.
-        kwargs:
-            Additional kwargs as passed to :func:`pandas.read_csv`
-            To use a different separator than the default semicolon, use `sep`
+        Parameters
+        ----------
+        station_meta_csv: str
+            Path to the csv file with the above described content
+        fill_values: dict, optional (default: None)
+            Values to use for a certain custom metadata variable, if no
+            match is found.
+        kwargs:
+            Additional kwargs as passed to :func:`pandas.read_csv`
+            To use a different separator than the default semicolon, use `sep`
         """
-        if 'sep' in kwargs:
-            sep = kwargs.pop('sep')
+        if "sep" in kwargs:
+            sep = kwargs.pop("sep")
         else:
-            sep = ';'
+            sep = ";"
 
         self.fill_values = dict() if fill_values is None else fill_values
         self.df = pd.read_csv(station_meta_csv, sep=sep, **kwargs)

@@ -108,9 +108,8 @@ def _empty_var(self, varnames) -> list:
         """
         vars = []
         for var in varnames:
-            if var in self.fill_values.keys() and \
-                    not (var.endswith('_depth_from') or
-                         var.endswith('_depth_to')):
+            if var in self.fill_values.keys() and not (
+                    var.endswith("_depth_from") or var.endswith("_depth_to")):
                 vars.append(MetaVar(var, self.fill_values[var]))
         return vars
 
@@ -122,15 +121,15 @@ def _row2var(row: dict) -> list:
         vars = []
 
         for k, v in row.items():
-            if k.endswith('_depth_from') or k.endswith('_depth_to'):
+            if k.endswith("_depth_from") or k.endswith("_depth_to"):
                 continue
 
-            if f'{k}_depth_from' in row:
-                depth_from = row[f'{k}_depth_from']
+            if f"{k}_depth_from" in row:
+                depth_from = row[f"{k}_depth_from"]
             else:
                 depth_from = None
-            if f'{k}_depth_to' in row:
-                depth_to = row[f'{k}_depth_to']
+            if f"{k}_depth_to" in row:
+                depth_to = row[f"{k}_depth_to"]
             else:
                 depth_to = None

@@ -166,20 +165,20 @@ def read_metadata(self, meta: MetaData):
         """
 
-        cond = (self.df['network'] == meta['network'].val) & \
-               (self.df['station'] == meta['station'].val)
+        cond = (self.df["network"] == meta["network"].val) & (
+            self.df["station"] == meta["station"].val)
 
-        df = self.df[cond].set_index(['network', 'station'])
+        df = self.df[cond].set_index(["network", "station"])
 
         # drop potential duplicates, keep first
-        df = df[~df.index.duplicated(keep='first')]
+        df = df[~df.index.duplicated(keep="first")]
 
         vars = []
 
         if df.empty and (self.fill_values is not None):
             vars += self._empty_var(df.columns.values)
         else:
-            for row in df.to_dict('records'):
+            for row in df.to_dict("records"):
                 vars += self._row2var(row)
 
         return MetaData(vars)

@@ -215,25 +214,27 @@ def read_metadata(self, meta: MetaData):
         meta: Metadata
             Additional depth-dependent metadata at the location
         """
-        cond = (self.df['network'] == meta['network'].val) & \
-               (self.df['station'] == meta['station'].val) & \
-               (self.df['instrument'] == meta['instrument'].val) & \
-               (self.df['variable'] == meta['variable'].val) & \
-               (self.df['depth_from'] == meta['instrument'].depth[0]) & \
-               (self.df['depth_to'] == meta['instrument'].depth[1])
-
-        df = self.df[cond].set_index(
-            ['network', 'station', 'instrument', 'variable', 'depth_from', 'depth_to'])
+        cond = ((self.df["network"] == meta["network"].val)
+                & (self.df["station"] == meta["station"].val)
+                & (self.df["instrument"] == meta["instrument"].val)
+                & (self.df["variable"] == meta["variable"].val)
+                & (self.df["depth_from"] == meta["instrument"].depth[0])
+                & (self.df["depth_to"] == meta["instrument"].depth[1]))
+
+        df = self.df[cond].set_index([
+            "network", "station", "instrument", "variable", "depth_from",
+            "depth_to"
+        ])
 
         # drop potential duplicates, keep first
-        df = df[~df.index.duplicated(keep='first')]
+        df = df[~df.index.duplicated(keep="first")]
 
         vars = []
 
         if df.empty and (self.fill_values is not None):
             vars += self._empty_var(df.columns.values)
         else:
-            for row in df.to_dict('records'):
+            for row in df.to_dict("records"):
                 vars += self._row2var(row)
 
         return MetaData(vars)
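As the docstring above notes, extra kwargs are forwarded to pandas.read_csv, with `sep` overriding the default semicolon. A hypothetical usage sketch — the file name, variable name, and fill value are illustrative:

from ismn.custom import CustomStationMetadataCsv

meta_reader = CustomStationMetadataCsv(
    "station_meta.csv",            # csv with network;station;... columns
    fill_values={"myvar": -9999},  # used when no station match is found
    sep=",",                       # forwarded to pandas.read_csv; default ";"
)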
10 changes: 5 additions & 5 deletions src/ismn/filecollection.py
@@ -31,6 +31,7 @@
 from multiprocessing import Pool, cpu_count
 from operator import itemgetter
 import time
+from typing import Tuple
 
 from ismn.base import IsmnRoot
 from ismn.const import *

@@ -43,7 +44,7 @@ def _read_station_dir(
     stat_dir: Union[Path, str],
     temp_root: Path,
     custom_meta_reader: list,
-) -> (dict, list):
+) -> Tuple[dict, list]:
     """
     Parallelizable function to read metadata for files in station dir
     """

@@ -125,8 +126,7 @@ def _load_metadata_df(meta_csv_file: Union[str, Path]) -> pd.DataFrame:
         index_col=0,
         header=[0, 1],
         low_memory=False,
-        engine="c",
-    )
+        engine="c")
 
     # parse date cols as datetime
     for col in ["timerange_from", "timerange_to"]:

@@ -333,7 +333,7 @@ def from_metadata_df(cls, data_root, metadata_df, temp_root=gettempdir()):
         columns = np.array(list(metadata_df.columns))
 
         for i, row in enumerate(metadata_df.values):
-            #this_nw = row.loc['network', 'val']
+            # this_nw = row.loc['network', 'val']
             vars = np.unique(columns[:-2][:, 0])
             vals = row[:-2].reshape(-1, 3)

@@ -394,7 +394,7 @@ def from_metadata_csv(cls,
             metadata_df = _load_metadata_df(meta_csv_file)
 
         if network is not None:
-            metadata_df = metadata_df[np.isin(metadata_df['network'].values,
+            metadata_df = metadata_df[np.isin(metadata_df["network"].values,
                                               network)]
 
         metadata_df.index = range(len(metadata_df.index))
61 changes: 46 additions & 15 deletions src/ismn/filehandlers.py
@@ -22,6 +22,9 @@
 
 import os
 import pandas as pd
+import warnings
+
+warnings.simplefilter(action="ignore", category=UserWarning)
 from ismn.base import IsmnRoot
 from ismn.components import *
 from ismn import const

@@ -30,7 +33,7 @@
 
 from tempfile import gettempdir, TemporaryDirectory
 from pathlib import Path
-
+from typing import Tuple
 
 class IsmnFile(object):
     """

@@ -203,7 +206,6 @@ def __read_field(data: pd.DataFrame,
         field_vars = []
 
         if fieldname in data.index:
-
             froms = np.atleast_1d(data.loc[fieldname]["depth_from[m]"])
             tos = np.atleast_1d(data.loc[fieldname]["depth_to[m]"])
             vals = np.atleast_1d(data.loc[fieldname]["value"])

@@ -274,8 +276,8 @@ def read_metadata(self) -> MetaData:
         for key in lc_dict.keys():
             if key in lc["quantity_source_name"].values:
                 if key != "insitu":
-                    lc_dict[key] = np.int32(lc.loc[lc["quantity_source_name"] ==
-                                                   key]["value"].values[0])
+                    lc_dict[key] = np.int32(lc.loc[lc["quantity_source_name"]
+                                                   == key]["value"].values[0])
                 else:
                     lc_dict[key] = lc.loc[lc["quantity_source_name"] ==
                                           key]["value"].values[0]

@@ -377,7 +379,7 @@ def __init__(self,
         self.metadata = self.read_metadata(best_meta_for_sensor=True)
 
     @staticmethod
-    def __read_lines(filename: Path) -> (list, list, list):
+    def __read_lines(filename: Path) -> Tuple[list, list, list]:
         """
         Read fist and last line from file as list, skips empty lines.
         """

@@ -595,9 +597,16 @@ def __read_format_header_values(self) -> pd.DataFrame:
                 varname + "_orig_flag",
             ]
 
-        return self.__read_csv(names, skiprows=1)
+        return self.__read_csv(
+            names=names,
+            usecols=[0, 1, 2, 3, 4],
+            skiprows=1,
+            sep=" ",
+            low_memory=False,
+            delim_whitespace=False,
+        )
 
-    def __read_csv(self, names=None, usecols=None, skiprows=0):
+    def __read_csv(self, names=None, usecols=None, skiprows=0, **kwargs):
         """
         Read data from csv.

@@ -615,24 +624,46 @@ def __read_csv(self, names=None, usecols=None, skiprows=0):
         data : pd.DataFrame
             Time series.
         """
-        readf = lambda f: pd.read_csv(
+
+        def readf(
             f,
-            skiprows=skiprows,
-            usecols=usecols,
             names=names,
-            delim_whitespace=True,
+            usecols=usecols,
+            skiprows=skiprows,
             parse_dates=[[0, 1]],
             engine="c",
-        )
-        if self.root.zip:
+            delim_whitespace=None,
+            sep=None,
+            low_memory=None,
+        ):
+            try:
+                return pd.read_csv(
+                    filepath_or_buffer=f,
+                    skiprows=skiprows,
+                    usecols=usecols,
+                    names=names,
+                    parse_dates=parse_dates,
+                    engine=engine,
+                )
+            except pd.errors.ParserError as text_exception:
+                return pd.read_csv(
+                    filepath_or_buffer=f,
+                    skiprows=skiprows,
+                    usecols=usecols,
+                    names=names,
+                    delim_whitespace=True,
+                    parse_dates=parse_dates,
+                    engine="c",
+                )
+
+        if self.root.zip:
             with TemporaryDirectory(
                     prefix="ismn", dir=self.temp_root) as tempdir:
                 filename = self.root.extract_file(self.file_path, tempdir)
-                data = readf(filename)
+                data = readf(filename, **kwargs)
 
         else:
-            data = readf(self.root.path / self.file_path)
+            data = readf(self.root.path / self.file_path, **kwargs)
 
         data.set_index("date_time", inplace=True)
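The rationale for the new try/except in __read_csv, restated: with a fixed single-space separator, a row with irregular spacing yields a different field count and pandas raises a ParserError, while delim_whitespace=True collapses whitespace runs into one delimiter. A self-contained sketch with illustrative data:

import io
import pandas as pd

# Second row contains a double space, so sep=" " sees 6 fields instead of 5.
raw = "2021/01/01 00:00 0.30 G M\n2021/01/01 01:00  0.31 G M\n"

try:
    df = pd.read_csv(io.StringIO(raw), sep=" ", header=None)
except pd.errors.ParserError:
    # Retry, treating any run of whitespace as a single delimiter.
    df = pd.read_csv(io.StringIO(raw), delim_whitespace=True, header=None)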