In [1]:
import obis

In [2]:
# List the names exposed at the top level of the obis package.
dir(obis)

['__builtins__',
 '__cached__',
 '__content__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__revisionid__',
 '__spec__',
 '__version__',
 '_pkginfo']

In [3]:
# The _pkginfo attribute carries the module's build information,
# perhaps a way to track versions through a data pipeline.
# It is populated from a JSON file that is read automatically when the module is imported.
# Keeping the information in a JSON file also lets other utilities,
# including messaging services, reference it externally.

obis._pkginfo

{'build': '0x000000',
 'content': 'BCB_OBIS processor package identification',
 'errors': 'None',
 'revisionid': 'd2807f09-abc9-400e-819b-5ed69b97f8a2',
 'version': '0.0.2'}

In [5]:
# Description of module.
# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.

help(obis)

Help on package obis:

NAME
    obis

DESCRIPTION
    obis processor - An OBIS processor package for Python,
    A slice of the Biogeographic Information System
    
    This script downloads final source CSV files from the OBIS-USA collection, converts them to
    netCDF and stages them for consumption by ERDDAP.  It also generates the datasets.xml ERDDAP
    configuration file to serve those netCDF files as data sets.
    
    Main Features
    -------------
    Functionality is limited (work in progress) to a few major tasks:
    
      - Retrieves source csv files and metadata information from ScienceBase
      - Processes source csv file using regex to determine formatting
      - Converts specified source CSV to NetCDF with a single "index" (row) dimension
      - Regenerates source CSV from NetCDF and compares csv files
      - Uses xarray-dask dataframe or messytables conversion approach
      - Controls memory consumption using data chunking
      - Incorporates basic error ch

In [5]:
# Default example: process the *.csv files stored in the ScienceBase item
# https://www.sciencebase.gov/catalog/item/57fe93d5e4b0824b2d14cbe1
# with every other setting left at its default configuration.
#
# Progress is reported through log messages (no standard out).

from obis import processor

processor.run()


2018-05-15 17:15:01.560 -  MESSAGE - ** Processor activated **
2018-05-15 17:15:01.568 -     INFO - ** Searching ScienceBase records for data files and metdata **
2018-05-15 17:15:02.148 -    DEBUG -    1 DryTortugasReefVisualCensus2004_Event.csv
2018-05-15 17:15:02.151 -    DEBUG -     SBase date: 2017-05-16T16:05:24Z, local date: 2018-05-14T15:05:03Z
2018-05-15 17:15:02.152 -    DEBUG -     local source file is up to date
2018-05-15 17:15:02.153 -    DEBUG -    2 DryTortugasReefVisualCensus2004_measurementOrFact.csv
2018-05-15 17:15:02.155 -    DEBUG -     SBase date: 2017-05-16T16:06:37Z, local date: 2018-05-14T09:56:12Z
2018-05-15 17:15:02.157 -    DEBUG -     local source file is up to date
2018-05-15 17:15:02.158 -    DEBUG -    3 DryTortugasReefVisualCensus2004_occurence.csv
2018-05-15 17:15:02.161 -    DEBUG -     SBase date: 2017-07-25T17:18:51Z, local date: 2018-05-14T09:56:30Z
2018-05-15 17:15:02.162 -    DEBUG -     local source file is up to date
2018-05-15 17:15:02.163 - 

In [6]:
# Retrieve the processor's default input commands from within the notebook.

processor.default_inputs()

{'LLAT_specs': {'altitude': {'destinationName': 'altitude',
   'positive': 'up',
   'units': 'm'},
  'depth': {'destinationName': 'depth', 'positive': 'down', 'units': 'm'},
  'eventDate': {'destinationName': 'time'},
  'latitude': {'destinationName': 'latitude', 'units': 'degrees_north'},
  'longitude': {'destinationName': 'longitude', 'units': 'degrees_east'}},
 'absent_string': 'NA',
 'cf_comply': False,
 'chunk_elements': 5000000.0,
 'collection_id': '57fe93d5e4b0824b2d14cbe1',
 'compare_csv2csv': True,
 'convert_chars': OrderedDict([('\n', ''),
              (',n/a,', ',NA,'),
              (',none,', ',NA,'),
              (',na,', ',NA,')]),
 'convert_method': 'dataframe',
 'create_datasets_xml': True,
 'create_netcdf_files': True,
 'create_virtual_datasets': False,
 'darwin_vocab': 'DarwinCore_vocab_2018-03-13T17-03-40Z.json',
 'date_convert': 'datetime',
 'date_fmt': '%Y-%m-%dT%H:%M:%SZ',
 'dump_csv': True,
 'erddap_data_dir': './erddap_data/nc_store',
 'error_report': True,
 

In [7]:
# Example - modify default inputs.
# The default processing commands can be modified by passing a dictionary of
# keywords and values as arguments, or (not shown) from the command line
# via sys.argv or a *.yaml file.

# The default operation is to process a specific ScienceBase item in the OBIS Collection,
# i.e. https://www.sciencebase.gov/catalog/item/57fe93d5e4b0824b2d14cbe1

# This example:
#   (1) changes the ScienceBase item number,
#   (2) disables the option to create a processing report,
#   (3) changes the maximum reporting errors per data column to 100 instead of 50 (default), and
#   (4) changes the logging level to >= info level (20) to eliminate debug level reporting.

# The settings can be collected in a dictionary:
#
# alt_args = {'collection_id': '57fe9d82e4b0824b2d14f221',
#             'error_report': False, 'max_report': 100,
#             'log_level': 20}
#
# and passed as processor.run(**alt_args),
# or supplied as discrete keyword arguments:

processor.run(
    collection_id='57fe9d82e4b0824b2d14f221',
    error_report=False,
    max_report=100,
    log_level=20
)

2018-05-15 17:16:46.331 -  MESSAGE - ** Processor activated **
2018-05-15 17:16:46.338 -     INFO - ** Searching ScienceBase records for data files and metdata **
2018-05-15 17:16:46.830 -     INFO -    Sciencebase search summary: downloaded files  0
2018-05-15 17:16:46.832 -     INFO -    Sciencebase search summary: scibase_size_mb  173.972633
2018-05-15 17:16:46.832 -     INFO -    Sciencebase search summary: download_size_mb  0.0
2018-05-15 17:16:46.833 -     INFO -    Sciencebase search summary: total files  8
2018-05-15 17:16:46.835 -     INFO -    Sciencebase search summary: items to search  1
2018-05-15 17:16:46.836 -     INFO -    Sciencebase search summary: candidate files  3
2018-05-15 17:17:11.309 -     INFO - ** Writing Datasets.xml file for ERDDAP
2018-05-15 17:17:11.506 -  MESSAGE - ** Processor terminated **




In [None]:
# Same setup as above, but log to file only, without printing to the console.

alt_args = {
    'collection_id': '57fe9d82e4b0824b2d14f221',
    'error_report': False,
    'max_report': 100,
    'log_level': 20,
    'log_screen': False,
}
processor.run(**alt_args)

In [8]:
# Print the log file from previous runs.

with open('obis_processor.log', 'r') as log_file:
    for line in log_file:
        # Each line read from the file keeps its trailing newline; drop it so
        # print() does not emit a blank line after every entry.
        print(line.rstrip('\n'))

2018-05-15 17:15:01.560 obis.processor:  MESSAGE  ** Processor activated **
2018-05-15 17:15:01.568 obis.processor:  INFO     ** Searching ScienceBase records for data files and metdata **
2018-05-15 17:15:02.148 obis.processor:  DEBUG       1 DryTortugasReefVisualCensus2004_Event.csv
2018-05-15 17:15:02.151 obis.processor:  DEBUG        SBase date: 2017-05-16T16:05:24Z, local date: 2018-05-14T15:05:03Z
2018-05-15 17:15:02.152 obis.processor:  DEBUG        local source file is up to date
2018-05-15 17:15:02.153 obis.processor:  DEBUG       2 DryTortugasReefVisualCensus2004_measurementOrFact.csv
2018-05-15 17:15:02.155 obis.processor:  DEBUG        SBase date: 2017-05-16T16:06:37Z, local date: 2018-05-14T09:56:12Z
2018-05-15 17:15:02.157 obis.processor:  DEBUG        local source file is up to date
2018-05-15 17:15:02.158 obis.processor:  DEBUG       3 DryTortugasReefVisualCensus2004_occurence.csv
2018-05-15 17:15:02.161 obis.processor:  DEBUG        SBase date: 2017-07-25T17:18:51Z, lo