In [7]:
#######################################################################
##                          yaml program options                     ##
#######################################################################
    
# Default program options for a basic OBIS run - modify as desired in this section.
# Creates a *.yaml file to supply as the config for a command line run.
# To use: first activate the python environment, then run as: python mypython.py options.yaml
# T. Wellman, BCB Group, USGS

import yaml, collections, csv, logging, pkg_resources

   
def default_inputs():
    """Assemble the default program options for an OBIS processing run.

    Every option is defined as a local variable below (edit the values in
    place to customize a run) and then gathered into a single dictionary
    that the caller can serialize, e.g. to a YAML config file.

    Returns:
        dict: option name -> option value. Values are plain Python objects
        (str, bool, int, float, None, list, dict), plus one
        collections.OrderedDict under 'convert_chars'.
    """

    # ScienceBase Item ID - search for data files
    #--------------------------------------------
    collection_id = '57fe93d5e4b0824b2d14cbe1'  # '579b64c6e4b0589fa1c98118'

    #
    # Dictionary of ScienceBase search terms - use list format only, not case sensitive
    #
    file_srch = {
        'title': ['DarwinCore:event', 'DarwinCore:occurrence',
                  'DarwinCore:measurementOrFact', 'final processed source',
                  'Final Processed File'],
        'name': ['occurrence', 'event', 'measurementOrFact'],
        'ftype_req': ['.csv', '.zip'],
    }

    #--------------------------------
    # Folder directories
    #--------------------------------

    # main directory to run program (None defaults to current directory)
    workdir = None

    # folder with original (source) csv files
    source_data_dir = './source_data_test'

    # folder with netCDF files (converted from source csv)
    erddap_data_dir = './erddap_data/nc_store'

    # Relative path to the netCDF files on the destination server, for the datasets.xml file.
    server_nc_directory = './erddap_data/nc_store'

    # folder with regenerated csv files (converted from netCDF)
    recon_data_dir = erddap_data_dir + '/recreate_files'

    # folder to store error and other report files
    report_dir = erddap_data_dir + '/processing_reports'

    # folder to store temporary files when data chunking active (dataframe option)
    tempdir = erddap_data_dir + '/test/'

    # path + name (Darwin Core Standard) in json file, "None" to bypass
    darwin_vocab = pkg_resources.resource_filename(__name__, 'data/DarwinCore_vocab.json')
    vocab_name = 'Darwin Core Standard'

    #--------------------------------
    # Processing method (options)
    #--------------------------------

    # Exploratory approaches at processing tabular data
    #  "dataframe" approach to process + convert csv files (xarray/dask/pandas) Tristan Wellman, USGS
    #  "messytables" approach (csv_reader/messytables)  John Long, USGS
    convert_method = "dataframe"  # "messytables"

    #--------------------------------
    # processing flags (options)
    #--------------------------------

    # Whether to fetch metadata from ScienceBase.  If this is True and fetch_csvs
    # is false, it will only fetch metadata.
    fetch_metadata = True

    # Whether to fetch source files from ScienceBase.
    fetch_csvs = True

    # Whether to create netCDF files from source csv files
    create_netcdf_files = True

    # Whether to create a datasets.xml from the netCDF files found in the erddap_data_dir
    create_datasets_xml = True

    # flag whether to overwrite source files.  If set to False, reprocessing input file will be skipped.
    file_overwrite = False

    # flag whether to overwrite converted files.  If set to False, reprocessing input file will be skipped.
    proc_overwrite = False

    # flag whether to regenerate test csv files from netCDF, compare original (source) csv to regenerated csv
    compare_csv2csv = True

    # flag whether to delete regenerated csv file from netCDF, after testing
    dump_csv = True

    # flag whether to output comparison report file (if compare_csv2csv = True)
    error_report = True

    # flag to print comparison table summaries to screen
    table_output = False

    # maximum errors to report per data column in processing reports (one per file, if error_report active)
    max_report = 50

    #--------------------------------
    # Messy table ONLY options
    #--------------------------------

    # Whether to create a single virtual dataset from multiple netCDFs that were created from a single csv
    create_virtual_datasets = False

    # Whether to turn on verbose logging
    verbose = False

    # Size in number of rows of the sample.  Only takes effect if sample is greater than zero.
    sample_size = 0

    # Number of rows to use to guess column type
    window = 500

    # Max number of rows per netCDF file.  If greater than zero, multiple files will be created if necessary.
    # Zero Value signals to create a single netCDF file for the dataset.
    rows_per_file = 0

    #--------------------------------
    # Dataframe ONLY options
    #--------------------------------

    # force convert (date) variables:
    # 'datetime': iso datetime variable, 'string': iso string date,
    # 'integer': days since 1-1-1970 basedate, otherwise: do not convert
    date_convert = 'string'  # 'datetime'

    # date format applied when a date conversion is requested via date_convert
    date_fmt = '%Y-%m-%dT%H:%M:%SZ'  # "%Y-%m-%d"

    # file processing chunk size (number of elements, "None" deactivates chunking)
    chunk_elements = 5e6

    # attempt CF compliancy standards
    cf_comply = False

    # representation for missing string entries (if == '_FillValue', uses ' ' activates NetCDF _FillValue)
    absent_string = 'NA'

    # specify netcdf type: NETCDF4, NETCDF4_CLASSIC, NETCDF3_64BIT, or NETCDF3_CLASSIC
    netcdf_type = 'NETCDF4_CLASSIC'

    #--------------------------------
    # file comparison options
    #--------------------------------

    # file encoding format - incomplete application
    string_fmt = 'UTF-8'  # 'ISO-8859-1'

    # flag whether to allow integer-float equivalence, e.g. 1.00 = 1 (yes) 1.0001 = 1 (no)
    int_float_accept = False

    # filename modification of source file, when source csv is reconstructed from netCDF
    fname_ext = '_redo_'

    #---------------------------------------
    # Misc. specifications (in progress)
    #---------------------------------------

    # logging options (optional flag - log to screen, set log level (e.g. debug, info, warning))
    # log_level is stored as the numeric value of the logging constant
    log_screen = True
    log_level = logging.DEBUG

    # used for testing only, limit processing to # datasets, default is false (off)
    proc_limit = False

    # note: prefilter *currently* used in dataframe method, postfilter works for dataframe or messytable methods

    # flag whether to prefilter source csv file, filters by "convert_chars" + regex functions
    prefilter = False

    # flag whether to post-filter file reads during comparisons to allow differences, same technique as prefilter
    postfilter = False

    # quote interpreter, default, if None --> autoconfigured
    # stored as the numeric value of the csv module constant
    quote_style = csv.QUOTE_NONNUMERIC  # csv.QUOTE_MINIMAL

    # dict of character strings to filter from dataset (prefilter or postfilter)
    # key, value : replaced term, modified term
    # note: search keys are not case sensitive
    # Every "missing value" spelling maps to the configured absent_string so the
    # replacements stay in step when absent_string is edited above.
    # NOTE(review): PyYAML's default dumper writes an OrderedDict with a
    # python-specific tag that yaml.safe_load rejects - confirm the consumer
    # of the config file uses a compatible loader.
    absent_field = ',' + absent_string + ','
    convert_chars = collections.OrderedDict([
        ('\n', ''),
        (',n/a,', absent_field),
        (',none,', absent_field),
        (',na,', absent_field),
    ])

    # Character string quote format - used in Dataframe conversion method and file comparisons,
    # note: dataframe method re-evaluates during NetCDF processing,
    # inputs below are defaults, may be updated or overwritten internally
    q_fmt = ['"', '"{}"', "'"]
    # Raw string: the pattern contains a backslash-comma, which is an invalid
    # escape sequence in a normal string literal. The value is unchanged.
    # (The 'qoute_format' spelling is kept because it is the published option key.)
    qoute_format = r'[\,](?=(?:[^"]*"[^"]*")*[^"]*$)'

    #
    # Dictionary of ERDDAP reserved variables (L.L.A.T) specifications
    #
    LLAT_specs = {
        'longitude': {'destinationName': 'longitude', 'units': 'degrees_east'},
        'latitude': {'destinationName': 'latitude', 'units': 'degrees_north'},
        'altitude': {'destinationName': 'altitude', 'units': 'm', 'positive': 'up'},
        'depth': {'destinationName': 'depth', 'units': 'm', 'positive': 'down'},
        'eventDate': {'destinationName': 'time'},
    }

    #
    # Search terms used on variable names, data type must be numeric or string
    # (adhoc logic, customize as needed)
    #
    numeric_terms = ['_meters', 'minimum_', 'maximum_', '_minimum_', '_maximum', 'decimal', 'in_meters']
    # '_id' previously carried a stray trailing comma inside the literal ('_id,')
    string_terms = ['_id', '_code']

    #
    # Dictionary of partial variable terms to infer units (adhoc logic, customize as needed)
    # key, value --> term, unit
    variable_units = {
        'inKg': 'kg',
        'in_meters': 'm',
        'inmeters': 'm',
        'WeightsN': 'N',
    }

    # assemble commands dictionary - (arguments and options)
    # ------------------------------------------------------
    commands = {
        'collection_id': collection_id,
        'file_srch': file_srch,
        'workdir': workdir,
        'source_data_dir': source_data_dir,
        'erddap_data_dir': erddap_data_dir,
        'server_nc_directory': server_nc_directory,
        'recon_data_dir': recon_data_dir,
        'report_dir': report_dir,
        'tempdir': tempdir,
        'darwin_vocab': darwin_vocab,
        'vocab_name': vocab_name,
        'convert_method': convert_method,
        'fetch_metadata': fetch_metadata,
        'fetch_csvs': fetch_csvs,
        'create_netcdf_files': create_netcdf_files,
        'create_datasets_xml': create_datasets_xml,
        'file_overwrite': file_overwrite,
        'proc_overwrite': proc_overwrite,
        'compare_csv2csv': compare_csv2csv,
        'dump_csv': dump_csv,
        'error_report': error_report,
        'table_output': table_output,
        'max_report': max_report,
        'create_virtual_datasets': create_virtual_datasets,
        'verbose': verbose,
        'sample_size': sample_size,
        'window': window,
        'rows_per_file': rows_per_file,
        'int_float_accept': int_float_accept,
        'date_convert': date_convert,
        'numeric_terms': numeric_terms,
        'string_terms': string_terms,
        'variable_units': variable_units,
        'date_fmt': date_fmt,
        'string_fmt': string_fmt,
        'chunk_elements': chunk_elements,
        'cf_comply': cf_comply,
        'absent_string': absent_string,
        'netcdf_type': netcdf_type,
        'fname_ext': fname_ext,
        'log_screen': log_screen,
        'log_level': log_level,
        'proc_limit': proc_limit,
        'prefilter': prefilter,
        'postfilter': postfilter,
        'quote_style': quote_style,
        'convert_chars': convert_chars,
        'q_fmt': q_fmt,
        'qoute_format': qoute_format,
        'LLAT_specs': LLAT_specs,
    }

    return commands

In [8]:
#--------------------------------
# create yaml config file
#--------------------------------

# Collect the option values (edit default_inputs above to change them)
commands = default_inputs()

# Serialize the options to options.yaml in the current working directory
dump_settings = {'default_flow_style': False, 'allow_unicode': True}
with open('options.yaml', mode='w', encoding='utf8') as outfile:
    yaml.dump(commands, stream=outfile, **dump_settings)

print('yaml file created in local directory')

yaml file created in local directory
