# Definition: describes BagIt (archive) metadata and file search criteria in ScienceBase 

#### Tristan P. Wellman<br>Science Analytics and Synthesis (SAS)<br>U.S. Geological Survey, Denver, Colorado
#### last modified 2/17/2019

#### Functions:
1) Retrieves default archive metadata,<br>
2) Retrieves default search criteria to select data files from ScienceBase items, and<br> 
3) Customizes defaults via kwargs (optional) 

#### Output: 
Returns an ordered dictionary of BagIt metadata.

In [1]:
import collections
import requests
import datetime
import uuid

In [2]:
# Definition to describe BagIt (archive) metadata 
#
# Functions:
# 1) Specify archive metadata,
# 2) Specify search criteria to select data files from ScienceBase items, and
# 3) Customize default inputs via kwargs (optional).
#
# Output: returns an ordered dictionary of BagIt metadata


def archive_options(sbitem, **kwargs):
    r"""Build the default BagIt (archive) options for a ScienceBase item.

    The returned mapping contains two default sections:

    * ``'archive_meta'`` -- archive metadata: tag name, processing date,
      host and job identifiers, source-agency details, and the item's URL
      and title.
    * ``'search'`` -- criteria for selecting the item's data files, stored
      under the keys ``'include'`` and ``'exclude'``. The first key is
      meant to be applied first and the second key second. Each value is
      one of:

      1. ``'ignore'``: do not use,
      2. ``'all'``: select all files,
      3. a custom (text, regex) search term, e.g. ``'\.nc'`` selects files
         with a .nc extension.

      The default includes all item files except those with .nc* file
      extensions.

    Args:
        sbitem: ScienceBase item mapping. ``sbitem['link']['url']`` and
            ``sbitem['title']`` are read.
        **kwargs: Optional ``custom_meta`` mapping of section name to
            values. For a section that already exists (``'archive_meta'``
            or ``'search'``), the supplied sub-keys overwrite or extend the
            defaults. Any other section is added to the result as given.
            ``custom_meta=None`` is treated as "no customization".

    Returns:
        collections.OrderedDict: the archive options described above.

    Raises:
        KeyError: if ``sbitem`` lacks ``'link'``/``'url'`` or ``'title'``.
    """
    archive_func = collections.OrderedDict()

    # Archive metadata (defaults).
    # NOTE: the key 'Archive-Prcessing-Date' is misspelled, but it is part of
    # the returned data, so it is kept unchanged for backward compatibility.
    archive_func['archive_meta'] = collections.OrderedDict([
        ('Archive-Tag-Name', 'Archive of ScienceBase Item'),
        ('Archive-Prcessing-Date', datetime.datetime.now().isoformat()),
        ('Archive-Host-Machine', str(uuid.uuid1())),
        ('Archive-Job-Number', str(uuid.uuid4())),
        ('Source-Agency-Name', 'United States Geological Survey'),
        ('Source-Agency-Physical-Address', 'Denver Federal Center, Building 810, Lakewood, Colorado, USA'),
        ('Source-Agency-Group', 'Science Analytics and Synthesis (SAS), Core Science Systems'),
        ('Source-Agency-Contact-Name', 'John Doe'),
        ('Source-Agency-Contact-Phone', '999-999-9999'),
        ('Source-Agency-Contact-Email', 'jdoe@usgs.gov'),
        ('Source-Agency-Data-Source', sbitem['link']['url']),
        ('Source-Agency-Data-Title', sbitem['title']),
    ])

    # Search criteria (defaults): include all item files except (exclude)
    # those with .nc* file extensions. The raw string keeps the regex
    # backslash without relying on an invalid escape sequence.
    archive_func['search'] = collections.OrderedDict([
        ('include', 'all'),
        ('exclude', r'\.nc'),
    ])

    # Overwrite or add key-values via **kwargs (optional).
    custom_meta = kwargs.get('custom_meta')
    if custom_meta is None:
        return archive_func

    for key, value in custom_meta.items():
        if key in archive_func:
            # Existing section: merge sub-keys into the defaults.
            archive_func[key].update(value)
        else:
            # New section: store the caller's value as given.
            archive_func[key] = value

    return archive_func

In [3]:
# For Examples (below): request one ScienceBase item as JSON.
url = 'https://www.sciencebase.gov/catalog/item/57fe9d82e4b0824b2d14f221'
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(url, params={'format': 'json'}, timeout=30)
response.raise_for_status()
sb_item = response.json()

In [4]:
# Example: get the default BagIt (archive) metadata for the requested item.
# The bare name on the last line makes the notebook display the result.
archive_func = archive_options(sb_item)
archive_func

OrderedDict([('archive_meta',
              OrderedDict([('Archive-Tag-Name', 'Archive of ScienceBase Item'),
                           ('Archive-Prcessing-Date',
                            '2019-02-22T09:23:42.620325'),
                           ('Archive-Host-Machine',
                            '38352326-36be-11e9-9f0a-f45c898ede93'),
                           ('Archive-Job-Number',
                            'c3a235d2-439b-48dc-be11-5236a91ff21b'),
                           ('Source-Agency-Name',
                            'United States Geological Survey'),
                           ('Source-Agency-Physical-Address',
                            'Denver Federal Center, Building 810, Lakewood, Colorado, USA'),
                           ('Source-Agency-Group',
                            'Science Analytics and Synthesis (SAS), Core Science Systems'),
                           ('Source-Agency-Contact-Name', 'John Doe'),
                           ('Source-Agency-Contact-Pho

In [5]:
# Example: customize default BagIt (archive) metadata for OBIS-USA collection

# Overrides for the 'archive_meta' section; sub-keys not listed here keep
# the defaults set by archive_options.
aux_dict = {
    'archive_meta': collections.OrderedDict([
        ('Archive-Tag-Name', 'OBIS_USA Archive'),
        ('Source-Agency-Contact-Name', 'Abby Benson'),
        ('Source-Agency-Contact-Email', 'abenson@usgs.gov'),
        ('Source-Agency-Contact-Phone', '303.202.4087'),
    ]),
}

# Pass the overrides through the optional custom_meta keyword and display
# the merged result.
archive_func = archive_options(sb_item, custom_meta=aux_dict)
archive_func

OrderedDict([('archive_meta',
              OrderedDict([('Archive-Tag-Name', 'OBIS_USA Archive'),
                           ('Archive-Prcessing-Date',
                            '2019-02-22T09:23:42.653918'),
                           ('Archive-Host-Machine',
                            '383a43d0-36be-11e9-878b-f45c898ede93'),
                           ('Archive-Job-Number',
                            '74be35ee-e603-4971-97bb-70bbfa96fc83'),
                           ('Source-Agency-Name',
                            'United States Geological Survey'),
                           ('Source-Agency-Physical-Address',
                            'Denver Federal Center, Building 810, Lakewood, Colorado, USA'),
                           ('Source-Agency-Group',
                            'Science Analytics and Synthesis (SAS), Core Science Systems'),
                           ('Source-Agency-Contact-Name', 'Abby Benson'),
                           ('Source-Agency-Contact-Phone', '30

In [None]:
# Search criteria can be modified in a similar manner, by passing a 'search'
# entry in custom_meta. The criteria use simple include and exclude logic to
# search a ScienceBase item.

#   Search Parameters
#       1) 'ignore': do not use  
#       2) 'all': selects all files 
#       3) custom (text, regex) search term  e.g. '\.nc' selects files with .nc extension

#   Include all item files except (exclude) those with .nc* file extensions 
#
#    archive_func['search'] = collections.OrderedDict([('include' ,'all'), ('exclude' , '\.nc')])