In [None]:
## ERDDAP-ScienceBase Processing Tool (exploration phase)
##### A simple prototype


##### Author:
Tristan Wellman
<br>twellman@usgs.gov
<br>Biogeographic Characterization Branch
<br>Core Science Analytics, Synthesis, and Libraries
<br>U.S. Geological Survey, Denver, Colorado


##### Description:  
This python code performs initial steps in searching, 
<br>processing, and quality controlling datasets retrieved 
<br>from an ERDDAP data server, which are populated in 
<br>ScienceBase as persistent web items.

The basic idea here is to interact with ERDDAP in a simple way.
<br>The code automates some of the tedium in retrieving data, error 
<br>checking, setting up and updating a ScienceBase item for 
<br>each dataset examined, and pushes the data to ScienceBase.

##### Functions:
    (a) search ERDDAP by category, protocol, keyword       
    (b) request / process data using 'advanced search url'
    (c) examine / QC metadata and other information  
    (d) create / modify ScienceBase records  
    (e) Search for duplicate records by title, filenames, and dataset ID  

##### Code Status:
    Beta 0.3 1/30/2017 
    A work in progress, testing and refining
    

## Operational Information 

#####      Input Files:
    Two local files are required for operation in the working directory:
    
    1) A JSON dictionary file of operation inputs ("name_file_dict.json"). 
       The included script ("generate_label_dictionary.ipnb") can accommodate
       some edits and will generate this file in the working directory 
       ("tempdir" in "name_file_dict.json"). 

    2) A status file showing processing dates, contacts, processing uuid,  
       ERDDAP advanced search url, filenames, and upload and download status.
       This file can be re-used if the 'url_flag' option in (1) is True. 
       Otherwise, a new status file will be created.

#####   Operation Flags:
     Flags are stored in the JSON dictionary file ("name_file_dict.json"):

     1) Whether to use an existing status file
        ['url_flag'] = True or False 
     2) Whether to overwrite or skip data file processing (proc):
        ['purge_proc'] = True or False or 'skip'
     3) Whether to overwrite or skip ScienceBase file processing (sb):
        ['purge_sb'] = True or False or 'skip'

#####   To Run:
    a) Set status filename, main flags, and other information for the JSON dictionary, 
       run dictionary script to create file (i.e. run "generate_label_dictionary.ipnb"). 
       Or copy an existing file into the working directory and adjust inputs manually.
    b) Prescribe the temporary working directory (tempdir), immediately below. 
    c) Save code/files as appropriate.
    d) Run this code (all cells).
    e) Note: users (['login_name'] in JSON dictionary file) will be required to login to 
       ScienceBase when items are being created or modified.
    
#####   Output:
    a) Creates/updates a status file in the working directory (see explanation above).
    b) Downloads data and metadata files to a folder, each named by dataset ID.
    c) Creates a misc. information file (info_dict.json) for each dataset.
    d) Creates/updates ScienceBase records.
    e) Creates date-time stamped input files in folder "Prov_files". 


In [29]:
# Temporary working directory for this run.
# - chk_files() reads this name as a module-level global and compares it with
#   the 'local_directory' entry of an existing status file (used when url_flag = True).
# - Other code builds file paths by plain string concatenation with this value,
#   so keep the trailing "/".
# - The commented-out assignments below are alternative project directories.

# tempdir = "/Users/twellman/Documents/BCB_data_projects/OBIS_usa_database/erddap_MBON_test2/"
# tempdir = "/Users/twellman/Documents/BCB_data_projects/OBIS_usa_database/PacIOOS/"
tempdir = "/Users/twellman/Documents/BCB_data_projects/OBIS_usa_database/NOAA/"
tempdir

'/Users/twellman/Documents/BCB_data_projects/OBIS_usa_database/NOAA/'

In [5]:
# python modules to load

import numpy as np
import time
import pprint
import pysb
import uuid
import re
import requests
import pandas as pd
from dateutil import parser
import datetime
import sys
import json
from os import path, makedirs, remove
from shutil import copyfileobj
from urllib import urlencode, quote 
from collections import OrderedDict
from IPython.core.display import display
from timeit import default_timer as timer
from lxml import etree, objectify

print('modules loaded')

modules loaded


In [60]:
# *******************************************************
#    Definitions:  status = draft, work in progress 
# *******************************************************

# write xml to python dictionary using recursion
def xml_to_dict(xml_str):
    """Parse an XML string with lxml.objectify and return it as nested dictionaries.

    Parameters:
        xml_str: XML document text handed to ``objectify.fromstring``.

    Returns:
        A one-entry dict ``{root_tag: converted_root}``. Each node whose
        ``__dict__`` is non-empty is replaced by that dict (converted
        recursively); a node with an empty ``__dict__`` is returned unchanged.
    """
    def _convert(node):
        children = node.__dict__
        if children:
            # convert every child in place, then hand back the child mapping
            for name in list(children):
                children[name] = _convert(children[name])
            return children
        # nothing below this node: keep the objectify element itself
        return node

    root = objectify.fromstring(xml_str)
    return {root.tag: _convert(root)}

# Adapt to Python v2+ ascii conversion issues (apparently fixed in python 3) 
def removeNonAscii(df_ascii):
    """Replace every character outside printable ASCII with a single space.

    Printable ASCII is the inclusive code range 32 (space) to 126 ('~').
    The previous strict bounds (32 < code < 126) also blanked out '~'.

    Parameters:
        df_ascii: object whose ``apply`` calls a function once per text value
            (assumed to be a pandas Series of strings - TODO confirm with caller).

    Returns:
        The result of ``df_ascii.apply``: same layout, with control characters
        and non-ASCII characters replaced by " ".
    """
    def _scrub(text):
        return ''.join(ch if 32 <= ord(ch) <= 126 else " " for ch in text)

    return df_ascii.apply(_scrub)

# Basic stats on requested ERDDAP data table (super simple for now)
def get_stats(df_o):
    """Print summary counts for an ERDDAP dataset listing.

    Reads the "Institution", "tabledap" and "griddap" columns of ``df_o`` and
    prints the dataset / table / grid totals followed by a per-agency table.
    Rows whose institution is "???" or blank are grouped under "Unknown".

    Fixes relative to the earlier version:
      * ``("???" or "")`` evaluated to just "???", so blank institutions were
        listed as an agency of their own instead of counting as unknown;
      * column widths are now taken from the longest entry (``max()`` of the
        names is the lexicographically last one, not the longest) and from the
        matching header for each column.

    Parameters:
        df_o: pandas DataFrame holding the query result table.

    Returns:
        None; the report is written with ``print``.
    """
    institutions = df_o["Institution"]
    known = institutions[~institutions.isin(["???", ""])]
    agencies, counts = np.unique(np.asarray(known, dtype=object), return_counts=True)

    # everything not attributed to a named institution is reported as "Unknown"
    agencies = np.append(agencies, "Unknown")
    counts = np.append(counts, df_o.shape[0] - np.sum(counts))

    stats = (df_o["griddap"].count(),
             np.count_nonzero(df_o["tabledap"] != ""),
             np.count_nonzero(df_o["griddap"] != ""),
             len(agencies) - 1)  # "Unknown" is not an agency group
    print("\n***** Query Results *****\n\nTotal number of datasets: %d\n"
          "Number of tables: %d\nNumber of grids: %d        \n"
          "Number of agency groups providing data: %d\n" % stats)

    header = ["** Agency Groups ** ", "** Dataset count **"]
    name_width = max(max(len(str(name)) for name in agencies), len(header[0]))
    count_width = max(max(len(str(count)) for count in counts), len(header[1]))
    row_format = '\t{0:<%d}\t{1:<%d}' % (name_width, count_width)
    print(row_format.format(header[0], header[1]))
    for name, count in zip(agencies, counts):
        print(row_format.format(name, int(count)))
    return

# Show table
def viewtable(df_view, cw, mr):
    """Display the first ``mr`` rows of a table in the notebook.

    Side effects: sets the pandas option 'display.max_colwidth' to ``cw`` and
    renames the column index of ``df_view`` to "Data Index".

    The rows are now selected by position (``.iloc``) rather than with the
    deprecated ``.ix`` indexer, so the limit also holds when the frame's index
    does not start at 0. The unused ``describe()`` call was dropped.

    Parameters:
        df_view: pandas DataFrame to show.
        cw: maximum column width passed to pandas.
        mr: maximum number of rows to show.

    Returns:
        None.
    """
    pd.set_option('display.max_colwidth', cw)
    sys.stdout.write("\nDisplay limited to a maximmum of %d datasets:\n" % mr)
    df_view.columns.name = "Data Index"
    display(df_view.iloc[:max(mr, 0)])  # clamp so a negative limit shows nothing
    return

# bulk data download options: all data, all tables, or all grids  - refine later **
def bulkoption(df):
    """Build the lookup table of bulk-request choices.

    For each option letter the matching rows of ``df`` (column "Resource")
    are selected and their "URL" values concatenated into one string.

    Parameters:
        df: pandas DataFrame with "Resource" and "URL" columns.

    Returns:
        dict mapping "D", "T", "G" and "C" to a tuple
        ``(resource name, task description, url)``; the url is an empty
        string when no row matches the resource.
    """
    choices = (
        ("D", "info", "all datasets"),
        ("T", "tabledap", "all tables"),
        ("G", "griddap", "all grids"),
        ("C", "categorize", "by category"),
    )
    lookup = {}
    for letter, resource, task in choices:
        matching_urls = df.loc[df['Resource'] == resource]['URL']
        lookup[letter] = (resource, task, ''.join(matching_urls))
    return lookup

# Generate url based on category and/or word/phrase search, 
# Note: doesn't include value constraints - a bit clunky, refine later **
def build_url(srch_word, baseurl, protocol, cat_table, cindx, subcategory):
    """Assemble the ERDDAP 'advanced search' URLs for a query.

    Changes relative to the earlier version:
      * the query parameters are kept in one OrderedDict, so their order in
        the URL is deterministic (the range constraints used to be merged in
        from a plain dict);
      * the category name is read with ``.loc`` instead of the removed
        ``DataFrame.get_value``;
      * the .json URL is built directly instead of replacing every ".html"
        in the finished string, which also rewrote ".html" inside the search
        term or base URL;
      * the ``quote`` call whose result was discarded is gone.

    Parameters:
        srch_word: keyword(s) or phrase for the "searchFor" parameter.
        baseurl: ERDDAP base URL; "/search/advanced..." is appended to it.
        protocol: one of "A", "G", "W", "T" (KeyError for anything else).
        cat_table: DataFrame whose "Categorize" column lists category names;
            every category is sent as "(any)" unless overridden.
        cindx: index label in ``cat_table`` of the category to constrain;
            only read when ``subcategory`` is not empty.
        subcategory: value for the selected category, or "" for none.

    Returns:
        tuple ``(html_url, json_url)`` sharing the same query string. The
        value-range constraints (lat/lon/time) are always sent empty.
    """
    protocol_names = {"A": "(any)", "G": "griddap",
                      "W": "WMS", "T": "tabledap"}

    params = OrderedDict()
    params["searchFor"] = srch_word
    params["protocol"] = protocol_names[protocol]
    for category in cat_table.Categorize:
        params[category] = "(any)"
    if subcategory != "":
        params[cat_table.loc[cindx, "Categorize"]] = subcategory
    # value constraints are not supported yet; ERDDAP receives them blank
    for bound in ("maxLat", "minLon", "maxLon", "minLat", "minTime", "maxTime"):
        params[bound] = ""

    query = urlencode(params)
    gen_url_html = "{}{}{}".format(baseurl, "/search/advanced.html?", query)
    gen_url_json = "{}{}{}".format(baseurl, "/search/advanced.json?", query)
    return gen_url_html, gen_url_json

# Attempt url request, report error if encountered 
def url_request(url, rtype):
    """Send a GET request and return ``(result, 'passed')``.

    Parameters:
        url: address to request.
        rtype: 'json' to return the decoded JSON body; any other value
            returns the streamed ``requests`` response object.

    Returns:
        tuple ``(result, err)`` where ``err`` is 'passed'.

    Raises:
        The original exception (ValueError for an undecodable JSON body, or a
        ``requests`` exception) after a diagnostic has been written to stdout.

    Why failures are re-raised: the earlier except blocks printed
    ``r.status_code`` while ``r`` was still unassigned, so every failure left
    this function as an UnboundLocalError and 'failed' was never returned.
    Callers such as download_file() catch the exception to retry, so that
    behaviour is kept - but the real exception now propagates and the HTTP
    status is reported whenever a response was received.

    HTTP error statuses are not raised here (``raise_for_status`` is not
    called); callers inspect ``status_code`` themselves.
    """
    response = None

    def _report(flag_message, error):
        # the status code only exists when the server actually answered
        status = response.status_code if response is not None else "no response"
        sys.stdout.write(flag_message)
        sys.stdout.write("Error: {}\n{}".format(error, status))

    try:
        if rtype == 'json':
            response = requests.get(url)
            r = response.json()
        else:
            response = requests.get(url, stream=True)
            r = response
    except ValueError as e:
        _report("\n{}{}\n\n{}".format('** Flag ** : ', e,
                                       "Search criteria could be too restrictive"), e)
        raise
    except requests.exceptions.Timeout as e:
        _report("\n{}".format("\n** Flag ** : system timeout - retry later"), e)
        raise
    except requests.exceptions.HTTPError as e:
        _report("\n** Flag ** : http request error", e)
        raise
    except requests.exceptions.RequestException as e:
        _report("\n** Flag ** : systematic exception error\nSearch url may need revision", e)
        raise
    return r, 'passed'

# evaluate whether file directory exists and read/create status file (status)
def chk_files(directory, nf_dict, task):
    """Prepare the working directory or the status file, depending on ``task``.

    task == 'initialize':
        Load the status file ``directory + nf_dict['init']['sb_json']`` when it
        exists and ``url_flag`` is not False; otherwise (or when the loaded
        file holds no saved search URL) build a fresh status record from
        ``nf_dict['general information']`` and write it to that file.
        ``nf_dict['init']['url_flag']`` is overwritten with True/False to say
        whether a saved search URL is available. Returns the status list.

    any other task:
        Create ``directory`` if it does not exist and return None.

    Changes relative to the earlier version:
      * the local directory is taken from the ``directory`` argument instead
        of the notebook-level global ``tempdir``, so the function no longer
        depends on hidden state (NOTE(review): callers are expected to pass
        the working directory here - confirm at the call site);
      * a failed directory creation is reported instead of silently ignored.

    Parameters:
        directory: working directory (used as a string prefix, so it should
            end with a path separator for the 'initialize' task).
        nf_dict: operation-input dictionary; only read for 'initialize'.
        task: 'initialize' or anything else (directory check).
    """
    if task != 'initialize':
        try:
            if not path.exists(directory):
                makedirs(directory)
        except OSError as e:
            # best effort, as before - but leave a trace of what went wrong
            sys.stdout.write("\nWarning - could not create directory %s: %s" % (directory, e))
        return

    file_sb = "{}{}".format(directory, nf_dict['init']['sb_json'])
    url_chk = False
    if path.isfile(file_sb) and nf_dict['init']['url_flag'] != False:
        with open(file_sb) as json_data:
            status = json.load(json_data)
        # older status files carry 10 entries and lack the processing uuid
        if len(status[0]['general information']) == 10:
            status[0]['general information'].append({u'processing uuid': str(uuid.uuid1())})
        if 'json' in status[0]['general information'][5]['data search json']:
            url_chk = True
        else:
            sys.stdout.write("\nNote: adv. search url was not found in sb_json (download status) file")
        if status[1]['local_directory'] != directory:
            sys.stdout.write("{}{}".format('\nWarning - local directory does not match entry in status file',
                'overwriting status file with current local directory'))
            status[1]['local_directory'] = directory
    else:
        sys.stdout.write("\nNote: a new sb_json (status file) is being created")

    # generate new status file, if warranted
    if not url_chk:
        date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        status = [{'general information': "none"}, {'local_directory': directory}, {'datasets (id)': {}}]
        status[0]['general information'] = nf_dict[u'general information']
        status[0]['general information'][3]['data base url'] = nf_dict['init']['baseurl']
        status[0]['general information'][1]['file created (date-time)'] = date
        status[0]['general information'][10]['processing uuid'] = str(uuid.uuid1())
        with open(file_sb, 'w') as fp:
            json.dump(status, fp, indent=4)

    nf_dict['init']['url_flag'] = url_chk
    return status
        
#  Search, request, and ultimately retrieve data information
def retrieve_data(nf_dict, SB_status):
    """Return the table of datasets to process, as a pandas DataFrame.

    When ``nf_dict['init']['url_flag']`` is True the search URL saved in the
    status record (entry 5, 'data search json') is requested again; the
    program exits if that request reports 'failed'.

    Otherwise the ERDDAP index page is requested, its navigation links are
    printed, and the user is taken through search_url(); the resulting html
    and json search URLs are stored in entries 4 and 5 of the status record's
    'general information' list.

    Parameters:
        nf_dict: operation-input dictionary (reads the 'init' section).
        SB_status: status list; read and, in interactive mode, updated.

    Returns:
        DataFrame of the requested datasets.
    """
    settings = nf_dict['init']

    # ---- automated mode: replay the search stored in the status file ----
    if settings['url_flag'] == True:
        sys.stdout.write("\n{}{}{}\n\n".format('\n*** Automated search using information file: ',
                         settings['sb_json'], '  ***'))
        saved_search = SB_status[0]['general information'][5]['data search json']
        response, err = url_request(saved_search, 'json')
        if err == 'failed':
            sys.stdout.write("Identified problem using previous search url (*.json)\nExiting program")
            sys.exit()
        return pd.DataFrame(response['table']['rows'], columns=response['table']['columnNames'])

    # ---- interactive mode: start from the server's index page ----
    url_o = "".join((settings['baseurl'], '/index', settings['infotype']))
    home_url = url_o.rsplit(".", 1)[0] + settings['linktype']
    sys.stdout.write("\nHome URL: %s \n\n" % home_url)
    response, err = url_request(url_o, 'json')

    if settings['dataform'] != 'dframe':
        # full json method is not developed; report and stop
        sys.stdout.write("{}{}".format('Invalid entry, defaulting to .json format', '\n'))
        pprint.pprint(response.keys())
        pp = pprint.PrettyPrinter(indent=2)
        sys.exit()

    pd.set_option('display.notebook_repr_html', True)
    pd.set_option('display.max_colwidth', -1)
    pd.set_option('display.max_rows', 500)
    colwidth = 100  # maximum column width in final request table (not used yet)
    maxrows = 200   # maximum datasets to show in final request table (not used yet)

    # main navigation links of the ERDDAP server, shown with browsable links
    table = response['table']
    df = pd.DataFrame(table['rows'], columns=table['columnNames'])
    df['URL'] = df['URL'].str.replace('.json', settings['linktype'])
    sys.stdout.write(df.to_string(index=False, justify='left') + '\n')

    # link listing every dataset together with its metadata
    info_link = ''.join(df.loc[df['Resource'] == 'info']['URL'])
    sys.stdout.write("\nLink to all datasets with metadata: %s\n" % info_link)

    # link to the searchable table of all datasets
    all_tables_link = "".join((settings['baseurl'], "/tabledap/allDatasets", settings['linktype']))
    sys.stdout.write("\nLink to all tabes, includes searchable information: %s\n\n" % all_tables_link)

    # switch the links back to .json for the requests that follow
    pd.set_option('display.max_colwidth', 250)
    df['URL'] = df['URL'].str.replace(settings['linktype'], '.json')

    # option letter -> (resource, task, url)
    bulk_opts_dict = bulkoption(df)

    # develop request table and remember the search that produced it
    df_request, gen_url_html, gen_url_json = search_url(df, bulk_opts_dict, settings['baseurl'])
    SB_status[0]['general information'][4]['data search url'] = gen_url_html
    SB_status[0]['general information'][5]['data search json'] = gen_url_json
    return df_request
    

# Download files - data chunking option, uses content disposition, tries <=5 times then bails   
def download_file(url, fpath, altname, chunk):
    """Download ``url`` into directory ``fpath`` and return the local file path.

    The request is attempted up to 5 times. The file name comes from the
    response's content-disposition header when it provides one, otherwise
    ``altname`` is used.

    Changes relative to the earlier version:
      * when every attempt fails the string "None" is returned (it used to be
        the value None), matching the non-200 branch and the "None" marker
        that data_proc() tests for;
      * an attempt also counts as failed when url_request reports 'failed'
        or yields no response;
      * the header-supplied file name is stripped of quotes and reduced to
        its base name, and ``altname`` is used if nothing usable remains;
      * the response is closed even if writing the file raises;
      * the chunk loop no longer reuses the name of the ``chunk`` parameter,
        and the unreachable trailing statement (which referenced an undefined
        name) was removed.

    Parameters:
        url: address of the file.
        fpath: existing directory to write into.
        altname: fallback file name.
        chunk: "OFF" copies the raw stream in one go; any other value writes
            the body in 1024-byte pieces (that path is marked untested).

    Returns:
        The written file's path, or "None" if the download did not succeed.
    """
    response = None
    for attempt in range(1, 6):
        try:
            response, err = url_request(url, 'file')
        except Exception:
            response = None
            sys.stdout.write("\n\tRequest error - url: %s , on attempt: %d" % (url, attempt))
            continue
        if err == 'failed' or response is None:
            response = None
            sys.stdout.write("\n\tRequest error - url: %s , on attempt: %d" % (url, attempt))
            continue
        break

    if response is None:
        sys.stdout.write("\n\tRequest failed")
        return "None"

    if response.status_code != 200:
        sys.stdout.write("\n\t%s returned request error status code: %s" % (altname, response.status_code))
        response.close()
        return "None"

    # prefer the server-provided file name, fall back to the generic one
    disposition = response.headers.get('content-disposition')
    if disposition is None:
        sys.stdout.write("\n\tGeneric filename using disposition:\n\t %s" % url)
        locf = altname
    else:
        locf = ''.join(re.findall("filename=(.+)", disposition))
        # the header is remote input: never let it point outside fpath
        locf = path.basename(locf.strip().strip('"\''))
        if not locf:
            locf = altname
    target = "{}/{}".format(fpath, locf)

    try:
        with open(target, 'wb') as f:
            if chunk == "OFF":
                # stream file object, no data chunking
                response.raw.decode_content = True
                copyfileobj(response.raw, f)
            else:
                # chunked write - adjust iter_content size (bytes) ** -- not tested
                for piece in response.iter_content(1024):
                    f.write(piece)
    finally:
        response.close()
    sys.stdout.write("\n\tDownloaded %s " % locf)
    return target
    
     
# Attempt retrieval of data files, metadata, and misc information
# check information existence, purge (i.e. update) data files if commanded
def data_proc(status, df_request, nf_dict):
    """Retrieve data files, metadata files and descriptive info for each dataset.

    For every row of ``df_request`` the function
      1. makes sure the dataset folder (local directory + dataset ID) exists;
      2. creates the dataset's entry in ``status[2]['datasets (id)']`` if new;
      3. unless the entry's download state is already complete, downloads the
         extensions listed in ``nf_dict['init']['dataproc']`` and
         ``['metafiles']``, stores the table information named in
         ``['table_info']`` (info_request.json / info_dict.json), and marks the
         download "YES" once no file is missing.
    The status list is written back to the status file as work progresses.
    Nothing is done when ``purge_proc`` is 'skip'.

    Changes relative to the earlier version:
      * the Info request compared the error flag with 'error', a value
        url_request never returns; it now tests for 'failed';
      * a download that returns None is recorded as "None", and both None and
        "None" count as missing, so a failed request can no longer be
        mistaken for a finished download;
      * ``dict.values()`` replaces the Python-2-only ``itervalues()``;
      * the two identical download loops share one helper.

    Parameters:
        status: status list (general info, local directory, datasets).
        df_request: DataFrame of datasets; reads "Dataset ID", "Title",
            "tabledap", "griddap", "wms" and the ``table_info`` columns.
        nf_dict: operation-input dictionary (reads the 'init' section).

    Returns:
        None; ``status`` is updated in place.
    """
    if nf_dict['init']['purge_proc'] == 'skip':
        sys.stdout.write("\n\t - passed over data processing step - as directed\n")
        return

    def _missing(value):
        # "None" is the status file's marker for "not retrieved yet"
        return value is None or value == "None"

    def _save_status():
        with open(status[1]['local_directory'] + nf_dict['init']['sb_json'], 'w') as fp:
            json.dump(status, fp, indent=4)

    def _download_group(extensions, record, url_base, fpath, dataset_id):
        # fetch every extension of the group that has not been retrieved yet
        for ext in extensions:
            if _missing(record['files'][ext]):
                url = "{}{}".format(url_base, ext)
                altname = "{}{}".format(dataset_id, ext)
                record['files'][ext] = download_file(url, fpath, altname, "OFF") or "None"

    # loop through datasets
    for index, row in df_request.iterrows():
        dataset_id = row['Dataset ID']

        # ensure existence of working directory (temporary?)
        fpath = "{}{}".format(status[1]['local_directory'], dataset_id)
        chk_files(fpath, {}, '--')

        # if warranted create dataset download record (ERDDAP dataset ID)
        datasets = status[2]['datasets (id)']
        if dataset_id not in datasets:
            blank_files = {k: "None" for k in (
                nf_dict['init']['dataproc'] + nf_dict['init']['metafiles'] + ['info_dict', 'info_request'])}
            datasets[dataset_id] = OrderedDict([('sb_id', "None"), ('dataset_url', "None"),
                    ('download', 'incomplete'), ('upload', 'incomplete'), ('files', blank_files)])
        record = datasets[dataset_id]

        # find ERDDAP dataset type (table, grid, wms), assumes type entries are unique
        url_base = ''.join([row["tabledap"], row["griddap"], row["wms"]])
        record['dataset_url'] = url_base + ".html"

        # retrieve requested record information
        if record['download'] != 'incomplete':
            sys.stdout.write("\n\n** Dataset ** : %s\n\n  files downloaded:" % dataset_id)
        else:
            sys.stdout.write("\n** Processing Dataset ** : %s - %s\n\n**  Retrieving datafiles:"
                % (dataset_id, row["Title"]))

            # retrieve requested ERDDAP datasets
            _download_group(nf_dict['init']['dataproc'], record, url_base, fpath, dataset_id)
            sys.stdout.write("\n\tStep completed...\n")

            sys.stdout.write("\n    Processing metafiles:")
            _download_group(nf_dict['init']['metafiles'], record, url_base, fpath, dataset_id)
            sys.stdout.write("\n\tStep completed...\n")

            _save_status()

            # process data table information
            sys.stdout.write("\n    Processing table information:")

            # info dictionary
            info_request = {}
            info_dict = {'summary_info': {}, 'data_info': {}}

            # retrieve other ERDDAP information as dictionary (info_request, RSS, or basic xml, etc.)
            if _missing(record['files']['info_request']):
                for d in nf_dict['init']['table_info']:
                    entry = row[d]

                    # store info depending on the type, a work in progress
                    if "http" in entry and d != 'Summary':
                        ext = entry.rsplit(".", 1)[1]
                        sys.stdout.write("\n\tRetrieving additional url to request: %s " % d)

                        # info file
                        if d == "Info":
                            info_request, err = url_request(entry, 'json')
                            if err != 'failed':
                                f = "{}{}".format(fpath, "/info_request.json")
                                with open(f, 'w') as fp:
                                    json.dump(info_request, fp, indent=4)
                                record['files']['info_request'] = f

                        # parse rss file
                        elif d == 'RSS':
                            response, err = url_request(entry, 'file')
                            if response.status_code == 200:
                                response.raw.decode_content = True
                                tree = etree.parse(response.raw)
                                root = tree.getroot()
                                # namespace prefix is taken from the root tag's local name
                                label = root.tag.rsplit("}", 1)[1]
                                ns = {label: root.nsmap[None]}
                                f = "{}{}{}{}".format("//", label, ":", 'item/*')
                                modinfo = tree.xpath(f, namespaces=ns)
                                info_dict['summary_info']['rss'] = {}
                                for r in modinfo:
                                    info_dict['summary_info']['rss'][r.tag.rsplit("}", 1)[1]] = r.text
                                f = '//rss:pubDate'
                                modinfo = tree.xpath(f, namespaces=ns)
                                for r in modinfo:
                                    info_dict['summary_info']['rss']['pubDate'] = r.text
                            response.close()

                        # convert general xml request to dictionary
                        elif ext == "xml":
                            response, err = url_request(entry, 'file')
                            if response.status_code == 200:
                                response.raw.decode_content = True
                                tree = etree.parse(response.raw)
                                xml_string = etree.tostring(tree)
                                info_dict['data_info'][d] = xml_to_dict(xml_string)
                            response.close()

                        # dump misc content as text
                        else:
                            sys.stdout.write('file type: %s is not detailed.'
                                             '\n\tBulk printing response as string' % ext)
                            info_dict['data_info'][d] = entry

                    # store other in general section ('summary_info')
                    else:
                        info_dict['summary_info'][d] = entry

                # update file information
                f = "{}{}".format(fpath, "/info_dict.json")
                with open(f, 'w') as fp:
                    json.dump(info_dict, fp, indent=4)
                record['files']['info_dict'] = f

            # check if downloads complete
            if any(_missing(value) for value in record['files'].values()):
                record['download'] = "incomplete"
            else:
                record['download'] = "YES"
            _save_status()

        sys.stdout.write("\n\tStep completed...\n")

    # work with only one dataset = for testing purposes Tristan
    #    break
    return

# Search ERDDAP datasets using a simple interface (otherwise use adv url and bypass)  - incomplete ** 
def search_url(df, bulk_opts_dict, baseurl):
    """Interactively choose the ERDDAP datasets to request.

    The user picks either
      (B) a bulk request - all datasets, all tables or all grids - or
      (S) a search built from an optional category/subcategory, optional
          keyword(s) and an optional protocol; the resulting 'advanced
          search' URL is requested.
    The prompt repeats until a request has been made.

    Changes relative to the earlier version:
      * the advanced-search response is now captured and used; previously the
        result of that request was thrown away and the table was built from
        whichever response happened to be fetched last;
      * a failed advanced-search request returns to the prompt instead of
        returning a bare None, which the caller cannot unpack;
      * the bulk prompt only accepts the letters it offers (D, T, G);
      * index input is parsed with ``except ValueError`` so Ctrl-C is no
        longer swallowed; ``.loc`` replaces the removed ``get_value``;
      * the Python 2 ``print`` statement was rewritten as a call;
      * the subcategory listing request, whose result was never used, is gone.

    Parameters:
        df: table of server navigation links (currently unused here).
        bulk_opts_dict: mapping from option letter to
            (resource, task, url), as produced by bulkoption().
        baseurl: ERDDAP base URL handed to build_url().

    Returns:
        tuple ``(datasets DataFrame, html search url, json search url)``.
    """
    while True:

        print("Request ERDDAP datasets using a retrieval option <letter>: \n\t%s\n\t%s"
              % ("(B) Bulk type constraint, or", "(S) Search (categorical, keyword(s), phrase)"))
        entry = (raw_input()).upper()

        # bulk file read
        if entry == "B":
            while True:
                sys.stdout.write("\nRequest all: (D) datasets, (T) Tables, or (G) Grids\n")
                bulk = (raw_input()).upper()
                if bulk in ("D", "T", "G") and bulk in bulk_opts_dict:
                    json_link = bulk_opts_dict[bulk][2]
                    link = ''.join(json_link).replace(".json", ".html")
                    print("\nBulk Request %s\nURL: %s" % (bulk_opts_dict[bulk][1], link))
                    response, err = url_request(json_link, 'json')
                    df_query = pd.DataFrame(response['table']['rows'], columns=response['table']['columnNames'])
                    get_stats(df_query)
                    return df_query, link, json_link
                else:
                    print("\nInvalid command - enter an indicated letter option (*)")

        # Search by category, search word(s), and/or protocol (data type)
        elif entry == "S":
            sys.stdout.write("\n*** Note: custom search methods are loosely fitted - in progress ***")
            response, err = url_request(bulk_opts_dict["C"][2], 'json')
            df_cquery = pd.DataFrame(response['table']['rows'], columns=response['table']['columnNames'])
            df_cquery.columns.name = "Search Index"
            display(df_cquery)

            # Category search
            sys.stdout.write("Select search index for category [left column] (optional: # is not index --> skip):\n")
            try:
                cindx = int(raw_input())
            except ValueError:
                cindx = None
            squery_entry = ""
            if cindx in list(df_cquery.index.values):
                url = ''.join(df_cquery.iloc[[cindx]]["URL"])
                response, err = url_request(url, 'json')
                df_squery = pd.DataFrame(response['table']['rows'], columns=response['table']['columnNames'])
                df_squery.columns.name = "Search Index"
                display(df_squery)
                sindx = raw_input("Select search index for subcategory [left column] (optional: # is not index --> skip):\n")
                try:
                    sindx = int(sindx)
                except ValueError:
                    sindx = None
                if sindx in list(df_squery.index.values):
                    squery_entry = df_squery.loc[sindx, "Category"]
                else:
                    sys.stdout.write("\n\tEntered search index was not in list, skipped subcategory search.\n\n")
            else:
                sys.stdout.write("\n\tEntered search index was not in list, skipped category refinement.\n\n")

            # Word or phrase search
            sys.stdout.write("\nOptional keyword or phrase search\nEnter space-delimited search word(s) or qouted phrase (blank --> skip):\n")
            search = raw_input()
            if search == '':
                sys.stdout.write("Search input was blank, skipped word search")

            # Protocol (data type) search constraint - currently does not examine subsets
            sys.stdout.write("\nDefine allowed protocol (data type): \n\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n"
                  % ("(A) All data types,", "(G) Griddap, or", "(T) Tabledap, or", "(W) Wms", "(default --> All types)"))
            protocol = (raw_input()).upper()
            if protocol not in ["A", "G", "T", "W"]:
                protocol = "A"

            # Gather pertinent info, build custom url for http services
            comb = ''.join([search, protocol, squery_entry])
            if comb != "A":
                gen_url_html, gen_url_json = build_url(search, baseurl, protocol, df_cquery, cindx, squery_entry)
                sys.stdout.write("\n'Advanced' search url:\n%s\n" % gen_url_html)
                try:
                    response, err = url_request(gen_url_json, 'json')
                except Exception:
                    err = 'failed'
                if err == 'failed':
                    # nothing usable came back: go round again rather than hand back None
                    sys.stdout.write("Identified exception during url request")
                    continue
                df_request = pd.DataFrame(response['table']['rows'], columns=response['table']['columnNames'])
                get_stats(df_request)
                return df_request, gen_url_html, gen_url_json
            else:
                sys.stdout.write("\nNo search constraints were detected - retry\n\n")
        else:
            print("\nInvalid command - enter a letter option (*)\n")

# set permissions
def set_permissions(item_id, acls):
    """PUT a permissions document to a ScienceBase item and return the reply.

    NOTE(review): relies on a session object named ``sb`` being available at
    module level when this runs - confirm where it is created.

    Parameters:
        item_id: ScienceBase item identifier (string) used in the URL.
        acls: permissions structure; serialised with ``json.dumps``.

    Returns:
        Whatever ``sb._get_json`` returns for the PUT response.
    """
    permissions_url = "https://www.sciencebase.gov/catalog/item/" + item_id + "/permissions/"
    payload = json.dumps(acls)
    reply = sb._session.put(permissions_url, data=payload)
    return sb._get_json(reply)

# adjust read/write (task) privleges 
def set_acls(acls, names, task):
    """Replace the access list of one permission section with ``names``.

    The section ``acls[task]`` loses its 'inheritsFromId' key (if present),
    gets 'inherited' set to False and receives a new list holding the
    entries of ``names`` in order.

    Parameters:
        acls: permissions dictionary; modified in place.
        names: iterable of access-list entries.
        task: key of the section to change.

    Returns:
        The same ``acls`` object.
    """
    section = acls[task]
    section.pop('inheritsFromId', None)
    section['inherited'] = False
    section['acl'] = list(names)
    return acls

# reformat date entries
def date_reform(date_input):
    """Reduce a date/time string to its calendar date (YYYY-MM-DD).

    The text is parsed with ``dateutil.parser.parse`` (time zone ignored) and
    the date part of the result is returned. If that date is today's date,
    the original input with '_recheck' appended is returned instead.

    The earlier version compared the date *string* with a ``datetime.date``
    object, which is never equal, so the '_recheck' branch could not run;
    both sides are now strings.
    NOTE(review): the flag presumably targets inputs whose missing fields the
    parser filled in from the current date - confirm that dates genuinely
    falling on the processing day should be flagged as well.

    Parameters:
        date_input: date text, or an empty/None value.

    Returns:
        "YYYY-MM-DD", ``date_input + '_recheck'``, or "" for empty input.
    """
    if not date_input:
        return ""
    date_rec = str(parser.parse(date_input, ignoretz=True)).split(" ", 1)[0]
    if date_rec == str(datetime.date.today()):
        date_rec = "{}{}".format(date_input, '_recheck')
    return date_rec
        
# pull info from ERDDAP info table as pandas dataframe (adhoc corrections for now, work in progress)
def populate_sbase(attribute , rmv, info_frame):
    """Look up one attribute value in an ERDDAP info table.

    The first row of ``info_frame`` whose "Attribute Name" equals
    ``attribute`` supplies the result (its "Value" column). With
    ``rmv == 'yes'`` text values are tidied: runs of spaces collapsed, a
    space ensured after every comma, trailing whitespace removed and "?s"
    turned into "'s" (undoing a mis-translated apostrophe).

    Changes relative to the earlier version: the value is read with ``.loc``
    instead of the removed ``DataFrame.get_value``, and the text check works
    on Python 3 as well as Python 2.

    Parameters:
        attribute: attribute name to find.
        rmv: 'yes' to tidy text values; anything else returns them untouched.
        info_frame: DataFrame with "Attribute Name" and "Value" columns.

    Returns:
        The (possibly tidied) value, or "" when the attribute is absent.
    """
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str

    indx = info_frame[info_frame['Attribute Name'] == attribute].index.tolist()
    if not indx:
        return ""

    entry = info_frame.loc[indx[0], "Value"]
    if rmv == 'yes' and isinstance(entry, text_types):
        entry = re.sub(r' +', ' ', entry)        # collapse repeated spaces
        entry = re.sub(r'\,(?! )', ', ', entry)  # ensure space after commas
        entry = entry.rstrip()                   # drop trailing whitespace / newlines
        entry = entry.replace('?s', "'s")        # repair "?" left by encoding errors
    return entry
    

# process contact information from erddap information file
def proc_erddap_contacts(base_key, info_file, type_label, info_keys, label_keys):
    """Collect one contact record from an ERDDAP info table.

    For each suffix in ``info_keys`` the attribute ``base_key + '_' + suffix``
    is looked for in the "Attribute Name" column; when some name contains it,
    the value from populate_sbase() (tidied, then decoded as UTF-8) is stored
    under the label at the same position in ``label_keys``, otherwise "".

    Parameters:
        base_key: attribute prefix.
        info_file: DataFrame with an "Attribute Name" column.
        type_label: value stored under the 'type' key.
        info_keys: attribute suffixes to look up.
        label_keys: output keys, parallel to ``info_keys``.

    Returns:
        dict of contact fields; an 'organization' entry, if present, is
        wrapped as ``{'displayText': value}``.
    """
    contact = {}
    for position, suffix in enumerate(info_keys):
        attribute = base_key + '_' + suffix
        found = info_file['Attribute Name'].str.contains(attribute).any()
        if found:
            value = populate_sbase(attribute, 'yes', info_file).decode('utf-8')
        else:
            value = ""
        contact[label_keys[position]] = value

    contact[u'type'] = type_label
    if u'organization' in contact:
        contact[u'organization'] = {u'displayText': contact[u'organization']}

    return contact

# purge function to restart data and metadata processing, if True
# i.e. retrieved files will be re-processed and will overwrite existing files
def purge_option(SB_status, nf_dict):
    """Reset the per-dataset processing status when a purge is requested.

    When nf_dict['init']['purge_proc'] equals True, every dataset record in
    SB_status[2]['datasets (id)'] gets 'download' and 'upload' set to
    'incomplete' and its 'files' mapping rebuilt with the string "None" for
    each data file key, metadata file key, 'info_dict' and 'info_request';
    the status structure is then written to the status JSON file.

    When the flag equals False only a message is written.  Any other value
    does nothing.
    """
    settings = nf_dict['init']
    purge_requested = settings['purge_proc']

    if purge_requested == False:
        sys.stdout.write("\n** Request to append incomplete file information to existing status file\n")
        return
    if not (purge_requested == True):
        return

    sys.stdout.write("\n** Request to reset (purge) data/metadata status indicators")
    data_keys = settings['dataproc']
    meta_keys = settings['metafiles']

    for record in SB_status[2]['datasets (id)'].values():
        record['download'] = 'incomplete'
        record['upload'] = 'incomplete'
        tracked = data_keys + meta_keys + ['info_dict', 'info_request']
        record['files'] = dict.fromkeys(tracked, "None")

    status_path = SB_status[1]['local_directory'] + settings['sb_json']
    with open(status_path, 'w') as handle:
        json.dump(SB_status, handle, indent=4)
    sys.stdout.write("\n\t - Reset (purge) complete")
               
def _sb_to_text(value):
    """Return ``value`` as text, decoding UTF-8 only when it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _sb_unique_by(items, key):
    """Return ``items`` in order, keeping the first entry for each ``item[key]``.

    Entries without ``key`` are kept as they are because they cannot be compared.
    """
    seen = set()
    unique = []
    for item in items:
        if key not in item:
            unique.append(item)
        elif item[key] not in seen:
            seen.add(item[key])
            unique.append(item)
    return unique


def _sb_create_item(sb, info_frame, nf_dict):
    """Create a ScienceBase item titled from the ERDDAP info table and return it.

    The parent is the id returned by ``sb.get_my_items_id()`` and the
    provenance annotation is ``nf_dict['prov_state']``.
    """
    new_item_info = {'title': populate_sbase("title", 'yes', info_frame),
                     'parentId': sb.get_my_items_id(),
                     'provenance': {'annotation': nf_dict['prov_state']}}
    return sb.create_item(new_item_info)


# Create and update SB records, modify as needed
def SciBase_item(status, nf_dict):
    """Create or update one ScienceBase item per downloaded dataset.

    Parameters:
        status:  status structure; this function reads
                 status[0]['general information'][2]['file modified (date-time)'],
                 status[1]['local_directory'] and status[2]['datasets (id)'].
                 Dataset records are updated in place ('sb_id', 'upload',
                 'Possible replicate items') and the structure is written to
                 the status JSON file after each dataset.
        nf_dict: operation inputs.  nf_dict['init']['purge_sb'] selects the mode:
                 'skip' - return immediately without contacting ScienceBase;
                 False  - upload only files not yet attached to the item;
                 True   - upload new files and replace existing ones.

    For every dataset whose 'download' status is not 'incomplete' the item is
    located (or created), files are uploaded, and dates, citation, provenance,
    title, body, web links, contacts, permissions and tags are refreshed before
    the item is saved.  Possible duplicate items are reported, not removed.

    Returns None.
    """
    purge_sb = nf_dict['init']['purge_sb']
    if purge_sb == 'skip':
        sys.stdout.write("\n\t - passed over ScienceBase processing step - as directed\n")
        return

    sys.stdout.write("{}".format('\n\n\tLogin to ScienceBase - '))
    sb = pysb.SbSession()
    sb.loginc(str(nf_dict['init']['login_name']))
    time.sleep(5)

    for dataset in status[2]['datasets (id)']:

        data = status[2]['datasets (id)'][dataset]

        if data["download"] == 'incomplete':
            sys.stdout.write("\nInfo for dataset %s - incomplete not processed\n" % dataset)
            continue

        # Open erddap information files (info_request and info_dict, as json)
        with open(data['files']['info_request']) as fp:
            info_request = json.load(fp)
        info_frame = pd.DataFrame(info_request['table']['rows'],
                                  columns=info_request['table']['columnNames'])
        # info_dict is loaded as before but is not used further in this function
        with open(data['files']['info_dict']) as fp:
            info_dict = json.load(fp)
        sys.stdout.write("\n   Processing dataset: %s into ScienceBase\n" % dataset)

        # check SB item existence (visibility)
        SB_rec = None
        if data["sb_id"] != "None":
            try:
                SB_rec = sb.get_item(data["sb_id"])
            except Exception:
                # best effort: any retrieval failure is treated as "item not found"
                SB_rec = None
                sys.stdout.write("\n{}{}{}".format('Warning: SB item ', data["sb_id"],
                                                   ' was NOT located - creating new item'))
            else:
                sys.stdout.write("\n\t{}{}{}".format('SB item: ', data["sb_id"],
                                                     ' was found in ScienceBase'))

        # create new sciencebase item, if an existing item is not found
        if SB_rec is None:
            SB_rec = _sb_create_item(sb, info_frame, nf_dict)
            data["sb_id"] = SB_rec['id']
            sys.stdout.write("\n\t{}{}".format('Created new ScienceBase item:', data["sb_id"]))

        # check SB item permissions, if no permissions create new item
        if u'USER:' + nf_dict['init']['login_name'] in SB_rec['permissions'][u'write'][u'acl']:
            sys.stdout.write("\n\t** Verified ** user has write privileges for item")
        else:
            sys.stdout.write("\n *****  WARNING  ***** : user is without write privileges - creating new item")
            SB_rec = _sb_create_item(sb, info_frame, nf_dict)
            # record the id of the replacement item; leaving "None" here made
            # every later sb.get_item(data["sb_id"]) call fail
            data["sb_id"] = SB_rec['id']

        sys.stdout.write("\n\tChecking files, uploading/updating files and metadata")

        # gather list of files currently in item's 'files' and 'facets'
        uploaded_files = []
        if 'files' in SB_rec:
            uploaded_files = [f_ex['name'] for f_ex in SB_rec['files']]
        for facet in SB_rec.get('facets', []):
            for f_ex in facet.get('files', []):
                uploaded_files.append(f_ex['name'])

        # upload new files always; replace existing files only when purge_sb is True
        if purge_sb == False or purge_sb == True:
            for f in (nf_dict['init']['metafiles'] + nf_dict['init']['dataproc']):
                local_file = data['files'][f]
                fn = local_file.rsplit("/", 1)[-1]
                if fn not in uploaded_files:
                    sys.stdout.write("\n\t{}".format("uploading new file to item"))
                    sb.uploadFileToItem(SB_rec, local_file)
                elif purge_sb == True:
                    sys.stdout.write("\n\t{}".format("updating existing file in item"))
                    sb.replace_file(local_file, SB_rec)
                    SB_rec = sb.get_item(data["sb_id"])

        sys.stdout.write("\n\tModifying ScienceBase record information")

        # retrieve latest SB item after file uploads/updates
        SB_rec = sb.get_item(data["sb_id"])

        # sb date tags, create if needed.
        # NOTE(review): the entry created here has type "Item submission" and
        # label "Item completed", yet the check below compares the *label* with
        # "Item submission" - confirm which field should be tested.
        rep = status[0]['general information'][2]['file modified (date-time)']
        if rep == 'none' or 'dates' not in SB_rec:
            SB_rec[u'dates'] = [{u"type": u"Item submission",
                                 u"dateString": str(datetime.date.today()),
                                 u"label": u"Item completed"}]
        elif SB_rec[u'dates'] and SB_rec[u'dates'][0].get(u"label") == u"Item submission":
            SB_rec[u'dates'] = [SB_rec[u'dates'][0]]
        else:
            SB_rec[u'dates'] = []

        # add in erddap record dates, if present
        for i, k in enumerate(nf_dict['date_keys']):
            entry = date_reform(populate_sbase(k, 'no', info_frame))
            if entry:
                SB_rec[u'dates'].append({
                    u"type": nf_dict['date_labels'][i],
                    u"dateString": entry,
                    u"label": nf_dict['date_labels'][i]})

        # add in file retrieval (download) date using OS call;
        # use the first metadata file key, else the first data file key
        metafiles = nf_dict['init']['metafiles']
        if metafiles and metafiles[0]:
            fn = metafiles[0]
        else:
            fn = nf_dict['init']['dataproc'][0]
        date_input = path.getmtime(data['files'][fn])
        retrieve = datetime.datetime.fromtimestamp(date_input).strftime('%Y-%m-%d')
        SB_rec[u'dates'].append({u"type": u"Retrieved from source",
                                 u"dateString": _sb_to_text(retrieve),
                                 u"label": u"Retrieved from source"})

        # update citation information, if available, else remove tag
        study_cite = populate_sbase('associatedReferences', 'yes', info_frame)
        data_cite = populate_sbase('bibliographicCitation', 'yes', info_frame)
        if data_cite:
            SB_rec['citation'] = "{}{}".format("Data citation - ", data_cite)
            if study_cite:
                SB_rec['citation'] = "{}{}".format(
                    SB_rec['citation'],
                    "{}{}".format(". Study citation - ", study_cite))
        elif study_cite:
            SB_rec['citation'] = "{}{}".format("Study citation - ", study_cite)
        elif 'citation' in SB_rec:
            SB_rec['citation'] = ""

        # update provenance tag
        SB_rec.setdefault(u'provenance', {})['annotation'] = "{}{}{}".format(
            nf_dict['prov_state'], ' DataID: ', dataset)

        # update title
        SB_rec["title"] = populate_sbase("title", 'yes', info_frame)

        # retrieve license information
        entry_license = populate_sbase("license", 'yes', info_frame)

        # update summary + add license information
        SB_rec["body"] = "{}{}{}".format(populate_sbase("summary", 'yes', info_frame),
                                         '&nbsp; &nbsp;\n<br> \n<br>', entry_license)

        # provide OPeNDAP weblinks to source
        webrec = []
        for w in nf_dict['init']['webnames']:
            webrec.append({
                u'hidden': False,
                u'rel': u'related',
                u'title': nf_dict['file_dict'][w],
                u'type': u'OPeNDAP weblinks - data provenance trace',
                u'uri': data["dataset_url"].replace('.html', w)})
        SB_rec['webLinks'] = webrec

        # retrieve contact information in erddap info record
        url_b = "{}{}{}{}".format(nf_dict['init']['baseurl'], '/info/', dataset, '/index.json')
        info_req = requests.get(url_b).json()
        info_file = pd.DataFrame(info_req['table']['rows'],
                                 columns=info_req['table']['columnNames'])

        # search for contacts,
        # if none, allow autopopulated metadata contacts, if present
        store_name = []
        for indx, tag in enumerate(nf_dict['erddap_contype']):
            label = "{}{}".format(tag, '_name')
            if info_file['Attribute Name'].str.contains(label).any():
                store_name.append(proc_erddap_contacts(
                    tag, info_file, _sb_to_text(nf_dict['sb_contype'][indx]),
                    nf_dict['info_keys'], nf_dict['label_keys']))
        if not store_name:
            store_name = list(SB_rec.get('contacts', []))
        store_name.append(nf_dict['BCB_contact'])

        # eliminate duplicate contacts
        SB_rec['contacts'] = _sb_unique_by(store_name, 'name')

        # set permissions
        acls = SB_rec['permissions']
        set_acls(acls, nf_dict['read_names'], 'read')
        set_acls(acls, nf_dict['write_names'], 'write')
        #   set_permissions(SB_rec['id'], acls) - currently not used (sub 2 lines below)
        sb_base_url = "https://www.sciencebase.gov/catalog/item/" + SB_rec['id'] + "/permissions/"
        sb._get_json(sb._session.put(sb_base_url, data=json.dumps(acls)))

        # remove redundancy in tag names
        if u'tags' in SB_rec:
            SB_rec[u'tags'] = _sb_unique_by(SB_rec[u'tags'], 'name')

        # check for potential duplicate items in sciencebase (title, filename(s), dataset id)
        sys.stdout.write("\n\t{}{}".format('Searching for duplicate',
                                           ' sb items using filenames, item title, and dataset ID'))
        blk_search = sb.find_items_by_title(SB_rec["title"])[u'items']
        for f_ex in SB_rec.get('files', []):
            blk_search.extend(sb.find_items_by_any_text(f_ex['name'])[u'items'])
        blk_search.extend(sb.find_items_by_any_text(dataset)[u'items'])
        # build a real list (not a lazy filter object) so the emptiness test
        # and the JSON dump of the status file behave the same on Python 2 and 3
        search_id = [item['id'] for item in _sb_unique_by(blk_search, 'id')
                     if item['id'] != data["sb_id"]]
        if search_id:
            sys.stdout.write("\n\n\t\t{}".format('*** WARNING possible duplicate item(s) ***'))
            data['Possible replicate items'] = search_id
            for s in search_id:
                sys.stdout.write("\n\t\t\t{}".format(s))
        else:
            sys.stdout.write("\n\t\t{}".format('pass: item duplicates not identified'))

        # save updates, file
        sys.stdout.write("\n\tScienceBase processing completed\n")
        sb.updateSbItem(SB_rec)
        data["upload"] = "YES"
        with open(status[1]['local_directory'] + nf_dict['init']['sb_json'], 'w') as fp:
            json.dump(status, fp, indent=4)

        # testing - work with only one dataset = for testing purposes (break) - Tristan
        #    break

# confirmation message: reached only after the definitions above have been executed
print('definitions loaded')

definitions loaded


In [61]:
# !/usr/bin/python

# *******************************************************
#                   Main Program: 
# *******************************************************

def main():
    """Run the ERDDAP to ScienceBase workflow and report task runtimes.

    Steps: read the operation inputs from ``tempdir + "name_file_dict.json"``,
    initialise the status file and retrieve the ERDDAP request table, apply the
    purge option, process data and metadata, create or update ScienceBase
    items, then save the status file and write date-stamped copies of the
    status and input dictionaries to the 'Prov_files' folder.

    ``tempdir`` is read from the enclosing (notebook/module) namespace and must
    be defined before this function is called.
    """
    import time  # local import: needed for the portable epoch-seconds computation below

    #  read general input file
    with open(tempdir + "name_file_dict.json") as fp:
        nf_dict = json.load(fp)

    #  task timer
    tstamp = []
    tasklist = []
    tasklist.append('Preliminary processes and testing')

    #  check preliminary info, search and retrieve ERDDAP data information
    tstamp.append(timer())
    sys.stdout.write('Search ERDDAP data repository, develop search criteria\n')
    SB_status = chk_files(tempdir, nf_dict, 'initialize')
    df_request = retrieve_data(nf_dict, SB_status)

    #  reset data and metadata processing status in status file, if directed
    purge_option(SB_status, nf_dict)

    #  process data
    tstamp.append(timer())
    tasklist.append('Process datasets and information')
    sys.stdout.write("{}".format('\n\n** Processing data and metadata files\n'))
    data_proc(SB_status, df_request, nf_dict)

    #  create, modify sciencebase records
    tstamp.append(timer())
    tasklist.append('Develop and/or modify ScienceBase records')
    sys.stdout.write("{}".format('\n** Processing ScienceBase items **'))
    SciBase_item(SB_status, nf_dict)

    #  create date_stamps
    tstamp.append(timer())
    tasklist.append('Archive status and dictionary files with date stamps')
    dt = datetime.datetime.now()
    # epoch seconds followed by six microsecond digits; computed with
    # time.mktime because the '%s' strftime directive is not portable
    epoch_secs = int(time.mktime(dt.timetuple()))
    timesec = int("{}{:06d}".format(epoch_secs, dt.microsecond))
    # last dash-separated group of the processing uuid (whole value if no dash)
    partial_uid = SB_status[0]['general information'][10]['processing uuid'].rsplit("-", 1)[-1]
    unique = 'uuid_' + partial_uid + '_timeid_' + str(timesec)
    fulldate = dt.strftime('%Y-%m-%d %H:%M:%S')

    #  save final version of status file
    SB_status[0]['general information'][2]['file modified (date-time)'] = fulldate
    fname = "{}{}".format(SB_status[1]['local_directory'], nf_dict['init']['sb_json'])
    with open(fname, 'w') as fp:
        json.dump(SB_status, fp, indent=4)

    #  create prov copies of status file and dictionary file, place in prov folder
    fpath = "{}{}".format(SB_status[1]['local_directory'], 'Prov_files')
    chk_files(fpath, {}, '--')
    for suffix, payload in (('_status.json', SB_status), ('_nfdict.json', nf_dict)):
        fname = "{}{}{}{}".format(fpath, '/', unique, suffix)
        with open(fname, 'w') as fp:
            json.dump(payload, fp, indent=4)

    #  timer results: each task runs from its own stamp to the next one
    tstamp.append(timer())
    for i in range(len(tstamp) - 1):
        sys.stdout.write('\nTask ' + str(i + 1) + ': ' + tasklist[i] +
                         ' | Runtime [secs]: ' + str(round(tstamp[i + 1] - tstamp[i], 3)))

    sys.stdout.write("\n\nProgram has finished")

    
# ************************************************************************* 
    
# run the workflow only when this code is executed directly (not when imported)
if __name__ == '__main__':

    # call to main
    main() 


Search ERDDAP data repository, develop search criteria


*** Automated search using information file: file_sb.json  ***


** Request to append incomplete file information to existing status file


** Processing data and metadata files


** Dataset ** : erdCinpKfmSFNH

  files downloaded:
	Step completed...


** Dataset ** : erdCinpKfm1Q

  files downloaded:
	Step completed...


** Dataset ** : erdCinpKfm5Q

  files downloaded:
	Step completed...


** Dataset ** : erdCinpKfmBT

  files downloaded:
	Step completed...


** Dataset ** : erdCinpKfmFT

  files downloaded:
	Step completed...


** Dataset ** : erdCinpKfmRPC

  files downloaded:
	Step completed...

** Processing Dataset ** : prboSefiDiet - Farallon Island Seabird Diet Summary

**  Retrieving datafiles:
	Step completed...

    Processing metafiles:
	prboSefiDiet.iso19115 returned request error status code: 500
	Step completed...

    Processing table information:
	Step completed...

** Processing Dataset ** : prboSefiPhen - Fara