This notebook provides an example of working through a batch of DOI updates based on records in ScienceBase. In this case, we had previously reserved DOIs, all pointing at ScienceBase Items as their dereferencing URLs, and we now need to finalize the DOIs and activate them. It uses Brandon Serna's usgs_datatools package, whose latest version works with the new DOI REST API. Most of the pieces here should be fairly easy for others to reuse when they need to do something similar.

The first few blocks here do all the same things Brandon has shown in other examples, setting up a session with the DOI tool to do work.

In [None]:
import os
import sys
import json
import requests
import getpass

from IPython.display import display
from usgs_datatools import doi
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError

# One HTTP session per lookup service. Each session mounts an adapter built
# with max_retries=3 on that service's URL prefix.

# Publications Warehouse lookups.
pubs_session = requests.Session()
pubs_adapter = HTTPAdapter(max_retries=3)
pubs_session.mount('https://pubs.er.usgs.gov', pubs_adapter)

# geo-nsdi contacts lookups.
peter_session = requests.Session()
peter_adapter = HTTPAdapter(max_retries=3)
peter_session.mount('https://geo-nsdi.er.usgs.gov', peter_adapter)


In [None]:
# Credentials for the DOI tool. The password is never stored in the notebook:
# it is read from the DOI_PASSWORD environment variable when set, otherwise
# prompted for without echo. DOI_USERNAME may override the default account.
username = os.environ.get("DOI_USERNAME", "vgarbulet@contractor.usgs.gov")
password = os.environ.get("DOI_PASSWORD") or getpass.getpass("DOI tool password: ")

In [None]:
# Create a DOI tool session with env='beta' and authenticate it with the
# username/password set in the previous cell. The functions and the batch
# loop below use doi_session for reading and updating DOI records.
doi_session = doi.DoiSession(env='beta')
doi_session.doi_authenticate(username, password)

In [None]:
def search_pubs_warehouse(name, params):
    """Look up an author in the publication service at pubs.er.usgs.gov.

    Args:
        name: Sequence of at least two name parts (e.g. ``['Smith', 'John']``).
            An author entry matches when its ``text`` field contains both
            ``name[0]`` and ``name[1]``.
        params: Value sent as the ``contributor`` query parameter.

    Returns:
        dict: Empty when no author matches or the response is not usable
        JSON. Otherwise it holds ``authorName`` -- ``"family, given"`` when
        the entry has both fields, else the name parts joined with a space --
        and ``orcId`` when the entry has an ``orcid`` value.

    Raises:
        SystemExit: If the request fails with a ``ConnectionError``.
    """
    pub_url = "https://pubs.er.usgs.gov/pubs-services/publication/"
    pub_params = {'format': 'json', 'contributor': params}
    try:
        pub_request = pubs_session.get(url=pub_url, params=pub_params)
    except ConnectionError as ce:
        print(ce)
        sys.exit(1)

    try:
        data = pub_request.json()
    except ValueError:
        # Body was not JSON; report "no match" so the caller can fall back
        # to its next lookup, as the sibling contacts lookup does.
        return {}

    for record in data.get('records') or []:
        contributors = record.get('contributors') or {}
        for author in contributors.get('authors') or []:
            text = author.get('text') or ''
            if name[0] not in text or name[1] not in text:
                continue

            proper_name = {}
            if 'family' in author and 'given' in author:
                proper_name['authorName'] = author['family'] + ", " + author['given']
            else:
                # authorName must be a string; `name` arrives as a list of
                # name parts, so join it rather than storing the list itself.
                proper_name['authorName'] = (
                    name if isinstance(name, str) else " ".join(name)
                )
            if author.get('orcid'):
                # Keep only the identifier after the last "/" of the ORCID URL.
                proper_name['orcId'] = author['orcid'].split("/")[-1]
            return proper_name

    return {}


In [None]:
def search_peter_service(name, params):
    """Look up an author in the contacts service at geo-nsdi.er.usgs.gov.

    The service is queried twice, once with ``params[0]`` as the given name
    and ``params[1]`` as the surname and once with the two swapped, because
    the caller does not know which part is which.

    Args:
        name: Sequence of at least two name parts. A contact matches when
            its ``cntperp.cntper`` text contains both ``name[0]`` and
            ``name[1]``.
        params: Sequence whose first two items are used as the
            ``givenname`` / ``sn`` query parameters.

    Returns:
        dict: Empty when no contact matches. Otherwise it holds
        ``authorName`` formatted as ``"last, first middle"`` (empty parts
        are omitted) and ``orcId`` when the contact has a non-empty
        ``onlink`` value.

    Raises:
        SystemExit: If a request fails with a ``ConnectionError``.
    """
    peter_url = "https://geo-nsdi.er.usgs.gov/contacts.php"
    queries = (
        {'format': 'json', 'givenname': params[0], 'sn': params[1]},
        {'format': 'json', 'givenname': params[1], 'sn': params[0]},
    )
    for query in queries:
        try:
            peter_request = peter_session.get(url=peter_url, params=query)
        except ConnectionError as ce:
            print(ce)
            sys.exit(1)

        try:
            data = peter_request.json()
        except ValueError:
            # Response body was not JSON; try the other name ordering.
            continue

        for contact in data.get('contacts') or []:
            person = contact.get('cntperp') or {}
            display = person.get('cntper') or ''
            if name[0] not in display or name[1] not in display:
                continue

            parts = person.get('name') or {}
            # Join only the non-empty pieces so a missing middle name does
            # not leave a trailing space (or raise on a missing/null value).
            given = " ".join(
                piece for piece in (parts.get('first'), parts.get('middle')) if piece
            )
            full_name = ", ".join(
                piece for piece in (parts.get('last'), given) if piece
            )
            # If the structured name is empty, use the matched display text
            # so the author is not dropped for having an empty name.
            proper_name = {'authorName': full_name or display}

            onlink = contact.get('onlink')
            if onlink:
                # Keep only the identifier after the last "/" of the link.
                proper_name['orcId'] = onlink.split("/")[-1]
            return proper_name

    return {}


In [None]:
def validate(name):
    """Normalize one raw author name into an author entry.

    The name is split into tokens; hyphens and apostrophes inside a token are
    kept, so ``Smith-Jones`` and ``O'Brien`` each count as one token.
    Single-character tokens are treated as initials and ignored for the
    lookups.

    Args:
        name: Raw author name string.

    Returns:
        dict: Always has ``authorName`` and ``nameType``.
          * Fewer than two tokens, or fewer than two non-initial tokens:
            the raw name, ``nameType`` "Personal" (a warning is printed).
          * More than three non-initial tokens: the raw name, ``nameType``
            "Organizational" (a warning is printed).
          * Otherwise the result of ``search_pubs_warehouse``, then
            ``search_peter_service``, then the raw name, with ``nameType``
            "Personal". The lookup result may also carry ``orcId``.
    """
    # Imported here so the function does not depend on the notebook cell
    # that imports `re` (which appears after this definition) having run.
    import re

    name_strings = re.findall(r"\w+(?:['’-]\w+)*", name)

    if len(name_strings) < 2:
        print("WARNING !!!!! NAME CONTAINS ONLY 1 STRNG!!!!!")
        return {'authorName': name, 'nameType': "Personal"}

    # Drop initials: only tokens longer than one character are searched for.
    stripped_name = [token for token in name_strings if len(token) > 1]

    if len(stripped_name) > 3:
        print("WARNING !!!!! Name too long. Possible organization")
        return {'authorName': name, 'nameType': "Organizational"}

    if len(stripped_name) < 2:
        print("WARNING !!!!! Name must have at least 2 strings: family name and given name!!!!!")
        return {'authorName': name, 'nameType': "Personal"}

    # The same name parts serve as both the match criteria and the query
    # parameters. Each lookup returns an empty dict when it finds nothing.
    # NOTE: a further search_sciencebase fallback is currently disabled.
    name_params = list(stripped_name)
    proper_name = (
        search_pubs_warehouse(stripped_name, name_params)
        or search_peter_service(stripped_name, name_params)
        or {'authorName': name}
    )
    proper_name['nameType'] = "Personal"
    return proper_name


In [None]:
import re
import requests

def get_proper_author_list(doi):
    """Fetch a DOI record and rebuild its author list from validated names.

    Each existing author entry may hold several names separated by ``;``.
    Every non-empty name is passed through ``validate`` and the results
    replace the record's ``authors`` list, numbered by ``position`` from 0.

    Args:
        doi: DOI identifier passed to ``doi_session.get_doi``. (Inside this
            function the parameter hides the imported ``doi`` module; the
            name is kept so existing callers keep working.)

    Returns:
        The updated record dict, or ``0`` when the fetched record contains
        an ``error`` key. The record is modified in place:
          * ``noDataReleaseAvailableReason`` is set to ``'LEGACY_DATA'``
            when ``ipdsNumbers`` is missing or empty.
          * ``noPublicationIdAvailable`` is set to ``True``.
          * ``description`` and ``title`` have non-ASCII characters
            replaced with ``?``.
          * ``authors`` is replaced by the validated list.
    """
    doi_json = doi_session.get_doi(doi)

    # The caller compares the result against 0 to detect a failed fetch.
    if 'error' in doi_json:
        return 0

    # IPDS fields. A missing or null ipdsNumbers is treated like an empty list.
    if not doi_json.get('ipdsNumbers'):
        doi_json['noDataReleaseAvailableReason'] = 'LEGACY_DATA'
    doi_json['noPublicationIdAvailable'] = True

    # Force the free-text fields to ASCII. NOTE(review): this is lossy, since
    # every non-ASCII character becomes "?". Kept as in the original; confirm
    # that the DOI API still needs it.
    for field in ('description', 'title'):
        if doi_json.get(field):
            ascii_bytes = str(doi_json[field]).encode('ascii', 'replace')
            doi_json[field] = ascii_bytes.decode("utf-8")

    # TODO: if date or dateType are null, make both null (not implemented).

    proper_author_list = []
    for authors_entry in doi_json.get('authors') or []:
        raw_names = authors_entry.get('authorName') or ''
        print("IMPROPER NAME: " + raw_names)
        author_names = raw_names.split(';')
        print("SIZE: " + str(len(author_names)))

        for name in author_names:
            name = name.strip()
            if not name:
                # Blank segment, e.g. from a trailing ";": nothing to validate.
                continue
            proper_name = validate(name)
            if not proper_name['authorName']:
                continue
            # Positions are consecutive over the authors actually kept.
            proper_name['position'] = len(proper_author_list)
            proper_author_list.append(proper_name)

    doi_json['authors'] = proper_author_list
    return doi_json

In [None]:
import csv

# Batch driver: read DOIs from the first column of a CSV file, rebuild each
# record's author list, push the update through doi_session, and append the
# outcome to the report files. The `with` block closes every file even when
# an error interrupts the run.
DOI_CSV_PATH = '/Users/remoteuser/JupiterNotebooks/DMAPI/pattern_semicolon_small.csv'

with open("success_report.txt", "a+") as success_file, \
        open("failure_report.txt", "a+") as failure_file, \
        open("success_list.txt", "a+") as success_list, \
        open(DOI_CSV_PATH, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # The loop variable is not called `doi`, so the imported `doi` module
    # stays usable if earlier cells are re-run.
    for row in csv_reader:
        if not row or not row[0].strip():
            # Skip blank lines instead of failing on row[0].
            continue
        doi_id = row[0]

        print("\n\n" + doi_id)
        new_payload = get_proper_author_list(doi_id)

        # get_proper_author_list returns 0 when the DOI could not be fetched.
        if new_payload == 0:
            failure_file.write(doi_id + "\n" + json.dumps(new_payload) + " DOI NOT FOUND!!!!!!!!\n")
            continue

        print("UPDATES TO:")
        print(json.dumps(new_payload['authors'], indent=4))

        # Confirmation is currently automatic. Re-enable the prompt below to
        # review each update; the `else` branch then records rejections.
        #answer = input('Do you want to update database with new author?(y/n)')
        answer = 'y'
        if answer == 'y':
            update_req = doi_session.doi_update(new_payload)

            # An 'error' value above 202 is treated as a failed update.
            if "error" in update_req and update_req['error'] > 202:
                print("\nFAILURE:\n")
                failure_file.write(doi_id + "\n")
                failure_file.write(json.dumps(update_req))
                failure_file.write("\n\n")
                continue

            print("\nSUCCESS:\n")
            success_list.write("\n\n" + doi_id)
            success_file.write("\n\n" + doi_id)
            success_file.write("\nUPDATES TO:\n" + json.dumps(new_payload['authors'], indent=4))
        else:
            failure_file.write(doi_id + "\n")
            failure_file.write("UPDATES TO WRONG AUTHORS:\n" + json.dumps(new_payload['authors'], indent=4))
