This notebook takes a CSV file as input and loops through a list of DOIs, reformatting the author names to the proper format: `last, first MI`. The parsed names are validated against the Pubs Warehouse publications web-service. If a name is not found in Pubs Warehouse, the notebook tries to find it in Peter Schweitzer's web-service. There is a third option, not yet implemented: if a name is not found with the previous two web-services, the ScienceBase people web-service can be the next tool to use. (TODO)


In [None]:
import csv
import getpass
import json
import os
import re
import sys

import requests
from IPython.display import display

sys.path.append('/Users/v1g/Projects/usgs_datatools')

from usgs_datatools import doi


In [None]:
# Credentials are taken from the environment (or prompted for) instead of being
# hard-coded, so the password is never saved with the notebook.
#   DOI_USERNAME - login name; defaults to the local development admin account.
#   DOI_PASSWORD - password; when unset it is requested interactively, without echo.
username = os.environ.get("DOI_USERNAME", "admin@usgs.gov")
password = os.environ.get("DOI_PASSWORD") or getpass.getpass("Password for %s: " % username)

In [None]:
# Create a DOI session for the 'local' environment and authenticate it with the
# username/password defined in the previous cell. `doi_session` is the object the
# later cells use to fetch (get_doi) and save (doi_update) DOI records.
doi_session = doi.DoiSession(env='local')
doi_session.doi_authenticate(username, password)

In [None]:
def search_pubs_warehouse(name, params):
    '''
    Look an author up in the Pubs Warehouse publications web-service.

    name   -- list of name strings; the first two must both occur in an author's
              'text' field for that author to count as a match.
    params -- value sent as the 'contributor' query parameter.

    Returns a dict with 'authorName' ("family, given") and, when the matched
    author carries one, 'orcId' (the last path segment of the 'orcid' value).
    Returns an empty dict when no author matches or the response is not JSON.
    '''
    PUB_URL = "https://pubs.er.usgs.gov/pubs-services/publication/"
    PUB_PARAMS = {'format': 'json', 'contributor': params}
    pub_request = requests.get(url=PUB_URL, params=PUB_PARAMS)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(pub_request.url)

    try:
        data = pub_request.json()
    except ValueError:
        # Non-JSON body (for example an HTML error page): report "no match" so
        # the caller can fall through to the next lookup service.
        return {}
    if not isinstance(data, dict):
        return {}

    for record in data.get('records') or []:
        # Records may have no contributors, or contributors without authors.
        authors = (record.get('contributors') or {}).get('authors') or []
        for author in authors:
            text = author.get('text') or ''
            if name[0] not in text or name[1] not in text:
                continue
            family = author.get('family')
            given = author.get('given')
            if not family or not given:
                # Without separate family/given parts the entry cannot be
                # rendered as "family, given"; keep looking.
                continue
            proper_name = {'authorName': family + ", " + given}
            if author.get('orcid'):
                # Keep only the identifier, i.e. the last path segment.
                proper_name['orcId'] = author['orcid'].split("/")[-1]
            return proper_name
    return {}


In [None]:
def search_peter_service(name, params):
    '''
    Look an author up in Peter's contacts web-service.

    name   -- list of name strings; the first two must both occur in a
              contact's 'cntper' value for that contact to count as a match.
    params -- list whose first two items are sent as 'givenname' and 'sn'.
              Both orderings are tried, because the caller does not know which
              string is the given name and which the surname.

    Returns a dict with 'authorName' ("last, first" plus " middle" when a
    middle name is present) and, when the contact has an 'onlink' value,
    'orcId' (its last path segment).  Returns an empty dict when no contact
    matches.
    '''
    PETER_URL = "https://geo-nsdi.er.usgs.gov/contacts.php"
    PETER_PARAMS = [{'format': 'json', 'givenname': params[0], 'sn': params[1]},
                    {'format': 'json', 'givenname': params[1], 'sn': params[0]}]

    for param_list in PETER_PARAMS:
        peter_request = requests.get(url=PETER_URL, params=param_list)
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(peter_request.url)
        try:
            data = peter_request.json()
        except ValueError:
            # Not a JSON answer for this ordering: try the next one.  Only the
            # decode error is caught, so Ctrl-C and real bugs still surface.
            continue
        if not isinstance(data, dict):
            continue

        for contact in data.get('contacts') or []:
            person = contact.get('cntperp') or {}
            full_name = person.get('cntper') or ''
            if name[0] not in full_name or name[1] not in full_name:
                continue
            parts = person.get('name') or {}
            last = parts.get('last')
            first = parts.get('first')
            if not last or not first:
                continue
            author_name = last + ", " + first
            # The middle name is optional; appending it only when present
            # avoids a KeyError and a trailing space in the result.
            if parts.get('middle'):
                author_name += " " + parts['middle']
            proper_name = {'authorName': author_name}
            if contact.get('onlink'):
                # NOTE(review): assumes 'onlink' is an ORCID URL - confirm
                # against the service's response format.
                proper_name['orcId'] = contact['onlink'].split("/")[-1]
            return proper_name

    return {}


In [None]:
def validate(name):
    '''
    Turn one raw author string into an author dict.

    The name is trimmed and split into word tokens.  Single-character tokens
    (initials) are dropped before the look-ups.  Three cases are returned
    unchanged, without calling any web-service:
      * fewer than two tokens                    -> nameType "Personal"
      * more than three tokens after dropping
        initials (possible organization)         -> nameType "Organizational"
      * fewer than two tokens after dropping
        initials                                 -> nameType "Personal"
    Otherwise the name is searched in Pubs Warehouse and, if not found there,
    in Peter's service; if neither finds it, the trimmed input is kept.

    Returns a dict with 'authorName', 'nameType' and, when a service supplied
    one, 'orcId'.
    '''
    # Names produced by splitting "A; B" on ';' carry leading blanks; trimming
    # keeps them out of the stored author name.
    name = (name or '').strip()
    # re.UNICODE keeps accented letters inside one token for Python 2 unicode
    # strings (it is already the default for str on Python 3).
    name_strings = re.findall(r'\w+', name, re.UNICODE)

    if len(name_strings) < 2:
        print("WARNING !!!!! NAME CONTAINS ONLY 1 STRNG!!!!!")
        return {'authorName': name, 'nameType': "Personal"}

    #strip the initials
    stripped_name = [string for string in name_strings if len(string) > 1]

    if len(stripped_name) > 3:
        print("WARNING !!!!! Name too long. Possible organization")
        return {'authorName': name, 'nameType': "Organizational"}

    if len(stripped_name) < 2:
        print("WARNING !!!!! Name must have at least 2 strings: family name and given name!!!!!")
        return {'authorName': name, 'nameType': "Personal"}

    # Each service receives the tokens twice: once to match against the
    # response, once (as a separate copy) as the query parameters.
    proper_name = search_pubs_warehouse(stripped_name, list(stripped_name))
    if not proper_name:
        proper_name = search_peter_service(stripped_name, list(stripped_name))
    # TODO: a ScienceBase people look-up could be tried here as a third source
    # (search_sciencebase is not implemented yet).
    if not proper_name:
        proper_name = {'authorName': name}

    proper_name['nameType'] = "Personal"
    return proper_name


In [None]:
'''
This method takes a DOI identifier, retrieves the full JSON record from DMAPI, and replaces the improper author
    entries with properly formatted authors.
    Returns the updated DOI record.
'''

import re
import requests

def get_proper_author_list(doi):
    '''
    Fetch a DOI record and rebuild its author list with validated names.

    doi -- DOI identifier passed to doi_session.get_doi().

    The record is modified in place and returned:
      * the two IPDS fields are set,
      * non-ASCII characters in 'description' and 'title' are replaced by '?',
      * 'date' and 'dateType' are both blanked when either one is empty,
      * every 'authorName' is split on ';', each piece is run through
        validate(), and the results replace 'authors' with consecutive
        'position' values starting at 0.
    '''
    doi_json = doi_session.get_doi(doi)

    #add IPDS fields
    doi_json['noPublicationIdAvailable'] = True
    doi_json['noDataReleaseAvailableReason'] = 'LEGACY_DATA'

    # Force the text fields to ASCII ('?' replaces anything else).  Decoding
    # again keeps the value a text string, so it stays JSON-serializable on
    # Python 3 instead of becoming bytes.
    for field in ('description', 'title'):
        if doi_json.get(field):
            doi_json[field] = doi_json[field].encode('ascii', 'replace').decode('ascii')

    #if date or dateType are null, make both null
    if not doi_json.get('date') or not doi_json.get('dateType'):
        doi_json['date'] = ''
        doi_json['dateType'] = ''

    proper_author_list = []
    for authors_entry in doi_json.get('authors') or []:
        raw_names = authors_entry.get('authorName') or ''
        print("IMPROPER NAME: " + raw_names)
        author_names = raw_names.split(';')
        print("SIZE: " + str(len(author_names)))

        for name in author_names:
            # A trailing ';' (or ';;') leaves blank pieces; they are not authors.
            if not name.strip():
                continue
            proper_name = validate(name)
            if not proper_name['authorName']:
                continue
            # Position is the index the author gets in the rebuilt list.
            proper_name['position'] = len(proper_author_list)
            proper_author_list.append(proper_name)

    doi_json['authors'] = proper_author_list

    return doi_json

In [None]:
'''
This code retrieves a list of DOI identifiers from a csv file and for each DOI will try to format the author list 
    and then submit an update request to DMAPI to save the new record to database and Datacite.
    Failed POSTs will append the DOI identifier to a local txt file.
'''

import csv

# Input CSV (first column holds the DOI identifier) and the output file that
# receives the identifiers whose update failed.
DOI_CSV_PATH = '/Users/v1g/Desktop/remove/pattern_semicolon.csv'
FAILED_LIST_PATH = '/Users/v1g/Desktop/remove/failed_list.txt'

# Substrings of the update response message that indicate a date problem; the
# update is retried with 'date' and 'dateType' blanked when one of them appears.
DATE_ERROR_MSGS = ["Misformatted Date", "dateType"]
MAX_DATE_RETRIES = 2

failed_list = []

with open(DOI_CSV_PATH) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    try:
        # The loop variable is not called `doi`, so the imported `doi` module
        # is still usable after this cell has run.
        for csv_row in csv_reader:
            # Skip blank lines instead of failing on csv_row[0].
            if not csv_row or not csv_row[0].strip():
                continue
            doi_id = csv_row[0]
            print("\n\n" + doi_id)
            new_payload = get_proper_author_list(doi_id)
            update_req = doi_session.doi_update(new_payload)

            for _ in range(MAX_DATE_RETRIES):
                message = update_req.get('message') or ''
                if not any(x in message for x in DATE_ERROR_MSGS):
                    break
                new_payload['date'] = ''
                new_payload['dateType'] = ''
                update_req = doi_session.doi_update(new_payload)

            print("\n\n")
            print(new_payload['authors'])
            print(update_req)

            # NOTE(review): 'error' appears to carry an HTTP-style status code,
            # with anything above 202 treated as a failure - confirm.
            if update_req['error'] > 202:
                failed_list.append(doi_id)
    finally:
        # Written even when the loop stops on an exception, so the failures
        # collected so far are not lost.
        print(failed_list)
        with open(FAILED_LIST_PATH, 'w') as txt_file:
            for failed_doi in failed_list:
                txt_file.write(failed_doi + "\n")
