# Author VIAF Download

This script will talk to viaf.org to find the VIAF for an authorized author name heading. 

You must have a full name heading from the MARC record, such as field 100. You can add these from HathiTrust data by running the `parse_hathi_add_auth_name` script.

Name string matching can be error prone. If you have work titles in your dataset, make sure to configure `use_title_reconcilation` and `title_column` so that the work title is used in the reconciliation process; this gives better results.

This script modifies the TSV file itself in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It creates a new column in the file `author_viaf` with the VIAF ID.

In [None]:
import pandas as pd
import requests
import time
import string
import unicodedata


## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`id_column_name` - the name of the column header that contains authorized author heading value

`user_agent` - this is the value put into the headers on each request; it is good practice to identify your client/project when working with open, free APIs

`pause_between_req` - number of seconds to wait between each API call

`use_title_reconcilation` - boolean true/false to use a title to help reconcile non-exact matches, if you have a title in your dataset set this to true to get better results

`title_column` - the name of the column that has the title to use

In [None]:
# --- Input data -------------------------------------------------------------
# TSV file to process; the driver cell reads it and writes results back to it.
path_to_tsv = "/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv"
# Column holding the authorized author heading used for the VIAF search.
id_column_name = "author_marc"
# Column holding the work title used for title-based reconciliation.
title_column = "shorttitle"

# --- Reconciliation behaviour -----------------------------------------------
# When True, titles are compared if the name heading alone does not match.
use_title_reconcilation = True

# --- HTTP settings ----------------------------------------------------------
# Sent as the User-Agent header on every request.
user_agent = 'YOUR PROJECT NAME HERE'
# Seconds to wait between API calls.
pause_between_req = 1

# Module-level dictionary, initially empty.
cache = {}

In [None]:
def _as_list(value):
    """Return *value* unchanged if it is a list, otherwise wrap it in a list."""
    return value if isinstance(value, list) else [value]


def _fetch_json(url, params, headers, timeout=30):
    """GET *url* and return the decoded JSON body, or None if it is not JSON.

    Sleeps ``pause_between_req`` seconds after every request so that each
    API call is throttled, whatever the caller does with the response.
    Network errors raised by ``requests`` (including timeouts) propagate.
    """
    r = requests.get(url, params=params, headers=headers, timeout=timeout)
    time.sleep(pause_between_req)
    try:
        return r.json()
    except ValueError:
        # the JSON decode errors raised by requests are ValueError subclasses
        return None


def _heading_texts(record_data):
    """Return the normalized main heading strings of one VIAF record."""
    headings = _as_list(record_data['mainHeadings']['data'])
    return [normalize_string(h['text']) for h in headings]


def _match_by_heading(records, name_norm):
    """Return the viafID of the first record whose heading matches, else None.

    An exact (normalized) match on any record wins. If there is none and the
    name ends in a birth year without a death year, a heading that starts
    with the name is accepted instead.
    """
    for record in records:
        record_data = record['record']['recordData']
        if name_norm in _heading_texts(record_data):
            return record_data['viafID']

    # last four characters are digits but the last eight are not:
    # a birth date with no death date, so allow a prefix match
    if name_norm[-4:].isdigit() and not name_norm[-8:].isdigit():
        for record in records:
            record_data = record['record']['recordData']
            if any(t.startswith(name_norm) for t in _heading_texts(record_data)):
                return record_data['viafID']

    return None


def _match_by_title(records, title, heading):
    """Return the viafID of the first record with a work matching *title*.

    Titles listed on the VIAF records are checked first; after that, for each
    record with an ``LC|`` source, id.loc.gov is asked for titles to compare.
    *heading* is only used in the error message. Returns None when nothing
    matches or when *title* is missing/empty.
    """
    if title is None or pd.isnull(title):
        # a NaN title would normalize to "nan" and match unrelated works
        return None
    title_norm = normalize_string(title).strip()
    if not title_norm:
        # an empty string is a substring of everything
        return None

    for record in records:
        record_data = record['record']['recordData']
        titles = record_data.get('titles')
        if titles is None:
            continue
        for work in _as_list(titles['work']):
            if title_norm in normalize_string(work['title']):
                return record_data['viafID']

    # no match on the VIAF titles; if a record has a LCCN then ask
    # id.loc.gov for some titles to compare to
    loc_url = 'https://id.loc.gov/resources/works/relationships/contributorto/'
    loc_headers = {'Accept': 'application/json', 'User-Agent': user_agent}
    for record in records:
        record_data = record['record']['recordData']
        for source in _as_list(record_data['sources']['source']):
            source_text = source['#text']
            if source_text[0:3] != 'LC|':
                continue

            lccn = source_text[3:].replace(' ', '').strip()
            title_data = _fetch_json(loc_url, {'page': 0, 'label': lccn}, loc_headers)
            if title_data is None:
                print("JSON decode error with:", heading)
                continue

            results = title_data.get('results') if isinstance(title_data, dict) else None
            if results is None:
                continue
            # a single result may arrive as a dictionary instead of a list
            for title_hit in _as_list(results):
                if title_norm in normalize_string(title_hit['label']):
                    return record_data['viafID']

    return None


def add_viaf(d):
    """Look up the VIAF ID for the author heading in row *d*.

    *d* is one row of the TSV (as passed by ``DataFrame.apply(axis=1)``).
    The row is returned unchanged when it has no usable heading, already has
    an ``author_lccn`` string, or already has a non-null ``author_viaf``.
    Otherwise viaf.org is searched for the heading and, on a match, the ID is
    stored in ``d['author_viaf']``. Matching is tried in this order: exact
    heading, birth-year prefix, then (if ``use_title_reconcilation``) titles
    from VIAF and id.loc.gov.

    Reads the module-level settings ``id_column_name``, ``title_column``,
    ``use_title_reconcilation``, ``user_agent`` and ``pause_between_req``.
    Always returns *d*.
    """
    heading = d[id_column_name]
    if not isinstance(heading, str) or not heading:
        # no heading to use, skip
        return d

    # if it has a LCCN then skip
    if isinstance(d.get('author_lccn'), str):
        return d

    # if there is already a value skip it
    if 'author_viaf' in d and not pd.isnull(d['author_viaf']):
        print('Skip', heading, d['author_viaf'])
        return d

    # drop a single trailing comma or period
    name = heading[:-1] if heading.endswith(('.', ',')) else heading
    if not name:
        return d

    params = {
        'query': f'local.personalNames = "{name}"',
        'maximumRecords': 10,
        'startRecord': 1,
        'sortKeys': 'holdingscount',
        'httpAccept': 'application/json',
    }
    data = _fetch_json("https://viaf.org/viaf/search", params, {'User-Agent': user_agent})
    if data is None:
        print("JSON decode error with:", heading)
        return d

    response = data['searchRetrieveResponse']
    if response['numberOfRecords'] in ('0', 0):
        # no results found
        return d
    records = response['records']

    viaf_id = _match_by_heading(records, normalize_string(name))
    if viaf_id is None and use_title_reconcilation:
        viaf_id = _match_by_title(records, d.get(title_column), heading)

    if viaf_id is None:
        print("No results for ", heading)
        return d

    d['author_viaf'] = viaf_id
    return d

def normalize_string(s):
    """Return a simplified form of *s* for loose string comparison.

    The value is converted with ``str()``, ASCII punctuation is deleted
    (so ``1816-1855`` becomes ``18161855``), the text is case-folded,
    combining accent marks are stripped, the standalone word ``the`` is
    dropped and runs of whitespace are collapsed to single spaces.
    """
    text = str(s).translate(str.maketrans('', '', string.punctuation))
    text = text.casefold()
    # decompose accented characters and drop the combining marks
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    # remove "the" only as a whole word; removing the raw substring mangles
    # words such as "mother" and leaves stray spaces behind
    return ' '.join(word for word in text.split() if word != 'the')




In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress can be saved as we go without
# rewriting the entire file after every single row.
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    print("Working on chunk ", idx, 'of', len(list_df))

    # if you want it to skip X number of chunks uncomment this
    # if idx < 88:
    #     continue

    # add_viaf returns each row, with 'author_viaf' set when a match is found
    list_df[idx] = df_chunk.apply(add_viaf, axis=1)

    # Save everything processed so far. index=False keeps the row index out
    # of the file; otherwise every re-run would read it back as an extra
    # "Unnamed: 0" column and write yet another one.
    reformed_df = pd.concat(list_df)
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)




In [None]:
# This last block just does QA on the data to see which rows, if any, were not populated

# df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)
# print("There are ", df['hathi_marc'].isnull().any().sum(), 'rows with no hathi_marc column populated, here are their', id_column_name, 'values:')
# res = df.loc[df['hathi_marc'].isnull(), id_column_name].tolist()
# print(print(res))


