# Author LCCN Download

This script talks to id.loc.gov to find the LCCN for a name and title.

This script modifies the TSV file itself, in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It creates a new column in the file `author_lccn` with the LCCN value.

In [None]:
import pandas as pd
import requests
import time
import string
import unicodedata


## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`user_agent` - this is the value put into the headers on each request; it is good practice to identify your client/project when working with open free APIs

`pause_between_req` - number of seconds to wait between each API call


In [None]:
# Path to the TSV file to process; the script overwrites this file in place.
path_to_tsv = "/Users/m/Downloads/data-tmp/major_literary_prizes-winners_judges.tsv"
# Sent as the User-Agent header on every request so the API operator can identify this client.
user_agent = 'YOUR PROJECT NAME HERE'
# Number of seconds to wait between API calls (0 disables the pause).
pause_between_req = 0


In [None]:
def add_lccn(d):
    """Look up the author's LCCN on id.loc.gov and store it on the row.

    The name "<last_name>, <given_name>" is sent to the id.loc.gov name
    suggest service.  For each candidate returned, the works that candidate
    contributed to are fetched; the first candidate with a work whose label
    contains the (normalized) winning title is accepted.

    Args:
        d: one record (a pandas row or a dict) with the keys ``full_name``,
            ``last_name``, ``given_name``, ``role`` and
            ``title_of_winning_book``; ``author_lccn`` is optional.

    Returns:
        The same record.  On a match, ``author_lccn`` is set to the last path
        segment of the candidate's URI and ``match_score`` to
        ``'id title match'``; otherwise the record is returned unchanged.
    """
    full_name = d['full_name']
    print(full_name)

    # if there is already a value skip it (an empty TSV cell loads as NaN, not str)
    if isinstance(d.get('author_lccn'), str):
        print('Skip already has lccn', d['author_lccn'], full_name)
        return d

    title = d['title_of_winning_book']
    if not isinstance(title, str):
        print('Skip - no title', full_name, title)
        return d

    if d['role'] != 'winner':
        print('Skip - not winner ', full_name, d['role'])
        return d

    if full_name == 'No Winner':
        print('Skip - No Winner', full_name)
        return d

    # A title that normalizes to nothing would be "contained" in every label
    # and produce a false match, so treat it like a missing title.
    wanted_title = normalize_string(title)
    if not wanted_title:
        print('Skip - no title', full_name, title)
        return d

    # drop any trailing commas or periods
    name = f"{d['last_name']}, {d['given_name']}".rstrip('.,')

    data = _fetch_json(
        "https://id.loc.gov/authorities/names/suggest2/",
        {'q': name, 'count': 5},
        full_name,
    )
    if data is None:
        return d
    hits = data.get('hits') if isinstance(data, dict) else None

    # check each candidate name against the titles it is listed as a contributor to
    for hit in hits or []:
        title_data = _fetch_json(
            'https://id.loc.gov/resources/works/relationships/contributorto/',
            {'page': 0, 'label': hit['aLabel']},
            full_name,
        )
        if title_data is None:
            return d

        works = title_data.get('results') if isinstance(title_data, dict) else None
        if works is None:
            continue
        # convert it to a list if it is a single result dictionary
        if not isinstance(works, list):
            works = [works]

        for work in works:
            if wanted_title in normalize_string(work['label']):
                # we found the title hit, use this one
                d['author_lccn'] = hit['uri'].split('/')[-1]
                d['match_score'] = 'id title match'

                print("Found!", title, 'in', work['label'], ' for ', full_name)
                return d

    print("No results for ", full_name)

    return d


# Seconds to wait for a response before giving up on a single request.
_REQUEST_TIMEOUT = 30


def _fetch_json(url, params, full_name):
    """GET ``url`` and return the decoded JSON body, or None if it is not JSON.

    Sleeps ``pause_between_req`` seconds after every request so that each API
    call is throttled, not only the ones that end without a match.
    ``full_name`` is used only in the decode-error message.  Network errors and
    timeouts raised by ``requests`` are not caught here.
    """
    headers = {'Accept': 'application/json', 'User-Agent': user_agent}
    r = requests.get(url, params=params, headers=headers, timeout=_REQUEST_TIMEOUT)
    time.sleep(pause_between_req)
    try:
        return r.json()
    except ValueError:
        # every JSON decode error requests can raise is a ValueError subclass
        print("JSON decode error with:", full_name)
        return None

def normalize_string(s):
    """Reduce a title or label to a loose, comparable form.

    The value is converted with ``str()``, case-folded, stripped of
    diacritics and punctuation, split on whitespace, and rejoined with single
    spaces after dropping the standalone word "the".  Only whole words are
    dropped, so "mother" or "other" are left intact and no stray spaces
    remain at either end.

    Args:
        s: any value; it is passed through ``str()`` first.

    Returns:
        The normalized string (possibly empty).
    """
    # NFD splits accented letters into base letter + combining mark (category Mn)
    decomposed = unicodedata.normalize('NFD', str(s).casefold())

    kept = []
    for ch in decomposed:
        category = unicodedata.category(ch)
        if category == 'Mn':
            continue
        # string.punctuation covers ASCII (including symbols such as $ and +);
        # the 'P' categories additionally catch curly quotes, dashes, etc.
        if ch in string.punctuation or category.startswith('P'):
            continue
        kept.append(ch)

    words = ''.join(kept).split()
    return ' '.join(word for word in words if word != 'the')





In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress can be saved as we go without
# rewriting the entire file after every single record.
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    print("Working on chunk ", idx, 'of', len(list_df))

    # if you want it to skip X number of chunks uncomment this, the number is the chunk index to skip to
    # if idx < 88:
    #     continue

    list_df[idx] = df_chunk.apply(add_lccn, axis=1)

    # Write the whole file back after each chunk so a crash loses at most one chunk of work.
    reformed_df = pd.concat(list_df)
    # index=False: otherwise the row index is written as an extra unnamed
    # column, and every rerun of the script would add another one to the TSV.
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)


