# Add missing LCCN / VIAF / Wikidata

This script adds the LCCN / VIAF / Wikidata identifier and Authorized Heading Label if one of the other identifiers is populated. 

It is a requirement that each row has at least the LCCN or the VIAF identifier.

This script modifies the TSV file itself in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It creates new columns in the file (`author_lccn`, `author_viaf`, `author_wikidata`, `author_authorized_heading`) which hold the missing values.

In [None]:
import pandas as pd
import requests
import time


## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`user_agent` - this is the value put into the headers on each request. It is good practice to identify your client/project when working with free, open APIs

`pause_between_req` - number of seconds to wait between each API call

`author_viaf` - column where viaf number is stored

`author_lccn` - column where lccn  number is stored

`author_wikidata` - column where qid number is stored

`author_authorized_heading` - the column where the authorized name is stored



In [None]:
# --- Input file -------------------------------------------------------------
# TSV that is read, enriched and written back in place.
path_to_tsv = "/Users/m/Downloads/data-tmp/nyt_hardcover_fiction_bestsellers-hathitrust_metadata.tsv"

# --- HTTP behaviour ---------------------------------------------------------
# Sent as the User-Agent header on every API request.
user_agent = "USER YOUR_USER_NAME - Test Script"
# Seconds to sleep after each processed row.
pause_between_req = 0

# --- Column names in the TSV ------------------------------------------------
author_viaf = "author_viaf"
author_lccn = "author_lccn"
author_wikidata = "wikidata_qid"
author_authorized_heading = "author_authorized_heading"

# --- Runtime state ----------------------------------------------------------
# Maps an LCCN to the Wikidata QID already resolved for it during this session.
wikidata_cache = {}


In [None]:
def _viaf_id_text(value):
    """Return a VIAF id as plain text for use in URLs and SPARQL literals.

    pandas parses a numeric column that also contains blanks as floats, so an
    id read back from the TSV can arrive as ``12345.0``; strip the ``.0``.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _fetch_json(url, params=None, extra_headers=None):
    """GET *url* and return the decoded JSON body, or None on any failure.

    A non-200 status or a body that is not valid JSON is reported with a
    print and yields None so that one bad response does not abort the batch.
    """
    headers = {'User-Agent': user_agent}
    if extra_headers:
        headers.update(extra_headers)

    r = requests.get(url, params=params, headers=headers)
    if r.status_code != 200:
        print("Request failed", r.status_code, url)
        return None

    try:
        return r.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON
        print("Response was not JSON", url)
        return None


def _wikidata_qid(prop, value):
    """Return the QID of the first Wikidata item whose *prop* equals *value*.

    *prop* is a Wikidata property id (``P244`` for LCCN, ``P214`` for VIAF).
    Returns None when the query fails or has no results.
    """
    sparql = f"""
        SELECT ?item ?itemLabel
        WHERE 
        {{
        ?item wdt:{prop} "{value}".
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
    """
    data = _fetch_json(
        "https://query.wikidata.org/sparql",
        params={'query': sparql},
        extra_headers={'Accept': 'application/json'},
    )
    if data is None:
        return None

    bindings = (data.get('results') or {}).get('bindings') or []
    if not bindings:
        return None

    # the qid is the last path segment of the item URI
    return bindings[0]['item']['value'].split('/')[-1]


def add_qid(d):
    """Fill in the missing VIAF / LCCN / Wikidata / authorized heading of a row.

    Each lookup only runs when its target value is empty and the identifier
    it depends on is present, so rows that are already complete make no
    requests. A failed or empty lookup leaves the value untouched and the
    remaining lookups still run.

    Args:
        d: one row of the dataframe (as passed by ``DataFrame.apply`` with
            ``axis=1``). Any of the four configured columns that are absent
            are added with a value of None.

    Returns:
        The same row object, updated in place.
    """
    # make sure every column we read or write exists on the row
    for column in (author_wikidata, author_viaf, author_lccn, author_authorized_heading):
        if column not in d:
            d[column] = None

    # does it have LCCN but no VIAF
    if pd.isnull(d[author_viaf]) and isinstance(d[author_lccn], str):
        print("Try to get VIAF from LCCN", d[author_lccn])

        url = f"https://viaf.org/viaf/sourceID/LC%7C{d[author_lccn]}"
        # the answer is carried by the redirect target, so do not follow it
        r = requests.get(url, headers={'User-Agent': user_agent}, allow_redirects=False)
        location = r.headers.get('Location')
        if r.status_code != 404 and location:
            viaf = location.rstrip('/').split('/')[-1]
            if viaf:
                print("Found VIAF viaf LCCN", viaf)
                d[author_viaf] = viaf
        # An LCCN unknown to VIAF may still resolve at Wikidata or id.loc.gov,
        # so fall through to the other lookups instead of returning here.

    # does it have VIAF but no LCCN
    if not pd.isnull(d[author_viaf]) and pd.isnull(d[author_lccn]):
        print("Try to get LCCN from VIAF", d[author_viaf])
        viaf_id = _viaf_id_text(d[author_viaf])
        data = _fetch_json(f"https://www.viaf.org/viaf/{viaf_id}/?httpAccept=application/json")
        if data is not None:
            sources = (data.get('sources') or {}).get('source') or []
            # a cluster with a single source is not wrapped in a list
            if not isinstance(sources, list):
                sources = [sources]

            for source in sources:
                text = source.get('#text', '')
                if text.startswith('LC|'):
                    lccn = text.replace(' ', '').split('|')[1]
                    print("FOOND LC!", lccn)
                    d[author_lccn] = lccn

    # does it not have a wikidata but it has a lccn?
    if pd.isnull(d[author_wikidata]) and not pd.isnull(d[author_lccn]):
        if d[author_lccn] in wikidata_cache:
            d[author_wikidata] = wikidata_cache[d[author_lccn]]
        else:
            qid = _wikidata_qid('P244', d[author_lccn])
            if qid is not None:
                d[author_wikidata] = qid
                wikidata_cache[d[author_lccn]] = qid
                print("Found wikidata via LCCN", d[author_wikidata])

    # does it not have a wikidata but it has a viaf?
    if pd.isnull(d[author_wikidata]) and not pd.isnull(d[author_viaf]):
        qid = _wikidata_qid('P214', _viaf_id_text(d[author_viaf]))
        if qid is not None:
            d[author_wikidata] = qid
            print("Found wikidata via VIAF", d[author_wikidata])

    # does not have the authorized heading but it has a LCCN?
    if pd.isnull(d[author_authorized_heading]) and not pd.isnull(d[author_lccn]):
        data = _fetch_json(f"https://id.loc.gov/authorities/names/suggest2/?q={d[author_lccn]}")
        if data is not None:
            hits = data.get('hits') or []
            if data.get('count') == 0 or not hits:
                # nothing to read the label from; leave the heading empty
                print("Bad LCCN", d[author_lccn])
            else:
                d[author_authorized_heading] = hits[0]['aLabel']

    # does not have the authorized heading but it has a VIAF?
    if pd.isnull(d[author_authorized_heading]) and not pd.isnull(d[author_viaf]):
        viaf_id = _viaf_id_text(d[author_viaf])
        data = _fetch_json(f"https://viaf.org/viaf/{viaf_id}/viaf.json")

        # if the cluster is redirecting reload the new destination
        if data is not None and 'redirect' in data:
            data = _fetch_json(f"https://viaf.org/viaf/{data['redirect']['directto']}/viaf.json")

        if data is not None:
            # sometimes the record is nested under this key
            if 'scavenged' in data:
                data = data['scavenged']['VIAFCluster']

            headings = (data.get('mainHeadings') or {}).get('data') or []
            if not isinstance(headings, list):
                headings = [headings]

            if headings:
                d[author_authorized_heading] = headings[0]['text']

    time.sleep(pause_between_req)

    return d

In [None]:
import os

# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)


# Split the dataframe into chunks so progress can be saved as we go without
# rewriting the entire file on every single record.
n = 100  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    # if you want it to skip X number of chunks uncomment this, the number is the chunk to skip to
    # if idx < 10:
    #     continue

    print("Working on chunk ", idx, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(add_qid, axis=1)

    # Save everything processed so far. index=False keeps the row index out
    # of the file; otherwise every save/reload cycle adds another
    # "Unnamed: 0" column.
    reformed_df = pd.concat(list_df)

    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot leave path_to_tsv half-written.
    tmp_path = path_to_tsv + '.tmp'
    reformed_df.to_csv(tmp_path, sep='\t', index=False)
    os.replace(tmp_path, path_to_tsv)




In [None]:
# Reload the enriched file and report how many rows carry each identifier.
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

total_rows = len(df.index)

# (message, columns of which at least one must be non-null for a row to count)
reports = [
    ('Number of rows with one identifier populated:', [author_viaf, author_lccn, author_wikidata]),
    ('Number of rows with author_viaf populated:', [author_viaf]),
    ('Number of rows with author_lccn populated:', [author_lccn]),
    ('Number of rows with author_wikidata populated:', [author_wikidata]),
]

for label, columns in reports:
    # drop only the rows where every listed column is empty
    df_dropped = df.dropna(subset=columns, how='all')
    kept = len(df_dropped.index)
    print(label, kept, 'out of ', total_rows, kept / total_rows * 100)

