# Wikidata Download - Reconcile Publisher

This script looks up each publisher in the Wikidata API and adds its Wikidata Q number (QID) to a TSV file.

This script modifies the TSV file itself in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It expects there to be a publisher name column; you can extract that from the MARC with the script `parse_marc_add_publisher.ipynb`

It creates two new columns in the file: `wikidata_publisher_qid`, which holds the Wikidata QID, and `wikidata_publisher_name`, which holds the label of the matched Wikidata item.

In [None]:
import pandas as pd
import requests
import time
import string
import unicodedata

## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`publisher_name_column` - the name of the column header that contains publisher name

`user_agent` - this is the value put into the headers on each request; it is good practice to identify your client/project when working with open, free APIs

`pause_between_req` - number of seconds to wait between each API call



In [None]:
# TSV file that is read, enriched and overwritten in place.
path_to_tsv = '/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv'

# Header of the column holding the publisher name to reconcile.
publisher_name_column = 'publisher_marc'

# Sent as the User-Agent header on every API request.
user_agent = "USER YOUR_USER_NAME - Test Script"

# Seconds passed to time.sleep() after a lookup.
pause_between_req = 0

# In-memory lookup results keyed by publisher name: the matched candidate
# dict, or None when no match was found.
cache = dict()

In [None]:
# "instance of" (P31) labels that mark a Wikidata item as a publisher.
_PUBLISHER_INSTANCE_LABELS = frozenset(['publisher', 'book publisher', 'imprint'])

# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would hang the whole run.
_REQUEST_TIMEOUT = 60


def _search_qids(publisher):
    """Full-text search Wikidata for `publisher`; return up to ten QIDs."""
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            'action': 'query',
            'srsearch': publisher,
            'format': 'json',
            'list': 'search',
            'srlimit': '10',
        },
        headers={'Accept': 'application/json', 'User-Agent': user_agent},
        timeout=_REQUEST_TIMEOUT,
    )
    # Fail loudly on HTTP errors instead of choking on a non-JSON body.
    response.raise_for_status()
    return [hit['title'] for hit in response.json()['query']['search']]


def _fetch_candidates(qids):
    """Fetch label and "instance of" labels for `qids` via SPARQL.

    Returns a dict keyed by QID; each value is a candidate dict with the
    keys 'qid', 'score' (starts at 100), 'score_log', 'instance' and 'name'.
    Items without any P31 statement are not returned by the query.
    """
    values = " ".join(f'wd:{q}' for q in qids)
    sparql = f"""
        SELECT ?item ?itemLabel ?instance ?instanceLabel
        WHERE 
        {{

            VALUES ?item {{ {values}}}

            ?item wdt:P31 ?instance.
          
            # optional{{
            #   ?item wdt:P106 ?occupation.
            # }}
            # optional{{
            #   ?item wdt:P569 ?birth.
            # }}
            # optional{{
            #   ?item wdt:P570 ?death.
            # }}          
            # optional{{
            #   ?item wdt:P166 ?award.
            # }}    
            # optional{{
            #   ?item wdt:P214 ?viaf.
            # }}              
            # optional{{
            #   ?item wdt:P244 ?lccn.
            # }}
            # optional{{
            #   ?item wdt:P69 ?education.
            # }}


            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
    """
    response = requests.get(
        "https://query.wikidata.org/sparql",
        params={'query': sparql},
        headers={'Accept': 'application/json', 'User-Agent': user_agent},
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()

    candidates = {}
    for result in response.json()['results']['bindings']:
        # The item value is an entity URI; the QID is its last path segment.
        qid = result['item']['value'].split('/')[-1]
        if qid not in candidates:
            candidates[qid] = {
                'qid': qid,
                'score': 100,
                'score_log': [],
                'instance': [],
                'name': result['itemLabel']['value'],
            }
        if 'instanceLabel' in result:
            label = result['instanceLabel']['value']
            # One binding per P31 value, so the same label can repeat.
            if label not in candidates[qid]['instance']:
                candidates[qid]['instance'].append(label)
    return candidates


def _pick_best_publisher(candidates, publisher):
    """Score `candidates` against `publisher`; return the best one or None.

    Each candidate loses one point per edit between its normalized label and
    the normalized publisher name. Candidates that are not an instance of a
    publisher type are pushed to -1000 so they can never win. Only a
    positive score counts as a match; ties go to the first candidate.
    """
    target = normalize_string(publisher)
    best, best_score = None, 0
    for candidate in candidates.values():
        if _PUBLISHER_INSTANCE_LABELS.isdisjoint(candidate['instance']):
            candidate['score'] = -1000
        candidate['score'] -= levenshtein(normalize_string(candidate['name']), target)
        if candidate['score'] > best_score:
            best, best_score = candidate, candidate['score']
    return best


def add_qid(d):
    """Add the matching Wikidata publisher QID and label to one row.

    `d` is a single row (a pandas Series when called through
    `DataFrame.apply(..., axis=1)`). The publisher name is read from
    `d[publisher_name_column]`. On a match the row gains
    `wikidata_publisher_qid` and `wikidata_publisher_name`; otherwise it is
    left untouched. The row is returned in every case.

    Rows that already hold a string QID, or that have no publisher name, are
    skipped without a request. Every lookup result, including "no match",
    is stored in the module-level `cache` so a publisher name is only
    queried once per run.

    Raises whatever `requests` raises on a timeout, connection failure or
    HTTP error status, so a failed run stops and can be rerun.
    """
    # Already reconciled on a previous run. isinstance (not type() ==) so
    # str subclasses such as numpy.str_ are recognised as well.
    if 'wikidata_publisher_qid' in d and isinstance(d['wikidata_publisher_qid'], str):
        # print('Skip',d[id_column_name])
        return d

    publisher = d[publisher_name_column]
    if pd.isnull(publisher):
        return d

    if publisher in cache:
        cached = cache[publisher]
        if cached is not None:
            d['wikidata_publisher_name'] = cached['name']
            d['wikidata_publisher_qid'] = cached['qid']
        return d

    # Use the full-text search first, then inspect the hits with SPARQL.
    qids = _search_qids(publisher)
    best_publisher = None
    if qids:
        best_publisher = _pick_best_publisher(_fetch_candidates(qids), publisher)

    # Remember misses too (including "search returned nothing") so the same
    # name is not sent to the API again for every row that carries it.
    cache[publisher] = best_publisher

    if best_publisher is not None:
        d['wikidata_publisher_name'] = best_publisher['name']
        d['wikidata_publisher_qid'] = best_publisher['qid']
        print("For ",d[publisher_name_column], 'Best match is:', best_publisher['name'], "Cache:", len(cache))

    # Throttle after every lookup that hit the network.
    time.sleep(pause_between_req)

    return d

def normalize_string(s):
    """Return a simplified form of `s` for fuzzy name comparison.

    The value is converted with `str()`, ASCII punctuation is removed, the
    text is case-folded, combining accent marks are stripped, the article
    "the" is dropped and runs of whitespace collapse to single spaces.

    "the" is removed only as a whole word, so names that merely contain
    those letters ("Heather", "Lothian") are left intact.
    """
    text = str(s).translate(str.maketrans('', '', string.punctuation))
    # casefold() already covers lower(), so a separate lower() is not needed.
    text = text.casefold()
    # Decompose accented characters and drop the combining marks (category Mn).
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    # Splitting after the other steps also collapses any leftover whitespace.
    return ' '.join(word for word in text.split() if word != 'the')

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between `s1` and `s2`.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions that turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # Walk the longer sequence in the outer loop so rows are sized by the
    # shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    # Row 0: distance from the empty prefix to each prefix of `shorter`.
    prev_row = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        row = [row_no]
        for col, ch_short in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = row[col] + 1
            cost_substitute = prev_row[col] + (0 if ch_long == ch_short else 1)
            row.append(min(cost_insert, cost_delete, cost_substitute))
        prev_row = row

    return prev_row[-1]


In [None]:
# Load the TSV.
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress is saved as we go, without
# rewriting the entire file after every single record.
n = 500  # chunk row size
list_df = [df[i:i+n] for i in range(0, df.shape[0], n)]

# Process each chunk, then write the whole table back to disk.
for idx, df_chunk in enumerate(list_df):

    print("Working on chunk ", idx, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(add_qid, axis=1)

    # index=False: otherwise the row index is written as an extra column and
    # every rerun of the script adds another "Unnamed: 0" column to the file.
    reformed_df = pd.concat(list_df)
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)


