# Wikidata Download Qid

This script adds the Wikidata Q number (QID), looked up through the Wikidata query API, to a TSV file.

It is a requirement that you have the LCCN number in the data

This script modifies the TSV file itself in batches. Should the script time out or hit another error, you can rerun it and it will pick up where it left off. Always run it on a backup of your original data files.

It creates a new column in the file, `wikidata_qid`, which holds the Wikidata QID (for example `Q42`) matched to the row's LCCN.

In [None]:
import pandas as pd
import requests
import time


## Config
Set these variables below based on your setup

`path_to_tsv` - the path to the TSV file you want to run it on

`id_column_name` - the name of the column header that contains the lccn

`user_agent` - this is the value put into the headers on each request; it is good practice to identify your client/project when working with open, free APIs

`pause_between_req` - number of seconds to wait between each API call



In [None]:
# TSV file that is loaded and then overwritten in place after every chunk.
path_to_tsv = "/Users/m/Downloads/data-tmp/hathitrust_post45fiction_metadata.tsv"
# Column whose value is matched against Wikidata property P244 in add_qid.
id_column_name = "author_lccn"
# Sent as the User-Agent header on every request to the query service.
user_agent = 'USER thisismattmiller - Test Script'
# Seconds passed to time.sleep() after each request; 0 means no pause.
pause_between_req = 0


In [None]:
def add_qid(d):
    """Look up the Wikidata QID for one row's LCCN and store it on the row.

    The LCCN is read from ``d[id_column_name]`` and matched against Wikidata
    property P244 through the public SPARQL endpoint. When at least one item
    matches, the QID of the first match is written to ``d['wikidata_qid']``.

    Args:
        d: One record (a pandas row Series, or any mapping) containing the
            column named by ``id_column_name``.

    Returns:
        The same ``d`` object, with ``wikidata_qid`` set when a match was found.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the service answers with an HTTP error status.
    """
    # if there is already a value skip it
    if 'wikidata_qid' in d and isinstance(d['wikidata_qid'], str):
        print('Skip', d[id_column_name])
        return d

    lccn = d[id_column_name]

    # A blank cell is loaded by pandas as NaN; querying for the literal "nan"
    # can never match, so do not spend a request on it.
    if pd.isna(lccn):
        print('Skip no LCCN')
        return d

    # Escape backslashes and double quotes so an odd value cannot break out of
    # the SPARQL string literal.
    lccn_literal = str(lccn).replace('\\', '\\\\').replace('"', '\\"')

    sparql = f"""
        SELECT ?item ?itemLabel
        WHERE 
        {{
        ?item wdt:P244 "{lccn_literal}".
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
    """

    params = {
        'query': sparql,
    }
    headers = {
        'Accept': 'application/json',
        'User-Agent': user_agent,
    }
    url = "https://query.wikidata.org/sparql"

    # Without a timeout a stalled connection would hang the whole run forever.
    r = requests.get(url, params=params, headers=headers, timeout=60)
    # Fail loudly on throttling / server errors instead of on a confusing JSON
    # decode error; the run can simply be restarted and will resume.
    r.raise_for_status()

    bindings = r.json()['results']['bindings']

    # did we get any results
    if bindings:
        # the qid is the last path segment of the item URI
        d['wikidata_qid'] = bindings[0]['item']['value'].rsplit('/', 1)[-1]

    time.sleep(pause_between_req)

    return d

In [None]:
# load the tsv
df = pd.read_csv(path_to_tsv, sep='\t', header=0, low_memory=False)

# Split the dataframe into chunks so progress can be saved as we go without
# rewriting the entire file after every single record.
n = 100  # chunk row size
list_df = [df[i:i + n] for i in range(0, df.shape[0], n)]

# loop through each chunk
for idx, df_chunk in enumerate(list_df):

    print("Working on chunk ", idx + 1, 'of', len(list_df))
    list_df[idx] = df_chunk.apply(add_qid, axis=1)

    # Re-assemble processed and not-yet-processed chunks and save the progress.
    reformed_df = pd.concat(list_df)
    # index=False: otherwise the row index is written as an extra unnamed
    # column, and every rerun would add another "Unnamed: 0" column to the file.
    reformed_df.to_csv(path_to_tsv, sep='\t', index=False)


