# Process
In this notebook, I run a SPARQL query to retrieve all Wikidata items that have a VIAF ID. I use qwikidata, a Python wrapper for the Wikidata API.

## Import libraries

In [1]:
import pickle as pkl
import time
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from qwikidata.sparql  import return_sparql_query_results

## Run SPARQL query

In [10]:
# SPARQL text handed to qwikidata: every item that carries a VIAF ID
# statement (property P214), returned as (?item, ?viaf) pairs.
query_string = """
        SELECT DISTINCT ?item ?viaf # return QID and VIAF ID
WHERE 
{ ?item wdt:P214 ?viaf. # select all the items in WikiData that have VIAF ID

}
        """

# Wall-clock the request so the cost of this cell is visible in its output.
start = time.time()
items_with_viaf = return_sparql_query_results(query_string)
elapsed = np.round(time.time() - start, 2)
print("It took " + str(elapsed) + " seconds.")

It took 98.14 seconds.


## Process results

In [11]:
# The raw response nests its rows under results -> bindings; turn that list
# into a table (one row per binding, one column per selected variable).
sparql_rows = items_with_viaf["results"]["bindings"]
items_with_viaf = pd.DataFrame(sparql_rows)
del sparql_rows  # drop the temporary name so the raw row list is not kept alive

In [12]:
# Each cell still holds a binding dict; reduce it to its "value" entry.
# For "item" only the text after the last "/" is kept (the QID).
items_with_viaf["item"] = items_with_viaf["item"].map(
    lambda binding: binding["value"].rsplit("/", 1)[-1]
)
# For "viaf" the value is used as-is (the VIAF ID).
items_with_viaf["viaf"] = items_with_viaf["viaf"].map(
    lambda binding: binding["value"]
)

In [13]:
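# NOTE: disabled cell. It references an "ethnicity" column, which the query
# above does not select (it returns only ?item and ?viaf); presumably left
# over from an earlier query variant. Kept for reference only.
# for index, row in items_with_viaf.iterrows():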
#     try:
#         if row["ethnicity"]!="":
#             replacement = row["ethnicity"]["value"].split("/")[-1]
#             if replacement[0] == "Q":
#                 items_with_viaf.at[index, "ethnicity"] = replacement
#             else:
#                 items_with_viaf.at[index, "ethnicity"] = ""
#     except:
#         print(row["ethnicity"])

In [14]:
# Rename by source column name rather than by position: the column order
# comes from the key order of the response bindings, so a positional
# assignment would silently swap the labels if "viaf" arrived before "item".
items_with_viaf = items_with_viaf.rename(columns={"item": "QID", "viaf": "ViafID"})

In [15]:
# Preview the first five rows to sanity-check the cleaned columns.
items_with_viaf.head(n=5)

Unnamed: 0,QID,ViafID
0,Q270705,16565
1,Q289693,135150789
2,Q481146,14775085
3,Q507746,6650
4,Q568833,158264550


In [16]:
# Report how many (item, VIAF ID) rows the query returned.
n_items = len(items_with_viaf)
print(f"In WikiData, there are {n_items} items with VIAF ID.")

In WikiData, there are 2804424 items with VIAF ID.


## Save results

In [17]:
# save with Pickle
with open("data/items_with_viaf_wikidata.pkl", "wb") as output_file:
    pkl.dump(items_with_viaf, output_file)