# Filtering records using SPARQL

In [20]:
import re
import sys
import time

import pandas as pd

# The parent directory must be on the path before importing heritageconnector.
sys.path.append("..")
from heritageconnector.utils.sparql import get_sparql_results

# Wikidata SPARQL endpoint queried further down.
endpoint_url = "https://query.wikidata.org/sparql"

# Load the pickled lookup results and keep only rows whose GENDER is "M" or "F".
# NOTE(review): presumably these are the person records - confirm.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle("../GITIGNORE_DATA/lookup_result.pkl")
has_gender = df['GENDER'].isin(["M", "F"])
people_df = df.loc[has_gender]

len(people_df)

10352

In [66]:
def map_ids(ids):
    """Join entity IDs into a comma-separated string of `wd:`-prefixed terms."""
    return ", ".join(f"wd:{entity_id}" for entity_id in ids)


def map_ids_values(ids):
    """Join entity IDs into space-separated `(wd:...)` rows for a SPARQL VALUES clause."""
    return " ".join(f"(wd:{entity_id})" for entity_id in ids)

def return_labels_aliases_by_property(query_ids, property_id, include_class_tree, batch_size=None):
    """Fetch English labels and aliases for the given items that are instances of a class.

    Args:
        query_ids: iterable of entity IDs without the `wd:` prefix (e.g. "Q517").
            Duplicates are dropped before querying.
        property_id: despite its name, this is interpolated as `wd:{property_id}`
            and used as the object of `wdt:P31`, so it must be an item ID
            (e.g. "Q5"), not a property ID.
        include_class_tree: if truthy, the `/wdt:P279*` path is appended so that
            instances of subclasses of `property_id` also match.
        batch_size: optional maximum number of IDs sent per query. When None
            (the default) all IDs go into a single query. Use it when a single
            large query times out on the endpoint.

    Returns:
        DataFrame with columns `qcode`, `itemLabel.value` and `altLabel.value`,
        one row per (item, label, alias) combination. Items without an English
        alias do not appear at all, because the alias pattern in the query is
        not optional. An empty DataFrame with the same columns is returned when
        there are no IDs to query or nothing matches.

    Raises:
        ValueError: if `batch_size` is given and is smaller than 1.
    """
    columns = ['qcode', 'itemLabel.value', 'altLabel.value']

    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # De-duplicate while keeping order, so an ID can never straddle two batches.
    ids = list(dict.fromkeys(query_ids))
    if not ids:
        return pd.DataFrame(columns=columns)

    class_tree = "/wdt:P279*" if include_class_tree else ""
    step = batch_size if batch_size is not None else len(ids)

    bindings = []
    for start in range(0, len(ids), step):
        batch = ids[start:start + step]
        query = f"""
        SELECT ?item ?itemLabel ?altLabel
                WHERE
                {{
                    VALUES (?item) {{ {map_ids_values(batch)} }}
                    ?item wdt:P31{class_tree} wd:{property_id}.
                    ?item skos:altLabel ?altLabel .
                    FILTER (lang(?altLabel) = "en")

                    SERVICE wikibase:label {{
                      bd:serviceParam wikibase:language "en" .
                    }}
                }}
        GROUP BY ?item ?itemLabel ?altLabel
        """
        bindings.extend(get_sparql_results(endpoint_url, query)['results']['bindings'])

    # json_normalize([]) yields a frame without columns, so the column lookups
    # below would raise KeyError; return a well-formed empty frame instead.
    if not bindings:
        return pd.DataFrame(columns=columns)

    res_df = pd.json_normalize(bindings)
    # The item value is an entity URI; pull the QID out of it.
    res_df['qcode'] = res_df['item.value'].str.extract(r"(Q\d+)", expand=False)

    return res_df[columns]

In [62]:
# Collect every distinct ID starting with "Q" from the per-row ID collections.
# The set comprehension flattens the rows in linear time (Series.sum() would
# concatenate them pairwise, which is quadratic, and fails on an empty frame);
# sorting makes the order reproducible between runs.
qcodes_unique = sorted(
    {
        item
        for ids in people_df['res_WIKIDATA_IDs']
        for item in ids
        if str(item).startswith("Q")
    },
    key=str,
)
len(qcodes_unique)

3620

In [69]:
# NOTE: the endpoint has been observed to answer with error 500 (timeout) when
# too many records are sent at once. If that happens, query a slice such as
# qcodes_unique[0:2000] or paginate the request.
qcodes_query = qcodes_unique

# perf_counter is the clock intended for measuring elapsed time.
start = time.perf_counter()
res_df = return_labels_aliases_by_property(qcodes_query, "Q5", include_class_tree=False)
print(time.perf_counter() - start)

8.351809978485107


In [70]:
# Inspect the label/alias table returned by the query (one row per item/alias pair).
res_df

Unnamed: 0,qcode,itemLabel.value,altLabel.value
0,Q517,Napoleon,Napoleon I
1,Q517,Napoleon,Emperor of the French Napoleon I
2,Q517,Napoleon,Napoléon Bonaparte
3,Q517,Napoleon,Le Petit Caporal
4,Q517,Napoleon,Napoleone Buonaparte
...,...,...,...
9338,Q2938863,Carl Angst,Carl-Albert Angst
9339,Q2938914,Carl Frederik von Breda,Carl Fredric von Breda
9340,Q2938914,Carl Frederik von Breda,Carl Fredrik van Breda
9341,Q2938914,Carl Frederik von Breda,Carl Fredrik von Breda


In [71]:
def _unique_values_by_qcode(qcodes, column, df=None):
    """Return, for each QID in `qcodes`, the unique values of `column` in order of first appearance."""
    table = res_df if df is None else df
    qcodes = list(qcodes)

    # Filter once for all requested QIDs instead of scanning the frame per QID.
    matching = table[table['qcode'].isin(qcodes)]
    if matching.empty:
        return [[] for _ in qcodes]

    grouped = matching.groupby('qcode', sort=False)[column].unique()
    return [grouped[qcode].tolist() if qcode in grouped.index else [] for qcode in qcodes]


def get_aliases(qcodes, df=None):
    """Return one list of unique aliases per QID, in the order of `qcodes`.

    Args:
        qcodes: iterable of QIDs to look up in the `qcode` column.
        df: frame with `qcode` and `altLabel.value` columns. Defaults to the
            notebook-level `res_df`.

    Returns:
        A list with one entry per requested QID; a QID that is not present
        in the frame yields an empty list.
    """
    return _unique_values_by_qcode(qcodes, 'altLabel.value', df)


def get_labels(qcodes, df=None):
    """Return one list of unique labels per QID, in the order of `qcodes`.

    Args:
        qcodes: iterable of QIDs to look up in the `qcode` column.
        df: frame with `qcode` and `itemLabel.value` columns. Defaults to the
            notebook-level `res_df`.

    Returns:
        A list with one entry per requested QID; a QID that is not present
        in the frame yields an empty list.
    """
    return _unique_values_by_qcode(qcodes, 'itemLabel.value', df)

# Spot-check the label lookup for two QIDs.
get_labels(["Q762", "Q55021352"])

[['Leonardo da Vinci'], ['Freiherr Christian Leopold von Buch']]