# Filtering records using SPARQL

In [20]:
import sys
sys.path.append("..")
import pandas as pd
import re
from heritageconnector.utils.sparql import get_sparql_results

endpoint_url = "https://query.wikidata.org/sparql"

df = pd.read_pickle("../GITIGNORE_DATA/lookup_result.pkl")
people_df = df[df['GENDER'].isin(["M", "F"])]

len(people_df)

10352

In [33]:
map_ids = lambda ids: ", ".join([f"wd:{i}" for i in ids])

def return_labels_aliases_by_property(query_ids, property_id, include_class_tree):
    
    class_tree = "/wdt:P279*" if include_class_tree else ""
        
    query = f"""
    SELECT ?item ?itemLabel ?altLabel
            WHERE
            {{
                ?item wdt:P31{class_tree} wd:{property_id}.
                ?item skos:altLabel ?altLabel .
                FILTER (lang(?altLabel) = "en")
                FILTER (?item IN ({map_ids(query_ids)}))

                SERVICE wikibase:label {{ 
                  bd:serviceParam wikibase:language "en" .
                }}
            }} 
    GROUP BY ?item ?itemLabel ?altLabel
    """

    res = get_sparql_results(endpoint_url, query)['results']['bindings']
    
    res_df = pd.json_normalize(res)
    res_df['qcode'] = res_df['item.value'].apply(lambda x: re.findall(r"(Q\d+)", x)[0])
    res_df = res_df[['qcode', 'itemLabel.value', 'altLabel.value']]
    
    return res_df

In [34]:
qcodes_unique = [item for item in list(set(people_df['res_WIKIDATA_IDs'].sum())) if str(item).startswith("Q")]
len(qcodes_unique)

3620

In [35]:
# error 500: timeout if too many records. need to paginate
qcodes_query = qcodes_unique[0:2000]
import time
start = time.time()
res_df = return_labels_aliases_by_property(qcodes_query, "Q5", include_class_tree=False)
print(time.time() - start)

18.83329677581787


In [36]:
res_df

Unnamed: 0,qcode,itemLabel.value,altLabel.value
0,Q675,André-Marie Ampère,Andre-Marie Ampere
1,Q762,Leonardo da Vinci,Vinci
2,Q762,Leonardo da Vinci,Leonardo
3,Q762,Leonardo da Vinci,Leonard
4,Q762,Leonardo da Vinci,Leonardo Da Vinci
...,...,...,...
4356,Q55008046,Albert I of Belgium,King Albert I
4357,Q55008046,Albert I of Belgium,King Albert I of Belgium
4358,Q55021352,Freiherr Christian Leopold von Buch,Buch
4359,Q55021352,Freiherr Christian Leopold von Buch,Christian Leopold von Buch


In [41]:
def get_aliases(qcodes):
    return [res_df.loc[res_df['qcode'] == qcode, 'altLabel.value'].unique().tolist() for qcode in qcodes]

def get_labels(qcodes):
    return [res_df.loc[res_df['qcode'] == qcode, 'itemLabel.value'].unique().tolist() for qcode in qcodes]

get_labels(["Q762", "Q55021352"])

[['Leonardo da Vinci'], ['Freiherr Christian Leopold von Buch']]