# Filtering records using SPARQL

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")
import pandas as pd
import re
from fuzzywuzzy import fuzz
from heritageconnector.utils.sparql import get_sparql_results

# Show full cell contents and every column when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# SPARQL endpoint that the query helpers in this notebook send requests to.
endpoint_url = "https://query.wikidata.org/sparql"

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle("../GITIGNORE_DATA/lookup_result.pkl")
# Records with GENDER "M" or "F" (treated as people in this notebook).
# .copy() makes an independent frame, so columns added to it later do not
# trigger pandas' SettingWithCopyWarning.
people_df = df[df['GENDER'].isin(["M", "F"])].copy()

len(people_df)

10352

In [3]:
def map_ids(ids):
    """Render IDs as a comma-separated list of `wd:`-prefixed terms."""
    prefixed = []
    for identifier in ids:
        prefixed.append(f"wd:{identifier}")
    return ", ".join(prefixed)


def map_ids_values(ids):
    """Render IDs as space-separated `(wd:...)` rows for a SPARQL VALUES clause."""
    rows = []
    for identifier in ids:
        rows.append(f"(wd:{identifier})")
    return " ".join(rows)

def return_labels_aliases_by_property(query_ids, property_id, include_class_tree):
    """Fetch English labels and aliases for Wikidata items of a given class.

    Args:
        query_ids: iterable of item IDs (e.g. "Q762") to look up.
        property_id: ID of the class the items must be an instance of
            (e.g. "Q5"). Despite the name it is used as the object of
            `wdt:P31`, not as a property.
        include_class_tree: if True, also accept instances of subclasses
            (adds the `/wdt:P279*` path to the instance-of check).

    Returns:
        DataFrame with columns 'qcode', 'itemLabel.value' and 'altLabel.value'
        (aliases lowercased), one row per distinct (qcode, alias) pair. Items
        without an English alias are not returned, because the alias pattern
        in the query is not optional. The frame is empty when there are no IDs
        or no results.
    """
    final_cols = ['qcode', 'itemLabel.value', 'altLabel.value']

    query_ids = list(query_ids)
    if not query_ids:
        # Nothing to look up: skip the network round trip.
        return pd.DataFrame(columns=final_cols)

    class_tree = "/wdt:P279*" if include_class_tree else ""

    query = f"""
    SELECT ?item ?itemLabel ?altLabel
            WHERE
            {{
                VALUES (?item) {{ {map_ids_values(query_ids)} }}
                ?item wdt:P31{class_tree} wd:{property_id}.
                ?item skos:altLabel ?altLabel .
                FILTER (lang(?altLabel) = "en")

                SERVICE wikibase:label {{ 
                  bd:serviceParam wikibase:language "en" .
                }}
            }} 
    GROUP BY ?item ?itemLabel ?altLabel
    """
    res = get_sparql_results(endpoint_url, query)['results']['bindings']
    if not res:
        # json_normalize([]) has no 'item.value' column; return a typed empty frame instead.
        return pd.DataFrame(columns=final_cols)

    res_df = pd.json_normalize(res)
    # The item value is an entity URI; keep only the trailing Q-identifier.
    res_df['qcode'] = res_df['item.value'].apply(lambda x: re.findall(r"(Q\d+)", x)[0])
    # .copy() so the assignment below acts on an independent frame.
    res_df = res_df[final_cols].copy()
    # convert aliases to lowercase and drop duplicates
    res_df['altLabel.value'] = res_df['altLabel.value'].astype(str).str.lower()
    # De-duplicate per item: two different items may legitimately share an alias.
    res_df = res_df.drop_duplicates(subset=['qcode', 'altLabel.value'])

    return res_df

In [4]:
# Distinct candidate IDs across all people records, keeping only values that start with "Q".
# Iterating cell by cell avoids the quadratic list concatenation of Series.sum()
# and also works when people_df is empty.
# Assumes every cell of 'res_WIKIDATA_IDs' is an iterable of IDs — TODO confirm.
qcodes_unique = list({
    item
    for ids in people_df['res_WIKIDATA_IDs']
    for item in ids
    if str(item).startswith("Q")
})
len(qcodes_unique)

3620

In [5]:
# Time one label/alias query over all unique candidate IDs.
qcodes_query = qcodes_unique
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
res_df = return_labels_aliases_by_property(qcodes_query, "Q5", include_class_tree=False)
print(time.perf_counter() - start)

8.15197205543518


In [6]:
# Inspect the label/alias table returned by the query above.
res_df

Unnamed: 0,qcode,itemLabel.value,altLabel.value
0,Q762,Leonardo da Vinci,leonardi de vinci
1,Q762,Leonardo da Vinci,leonardi devinci
2,Q762,Leonardo da Vinci,leonardi di vinci
3,Q762,Leonardo da Vinci,leonardo d'avinci
4,Q762,Leonardo da Vinci,leonardo d'vinci
...,...,...,...
9359,Q1074290,Paul Gavarni,sulpice guillaume chevalier
9360,Q1074290,Paul Gavarni,sulpice-guillaume chevalier
9361,Q1074290,Paul Gavarni,sulpice-paul chevalier
9362,Q1074290,Paul Gavarni,william chevalier


In [7]:
def get_aliases(qcodes, labels_df=None):
    """Return the aliases recorded for each requested item.

    Args:
        qcodes: iterable of item IDs to look up.
        labels_df: frame with 'qcode' and 'altLabel.value' columns. Defaults to
            the notebook-level `res_df`, which keeps existing calls working.

    Returns:
        One list of aliases per entry of `qcodes`, in the same order. The list
        is empty for IDs that do not occur in the frame.
    """
    if labels_df is None:
        labels_df = res_df  # fall back to the notebook-level query result
    return [labels_df.loc[labels_df['qcode'] == qcode, 'altLabel.value'].tolist() for qcode in qcodes]

def get_labels(qcodes, labels_df=None):
    """Return the distinct labels recorded for each requested item.

    Args:
        qcodes: iterable of item IDs to look up.
        labels_df: frame with 'qcode' and 'itemLabel.value' columns. Defaults to
            the notebook-level `res_df`, which keeps existing calls working.

    Returns:
        One list of unique labels per entry of `qcodes`, in the same order. The
        list is empty for IDs that do not occur in the frame.
    """
    if labels_df is None:
        labels_df = res_df  # fall back to the notebook-level query result
    return [labels_df.loc[labels_df['qcode'] == qcode, 'itemLabel.value'].unique().tolist() for qcode in qcodes]

#get_labels(["Q762", "Q55021352"]), get_aliases(["Q762", "Q55021352"])

### Integrating into heritageconnector

In [9]:
from heritageconnector.entity_matching.filtering import Filter

# Run the library's Filter over the people records; candidate IDs are read from 'res_WIKIDATA_IDs'.
f = Filter(dataframe=people_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter for Q5; the second argument is logged as include_class_tree=False.
f.add_instanceof_filter("Q5", False)
# Label filter against PREFERRED_NAME with aliases included, scored by fuzz.token_sort_ratio.
# NOTE(review): presumably candidates scoring below threshold=90 are dropped — confirm in Filter.
f.add_label_filter("PREFERRED_NAME", threshold=90, include_aliases=True, fuzzy_match_scorer=fuzz.token_sort_ratio)
# Runs the Wikidata query and then applies the filters (see the log output below).
f.process_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Added filter {'instance_of': {'property_id': 'Q5', 'include_class_tree': False}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'include_aliases': True, 'threshold': 90, 'fuzzy_match_scorer': <function token_sort_ratio at 0x11ae477a0>}}
Running Wikidata query..


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
 16%|█▋        | 572/3476 [00:00<00:00, 5713.53it/s]

Applying filters...
Filter: instance of Q5


100%|██████████| 3476/3476 [00:00<00:00, 5904.51it/s]
  3%|▎         | 89/3326 [00:00<00:07, 433.86it/s]

Filter: check label similarity against column PREFERRED_NAME


100%|██████████| 3326/3326 [00:07<00:00, 454.54it/s]


In [10]:
# Fraction of people records that still have at least one candidate ID after filtering.
new_df = f.get_dataframe()
has_candidates = new_df['qcodes_filtered'].map(len) > 0
len(new_df[has_candidates]) / len(people_df)

0.2752125193199382

In [11]:
# Records with GENDER "N" (treated as organisations in this notebook).
# .copy() gives Filter an independent frame instead of a slice of df, which
# avoids pandas' SettingWithCopyWarning when columns are added to it.
org_df = df[df['GENDER'] == "N"].copy()
fo = Filter(dataframe=org_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter for Q43229; the second argument is logged as include_class_tree=True.
fo.add_instanceof_filter("Q43229", True)
# Label filter against PREFERRED_NAME with aliases included, scored by fuzz.token_set_ratio.
fo.add_label_filter("PREFERRED_NAME", threshold=80, include_aliases=True, fuzzy_match_scorer=fuzz.token_set_ratio)
fo.process_dataframe()
org_res_df = fo.get_dataframe()

Added filter {'instance_of': {'property_id': 'Q43229', 'include_class_tree': True}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'include_aliases': True, 'threshold': 80, 'fuzzy_match_scorer': <function token_set_ratio at 0x11ae479e0>}}
Running Wikidata query..


100%|██████████| 1142/1142 [00:00<00:00, 7947.82it/s]
 12%|█▏        | 94/807 [00:00<00:00, 939.15it/s]

Applying filters...
Filter: instance of Q43229
Filter: check label similarity against column PREFERRED_NAME


100%|██████████| 807/807 [00:00<00:00, 963.59it/s]


In [12]:
# Organisation records that kept at least one candidate ID: count, total, and fraction.
org_matched_mask = org_res_df['qcodes_filtered'].map(len) > 0
num_orgs_after_filter = len(org_res_df[org_matched_mask])
num_orgs_after_filter, len(org_df), num_orgs_after_filter / len(org_df)

(697, 7743, 0.09001678935812993)

In [13]:
# Number of organisation records left with more than one candidate qcode.
org_ambiguous_mask = org_res_df['qcodes_filtered'].map(len) > 1
len(org_res_df[org_ambiguous_mask])

0

In [14]:
# Index labels of organisation records that kept at least one candidate ID.
# NOTE(review): ids_more_strict (next cell) is built with the identical expression from the
# same org_res_df, so on a fresh Run All the two lists are equal. Presumably the filter was
# re-run with different settings between the two cells — confirm and make that step explicit.
ids_less_strict = org_res_df[(org_res_df['qcodes_filtered'].map(lambda d: len(d)) > 0)].index.tolist()

In [15]:
# Index labels of organisation records that kept at least one candidate ID.
# NOTE(review): same expression as the one used for ids_less_strict in the previous cell;
# without re-running the filter with stricter settings first, both lists are identical.
ids_more_strict = org_res_df[(org_res_df['qcodes_filtered'].map(lambda d: len(d)) > 0)].index.tolist()

In [16]:
# List the available columns before choosing which ones to display below.
org_res_df.columns

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE',
       'res_ALL_NOTES', 'res_WIKIDATA_IDs', 'res_URLS', 'qcodes_filtered'],
      dtype='object')

In [17]:
# Records present in ids_less_strict but missing from ids_more_strict.
only_less_strict = list(set(ids_less_strict) - set(ids_more_strict))
review_cols = ["PREFERRED_NAME", "NOTE", "res_WIKIDATA_IDs", "res_URLS", "qcodes_filtered"]
org_res_df.loc[only_less_strict, review_cols]

Unnamed: 0,PREFERRED_NAME,NOTE,res_WIKIDATA_IDs,res_URLS,qcodes_filtered


In [20]:
# Print the organisation filter's summary of how many records remain after filtering.
fo.view_stats()

No. records after filtering: 697/7743 (9.0%)


### Filtering with birth and death dates

In [44]:
def return_labels_aliases_by_property(query_ids, property_id, include_class_tree):
    """Fetch labels, English aliases and key years for Wikidata items of a given class.

    Note: this redefines the function of the same name declared earlier in the
    notebook, adding birth/death/inception/dissolution years and making aliases
    optional.

    Args:
        query_ids: iterable of item IDs (e.g. "Q41421") to look up.
        property_id: ID of the class the items must be an instance of
            (e.g. "Q5"). Despite the name it is used as the object of
            `wdt:P31`, not as a property.
        include_class_tree: if True, also accept instances of subclasses
            (adds the `/wdt:P279*` path to the instance-of check).

    Returns:
        DataFrame with columns 'qcode', 'itemLabel.value', 'altLabel.value',
        'birthYear.value', 'deathYear.value', 'inceptionYear.value' and
        'dissolvedYear.value', with exact duplicate rows removed. Aliases are
        lowercased, and a missing alias is the empty string. A column absent
        from every result row is filled with "". The frame is empty when there
        are no IDs or no results.
    """
    final_cols = ['qcode', 'itemLabel.value', 'altLabel.value', 'birthYear.value', 'deathYear.value', 'inceptionYear.value', 'dissolvedYear.value']

    query_ids = list(query_ids)
    if not query_ids:
        # Nothing to look up: skip the network round trip.
        return pd.DataFrame(columns=final_cols)

    class_tree = "/wdt:P279*" if include_class_tree else ""

    query = f"""
    SELECT ?item ?itemLabel ?altLabel ?birthYear ?deathYear ?inceptionYear ?dissolvedYear WHERE {{
      VALUES (?item) {{ {map_ids_values(query_ids)} }}
      ?item wdt:P31{class_tree} wd:{property_id}.
      OPTIONAL{{
        ?item wdt:P569 ?birthDate.
        BIND( year(?birthDate) AS ?birthYear )
        }}
      OPTIONAL {{
        ?item wdt:P570 ?deathDate.
        BIND( year(?deathDate) AS ?deathYear )
        }}
      OPTIONAL {{
        ?item wdt:P571 ?inceptionDate.
        BIND( year(?inceptionDate) AS ?inceptionYear )
        }}
      OPTIONAL {{
        ?item wdt:P576 ?dissolvedDate.
        BIND( year(?dissolvedDate) AS ?dissolvedYear )  
        }}
      OPTIONAL {{
        ?item skos:altLabel ?altLabel.
        FILTER (lang(?altLabel) = "en")
        }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    res = get_sparql_results(endpoint_url, query)['results']['bindings']
    if not res:
        # json_normalize([]) has no 'item.value' column; return a typed empty frame instead.
        return pd.DataFrame(columns=final_cols)

    res_df = pd.json_normalize(res)
    # The item value is an entity URI; keep only the trailing Q-identifier.
    res_df['qcode'] = res_df['item.value'].apply(lambda x: re.findall(r"(Q\d+)", x)[0])

    # Optional variables that no row bound produce no column at all; add them as blanks.
    for col in final_cols:
        if col not in res_df.columns:
            res_df[col] = ""

    # .copy() so the assignment below acts on an independent frame.
    res_df = res_df[final_cols].copy()
    # Lowercase aliases. fillna("") first so a missing alias stays blank instead of
    # becoming the literal string "nan" through astype(str).
    res_df['altLabel.value'] = res_df['altLabel.value'].fillna("").astype(str).str.lower()
    res_df = res_df.drop_duplicates()

    return res_df



In [45]:
# Spot-check on a handful of IDs; the query only returns those that are instances of Q5,
# so some of the IDs passed in may be absent from the result.
query_ids = ["Q167877", "Q46633", "Q312", "Q920016", "Q41421"]
return_labels_aliases_by_property(query_ids, "Q5", False) # humans

Unnamed: 0,qcode,itemLabel.value,altLabel.value,birthYear.value,deathYear.value,inceptionYear.value,dissolvedYear.value
0,Q41421,Michael Jordan,air jordan,1963,,,
1,Q41421,Michael Jordan,his airness,1963,,,
2,Q41421,Michael Jordan,michael jeffrey jordan,1963,,,
3,Q167877,Michael Jackson,,1942,2007.0,,
4,Q46633,Charles Babbage,,1791,1871.0,,


In [61]:
# Build a fresh Filter over the people records and call its query step directly
# to inspect the raw result table.
f = Filter(dataframe=people_df, qcode_col="res_WIKIDATA_IDs")
# NOTE(review): bare attribute access; nothing is displayed because it is not the cell's
# last expression. Looks like leftover debugging — confirm it has no needed side effect.
f.qcodes_unique
# NOTE(review): this calls an underscore-prefixed (private) method of Filter.
f._run_wikidata_query(f.qcodes_unique, instanceof_filter=True, property_id="Q5", include_class_tree=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,qcode,label,alias,birthYear.value,deathYear.value,inceptionYear.value,dissolvedYear.value
0,Q123679,Abraham-Louis Breguet,abraham louis breguet,1747,1823,,
1,Q123679,Abraham-Louis Breguet,abraham-louis bréguet,1747,1823,,
2,Q123900,Johann Jakob Scheuchzer,j.j.scheuchzer,1672,1733,,
3,Q123918,Johann Kaspar Lavater,j. c. lavater,1741,1801,,
5,Q123918,Johann Kaspar Lavater,johann caspar lavater,1741,1801,,
...,...,...,...,...,...,...,...
12306,Q65317,Godfrey Kneller,st. g. kneller,1646,1723,,
12307,Q65317,Godfrey Kneller,v.g. kneller,1646,1723,,
12308,Q65454,Erich Salomon,e. salomon,1886,1944,,
12309,Q65454,Erich Salomon,erich franz emil salomon,1886,1944,,
