# Filtering records using SPARQL

In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import sys
sys.path.append("..")
import pandas as pd
import re
from fuzzywuzzy import fuzz
from heritageconnector.utils.sparql import get_sparql_results

# Show full cell contents and every column when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# SPARQL endpoint that the queries in this notebook are sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle("../GITIGNORE_DATA/lookup_result.pkl")
# Rows with GENDER "M" or "F" are used as the people subset. Take an explicit copy:
# a boolean-mask selection is flagged by pandas as a slice of `df`, and setting
# columns on it later raises SettingWithCopyWarning (visible in the Filter cell output).
people_df = df[df['GENDER'].isin(["M", "F"])].copy()

len(people_df)

10352

In [18]:
def map_ids(ids):
    """Join IDs into a comma-separated list of `wd:`-prefixed terms, e.g. "wd:Q1, wd:Q2"."""
    return ", ".join(f"wd:{i}" for i in ids)


def map_ids_values(ids):
    """Join IDs into space-separated, parenthesised `wd:` terms for a SPARQL VALUES clause, e.g. "(wd:Q1) (wd:Q2)"."""
    return " ".join(f"(wd:{i})" for i in ids)

def return_labels_aliases_by_property(query_ids, property_id, include_class_tree):
    """
    Fetch the English label and English aliases of the given Wikidata items,
    restricted to items that are an instance of a given class.

    Args:
        query_ids: iterable of item IDs without prefix (e.g. "Q762"); each one is
            placed in the query's VALUES clause as `(wd:<id>)`.
        property_id: ID interpolated as `wd:{property_id}` and used as the target of
            `wdt:P31` (instance of). Despite the parameter name, this is a class
            item ID such as "Q5".
        include_class_tree: if True, `/wdt:P279*` is appended to the P31 path so that
            instances of any subclass of `property_id` also match.

    Returns:
        pd.DataFrame with columns `qcode`, `itemLabel.value` and `altLabel.value`
        (aliases lower-cased), one row per distinct (qcode, alias) pair. When the
        query returns no bindings an empty frame with these columns is returned.

    Note:
        The alias triple in the query is mandatory (not OPTIONAL), so an item that
        has no English alias does not appear in the result at all.
    """
    output_columns = ['qcode', 'itemLabel.value', 'altLabel.value']
    class_tree = "/wdt:P279*" if include_class_tree else ""

    query = f"""
    SELECT ?item ?itemLabel ?altLabel
            WHERE
            {{
                VALUES (?item) {{ {map_ids_values(query_ids)} }}
                ?item wdt:P31{class_tree} wd:{property_id}.
                ?item skos:altLabel ?altLabel .
                FILTER (lang(?altLabel) = "en")

                SERVICE wikibase:label {{ 
                  bd:serviceParam wikibase:language "en" .
                }}
            }} 
    GROUP BY ?item ?itemLabel ?altLabel
    """
    bindings = get_sparql_results(endpoint_url, query)['results']['bindings']

    # json_normalize([]) yields a frame without columns, which would raise a KeyError
    # on 'item.value' below; return a well-formed empty frame instead.
    if not bindings:
        return pd.DataFrame(columns=output_columns)

    res_df = pd.json_normalize(bindings)
    # The item value is an entity URI; keep only the trailing Q-identifier.
    res_df['qcode'] = res_df['item.value'].apply(lambda uri: re.findall(r"(Q\d+)", uri)[0])
    # Explicit copy so the column assignment below does not act on a slice.
    res_df = res_df[output_columns].copy()
    # Convert aliases to lowercase and drop duplicates *per item*: de-duplicating on
    # the alias alone would discard an alias shared by two different items.
    res_df['altLabel.value'] = res_df['altLabel.value'].astype(str).str.lower()
    res_df = res_df.drop_duplicates(subset=['qcode', 'altLabel.value'])

    return res_df

In [19]:
# Flatten the per-record lists of candidate IDs and keep each Q-identifier once.
# explode() + unique() is linear and keeps first-seen order, whereas summing the
# lists concatenates them pairwise and set() iterates in arbitrary order.
# Records with an empty list explode to NaN, which the "Q" prefix check discards.
qcodes_unique = [
    item
    for item in people_df['res_WIKIDATA_IDs'].explode().unique()
    if str(item).startswith("Q")
]
len(qcodes_unique)

3620

In [20]:
qcodes_query = qcodes_unique
import time
# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments the way time.time() differences can be.
start = time.perf_counter()
# Labels and aliases for every candidate that is a direct instance of Q5 (Wikidata "human").
res_df = return_labels_aliases_by_property(qcodes_query, "Q5", include_class_tree=False)
print(time.perf_counter() - start)

17.793995141983032


In [21]:
# Preview the label/alias table returned by the query above (one row per item/alias pair).
res_df

Unnamed: 0,qcode,itemLabel.value,altLabel.value
0,Q762,Leonardo da Vinci,leonardi de vinci
1,Q762,Leonardo da Vinci,leonardi devinci
2,Q762,Leonardo da Vinci,leonardi di vinci
3,Q762,Leonardo da Vinci,leonardo d'avinci
4,Q762,Leonardo da Vinci,leonardo d'vinci
...,...,...,...
9359,Q901129,Jean-Baptiste L. Romé de l'Isle,rome de l´isle j b l
9360,Q901532,John Bennet Lawes,"sir john bennet lawes, 1st baronet"
9361,Q901532,John Bennet Lawes,"sir john bennet lawes, 1st bt."
9362,Q901872,Alexandre-Émile Béguyer de Chancourtois,alexandre-emile beguyer de chancourtois


In [22]:
def get_aliases(qcodes):
    """Return, for each qcode in order, the list of aliases stored for it in the global `res_df` (empty list if none)."""
    aliases_per_qcode = []
    for qcode in qcodes:
        rows_for_qcode = res_df[res_df['qcode'] == qcode]
        aliases_per_qcode.append(rows_for_qcode['altLabel.value'].tolist())
    return aliases_per_qcode

def get_labels(qcodes):
    """Return, for each qcode in order, the distinct labels stored for it in the global `res_df` (empty list if none)."""
    labels_per_qcode = []
    for qcode in qcodes:
        label_column = res_df[res_df['qcode'] == qcode]['itemLabel.value']
        labels_per_qcode.append(label_column.drop_duplicates().tolist())
    return labels_per_qcode

#get_labels(["Q762", "Q55021352"]), get_aliases(["Q762", "Q55021352"])

### Integrating into heritageconnector

In [23]:
from heritageconnector.entity_matching.filter import Filter

# Build a filter over the people records; candidate Wikidata IDs are read from `res_WIKIDATA_IDs`.
f = Filter(dataframe=people_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter on Q5 (Wikidata "human"); the second argument is include_class_tree (False here).
f.add_instanceof_filter("Q5", False)
# Label filter comparing Wikidata labels and aliases with the PREFERRED_NAME column using
# fuzzywuzzy's token_sort_ratio; presumably 90 is the minimum score to keep a candidate - TODO confirm.
f.add_label_filter("PREFERRED_NAME", threshold=90, include_aliases=True, fuzzy_match_scorer=fuzz.token_sort_ratio)
# Runs the Wikidata query and then applies the filters added above (see the log output below).
f.process_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Added filter {'instance_of': {'property_id': 'Q5', 'include_class_tree': False}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'include_aliases': True, 'threshold': 90, 'fuzzy_match_scorer': <function token_sort_ratio at 0x1254b2290>}}
Running Wikidata query..


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
 18%|█▊        | 615/3476 [00:00<00:00, 6148.72it/s]

Applying filters...
Filter: instance of Q5


100%|██████████| 3476/3476 [00:00<00:00, 6113.32it/s]
  3%|▎         | 88/3326 [00:00<00:07, 440.29it/s]

Filter: check label similarity against column PREFERRED_NAME


100%|██████████| 3326/3326 [00:07<00:00, 444.82it/s]


In [24]:
new_df = f.get_dataframe()
# Share of people records that keep at least one candidate in `qcodes_filtered`.
int((new_df['qcodes_filtered'].map(len) > 0).sum()) / len(people_df)

0.2752125193199382

In [25]:
# Rows with GENDER "N" are used as the organisation subset. Copy the selection so that
# columns set on it downstream do not act on a slice of `df` (SettingWithCopyWarning).
org_df = df[df['GENDER'] == "N"].copy()
fo = Filter(dataframe=org_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter on Q43229 (Wikidata "organization"), with include_class_tree=True
# so that subclasses are included.
fo.add_instanceof_filter("Q43229", True)
# Label filter against PREFERRED_NAME using token_set_ratio with threshold 80.
fo.add_label_filter("PREFERRED_NAME", threshold=80, include_aliases=True, fuzzy_match_scorer=fuzz.token_set_ratio)
fo.process_dataframe()
org_res_df = fo.get_dataframe()

Added filter {'instance_of': {'property_id': 'Q43229', 'include_class_tree': True}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'include_aliases': True, 'threshold': 80, 'fuzzy_match_scorer': <function token_set_ratio at 0x1254b24d0>}}
Running Wikidata query..


100%|██████████| 1142/1142 [00:00<00:00, 8135.52it/s]
  0%|          | 0/807 [00:00<?, ?it/s]

Applying filters...
Filter: instance of Q43229
Filter: check label similarity against column PREFERRED_NAME


100%|██████████| 807/807 [00:00<00:00, 930.99it/s]


In [26]:
# Organisation records that keep at least one candidate in `qcodes_filtered`,
# shown next to the total number of organisation records and the resulting share.
num_orgs_after_filter = int((org_res_df['qcodes_filtered'].map(len) > 0).sum())
num_orgs_after_filter, len(org_df), num_orgs_after_filter / len(org_df)

(697, 7743, 0.09001678935812993)

In [27]:
# Number of organisation records with more than one qcode remaining after filtering
int((org_res_df['qcodes_filtered'].map(len) > 1).sum())

0

In [28]:
# Index labels of the organisation records that keep at least one candidate qcode.
ids_less_strict = org_res_df.loc[org_res_df['qcodes_filtered'].map(len) > 0].index.tolist()

In [29]:
# NOTE(review): this expression is identical to the one that builds `ids_less_strict`,
# so the two lists only differ if `org_res_df` was regenerated with stricter filter
# settings before running this cell - TODO make both filter runs explicit in the notebook.
ids_more_strict = org_res_df[(org_res_df['qcodes_filtered'].map(lambda d: len(d)) > 0)].index.tolist()

In [30]:
# List the columns available on the filtered organisation frame.
org_res_df.columns

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE',
       'res_ALL_NOTES', 'res_WIKIDATA_IDs', 'res_URLS', 'qcodes_filtered'],
      dtype='object')

In [16]:
# Records present in `ids_less_strict` but not in `ids_more_strict`, with the columns
# needed to inspect them. Row order follows set iteration order.
# NOTE(review): this cell's execution count (16) is lower than that of the cells above,
# so the displayed output may not correspond to the current values - re-run top to bottom.
org_res_df.loc[list(set(ids_less_strict) - set(ids_more_strict)), ["PREFERRED_NAME", "NOTE", "res_WIKIDATA_IDs", "res_URLS", "qcodes_filtered"]]

Unnamed: 0,PREFERRED_NAME,NOTE,res_WIKIDATA_IDs,res_URLS,qcodes_filtered
