# Filtering records using SPARQL

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import sys
import time

import pandas as pd

sys.path.append("..")  # make the repository root importable before the project import below
from heritageconnector.utils.sparql import get_sparql_results

# Lift pandas' display limits so wide text columns and all columns are shown in full.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

# SPARQL endpoint that the queries in this notebook are sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# NOTE(review): read_pickle can execute arbitrary code while loading; only use trusted files.
df = pd.read_pickle("../GITIGNORE_DATA/lookup_result.pkl")

# Records whose GENDER is "M" or "F" form the people subset used below.
is_person = df['GENDER'].isin(["M", "F"])
people_df = df.loc[is_person]

len(people_df)

10352

In [3]:
def map_ids(ids):
    """Join ids into a comma-separated string of ``wd:``-prefixed terms, e.g. ``wd:Q1, wd:Q2``."""
    return ", ".join(f"wd:{entity_id}" for entity_id in ids)


def map_ids_values(ids):
    """Join ids into space-separated ``(wd:...)`` tuples, as used inside a SPARQL ``VALUES`` clause."""
    return " ".join(f"(wd:{entity_id})" for entity_id in ids)

def return_labels_aliases_by_property(query_ids, property_id, include_class_tree):
    """
    Fetch English labels and aliases from Wikidata for the given items.

    Only items that are an instance (``wdt:P31``) of ``property_id`` are returned.
    Items without an English alias do not appear in the result, because the
    alias pattern in the query is not optional.

    Relies on the module-level ``endpoint_url``, ``map_ids_values`` and
    ``get_sparql_results``.

    Args:
        query_ids: iterable of Wikidata item IDs (e.g. "Q762") to look up.
        property_id: ID of the class the items must be an instance of (e.g. "Q5").
        include_class_tree: if truthy, instances of subclasses are accepted too
            (the query follows ``wdt:P279*``).

    Returns:
        pd.DataFrame with columns ``qcode``, ``itemLabel.value`` and
        ``altLabel.value`` (aliases lowercased), holding one row per distinct
        (qcode, alias) pair. If the query returns no rows, an empty DataFrame
        with the same columns is returned.
    """
    output_columns = ['qcode', 'itemLabel.value', 'altLabel.value']

    class_tree = "/wdt:P279*" if include_class_tree else ""

    query = f"""
    SELECT ?item ?itemLabel ?altLabel
            WHERE
            {{
                VALUES (?item) {{ {map_ids_values(query_ids)} }}
                ?item wdt:P31{class_tree} wd:{property_id}.
                ?item skos:altLabel ?altLabel .
                FILTER (lang(?altLabel) = "en")

                SERVICE wikibase:label {{ 
                  bd:serviceParam wikibase:language "en" .
                }}
            }} 
    GROUP BY ?item ?itemLabel ?altLabel
    """
    res = get_sparql_results(endpoint_url, query)['results']['bindings']

    # No bindings means json_normalize yields a frame without any columns, so the
    # column lookups below would raise KeyError; return a well-formed empty frame.
    if not res:
        return pd.DataFrame(columns=output_columns)

    res_df = pd.json_normalize(res)
    # The item value is an entity URI; keep just the Q-code.
    res_df['qcode'] = res_df['item.value'].apply(lambda x: re.findall(r"(Q\d+)", x)[0])
    # Copy so that the assignment below writes to an independent frame, not a slice.
    res_df = res_df[output_columns].copy()
    # convert aliases to lowercase and drop duplicates
    res_df['altLabel.value'] = res_df['altLabel.value'].astype(str).str.lower()
    # Deduplicate per item: two different items may legitimately share an alias,
    # and each of them must keep it.
    res_df = res_df.drop_duplicates(subset=['qcode', 'altLabel.value'])

    return res_df

In [4]:
# Collect the distinct candidate Wikidata IDs across all people records.
# explode() flattens the per-record ID lists in linear time, whereas summing the
# column concatenates the lists pairwise (quadratic in the number of records).
# Assumes each cell of `res_WIKIDATA_IDs` holds a list of IDs — TODO confirm.
candidate_ids = people_df['res_WIKIDATA_IDs'].explode()
# Keep only values that look like Q-codes; this also drops the NaN that
# explode() emits for an empty list.
qcodes_unique = [item for item in set(candidate_ids) if str(item).startswith("Q")]
len(qcodes_unique)

3620

In [5]:
# Time one label/alias lookup for all unique candidate Q-codes, restricted to
# direct instances of Q5 (no subclass traversal).
qcodes_query = qcodes_unique
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
res_df = return_labels_aliases_by_property(qcodes_query, "Q5", include_class_tree=False)
print(time.perf_counter() - start)

6.129828929901123


In [6]:
# Inspect the labels and aliases returned by the query above.
res_df

Unnamed: 0,qcode,itemLabel.value,altLabel.value
0,Q762,Leonardo da Vinci,leonardi de vinci
1,Q762,Leonardo da Vinci,leonardi devinci
2,Q762,Leonardo da Vinci,leonardi di vinci
3,Q762,Leonardo da Vinci,leonardo d'avinci
4,Q762,Leonardo da Vinci,leonardo d'vinci
...,...,...,...
9340,Q472095,Warren De la Rue,warren de la rue
9341,Q472095,Warren De la Rue,"de la rue, warren"
9344,Q472510,Ernst Alexanderson,ernst frederick werner alexanderson
9345,Q472639,William Jackson Hooker,sir william jackson hooker


In [9]:
def get_aliases(qcodes):
    """
    Look up the aliases stored in the module-level ``res_df`` for each Q-code.

    Returns a list with one entry per element of ``qcodes``, in the same order;
    each entry is the list of ``altLabel.value`` values of the rows whose
    ``qcode`` matches (empty when there is no matching row).
    """
    aliases_per_qcode = []
    for qcode in qcodes:
        is_match = res_df['qcode'].eq(qcode)
        aliases_per_qcode.append(res_df.loc[is_match, 'altLabel.value'].tolist())
    return aliases_per_qcode

def get_labels(qcodes):
    """
    Look up the distinct labels stored in the module-level ``res_df`` for each Q-code.

    Returns a list with one entry per element of ``qcodes``, in the same order;
    each entry is the list of unique ``itemLabel.value`` values of the rows
    whose ``qcode`` matches (empty when there is no matching row).
    """
    labels_per_qcode = []
    for qcode in qcodes:
        matching_labels = res_df.loc[res_df['qcode'].eq(qcode), 'itemLabel.value']
        labels_per_qcode.append(matching_labels.unique().tolist())
    return labels_per_qcode

#get_labels(["Q762", "Q55021352"]), get_aliases(["Q762", "Q55021352"])

### Integrating into heritageconnector

In [66]:
from heritageconnector.entity_matching.filter import Filter

# Filter the candidate Q-codes stored in `res_WIKIDATA_IDs` for the people records.
f = Filter(dataframe=people_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter on Q5; the second positional argument is reported in the
# log output as `include_class_tree` (False here).
f.add_instanceof_filter("Q5", False)
# Label filter against the PREFERRED_NAME column with threshold 90, aliases included.
f.add_label_filter("PREFERRED_NAME", threshold=90, include_aliases=True)
# Runs the Wikidata query and applies the filters added above (see the log output).
f.process_dataframe()

Added filter {'instance_of': {'property_id': 'Q5', 'include_class_tree': False}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'threshold': 90, 'include_aliases': True}}
Running Wikidata query..


 18%|█▊        | 623/3476 [00:00<00:00, 6210.49it/s]

Applying filters...
Filter: instance of Q5


100%|██████████| 3476/3476 [00:00<00:00, 6580.09it/s]
  3%|▎         | 85/3326 [00:00<00:07, 421.67it/s]

Filter: check label similarity against column PREFERRED_NAME


100%|██████████| 3326/3326 [00:07<00:00, 441.79it/s]


In [67]:
# Fraction of all people records that keep at least one entry in `qcodes_filtered`.
new_df = f.get_dataframe()
has_match = new_df['qcodes_filtered'].map(len) > 0
len(new_df[has_match]) / len(people_df)

0.30863601236476046

In [None]:
# Records with GENDER "N" are used as the organisations subset.
org_df = df[df['GENDER'] == "N"]
fo = Filter(dataframe=org_df, qcode_col="res_WIKIDATA_IDs")
# Instance-of filter on Q43229; the second positional argument is reported in
# the log output as `include_class_tree` (True here).
fo.add_instanceof_filter("Q43229", True)
# Label filter against the PREFERRED_NAME column with threshold 80, aliases included.
fo.add_label_filter("PREFERRED_NAME", threshold=80, include_aliases=True)
fo.process_dataframe()
# Keep the processed frame; later cells read its `qcodes_filtered` column.
org_res_df = fo.get_dataframe()

Added filter {'instance_of': {'property_id': 'Q43229', 'include_class_tree': True}}
Added filter {'label': {'label_col': 'PREFERRED_NAME', 'threshold': 80, 'include_aliases': True}}
Running Wikidata query..


In [None]:
# Organisations that keep at least one Q-code: their count, the total number of
# organisations, and the resulting fraction.
orgs_with_match = org_res_df['qcodes_filtered'].map(len) > 0
num_orgs_after_filter = len(org_res_df[orgs_with_match])
num_orgs_after_filter, len(org_df), num_orgs_after_filter / len(org_df)

In [65]:
# Number of organisations with more than one Q-code left in `qcodes_filtered`.
has_multiple_qcodes = org_res_df['qcodes_filtered'].map(len) > 1
len(org_res_df[has_multiple_qcodes])

0

In [61]:
# Index labels of the organisations that keep at least one Q-code.
# NOTE(review): `ids_more_strict` is built from the very same expression, so the
# two lists can only differ if `org_res_df` was regenerated with different
# filter settings between running the two cells — TODO confirm and make both
# filter runs explicit so the notebook survives Restart & Run All.
kept_with_current_filter = org_res_df['qcodes_filtered'].map(len) > 0
ids_less_strict = org_res_df[kept_with_current_filter].index.tolist()

In [47]:
# Index labels of the organisations that keep at least one Q-code.
# NOTE(review): `ids_less_strict` is built from the very same expression, so the
# two lists can only differ if `org_res_df` was regenerated with different
# filter settings between running the two cells — TODO confirm and make both
# filter runs explicit so the notebook survives Restart & Run All.
kept_with_stricter_filter = org_res_df['qcodes_filtered'].map(len) > 0
ids_more_strict = org_res_df[kept_with_stricter_filter].index.tolist()

In [57]:
# List the columns available on the processed organisations frame.
org_res_df.columns

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE',
       'res_ALL_NOTES', 'res_WIKIDATA_IDs', 'res_URLS', 'qcodes_filtered'],
      dtype='object')

In [62]:
# Organisations present in `ids_less_strict` but not in `ids_more_strict`,
# shown with the columns needed to judge the match by eye.
only_less_strict = list(set(ids_less_strict) - set(ids_more_strict))
review_columns = ["PREFERRED_NAME", "NOTE", "res_WIKIDATA_IDs", "res_URLS", "qcodes_filtered"]
org_res_df.loc[only_less_strict, review_columns]

Unnamed: 0,PREFERRED_NAME,NOTE,res_WIKIDATA_IDs,res_URLS,qcodes_filtered
14337,Thomson SA,"Thomson SA (Société Anonyme) grew out of Thomson-CSF, nationalised by the French government in 1982. In 1988 Thomson Consumer Electronics was formed, renamed Thomson Multimedia in 1995. The French government split the consumer electronics and defence businesses prior to privatization in 1999, those companies being Thomson-CSF (today Thales Group) and Thomson Multimedia (later renamed Thomson SA). Thomson SA has steadily moved out of consumer electronics manufacturing; in 2004 Thomson set up a joint venture (TTE) with China's TCL, giving to TCL all manufacturing of RCA and Thomson television and DVD products and making TCL the global leader in TV manufacturing. (Thomson still controls the brands themselves and licenses them to TTE). In December 2006 Thomson SA sold off its Audio/Video and Accessories businesses (which included all its consumer electronics under the RCA and Thomson brands, except TVs, (now part of TTE)) and communications products to Audiovox. In 2007, Thomson SA agreed to sell its consumer electronics audio video business outside Europe including the worldwide rights to the RCA Brand for consumer electronics audio video products. Instead Thomson SA has moved into the into broadcast management, facilities and services market, taking over Technicolor from Carlton Television, Corinthian the Television and Moving Picture Company from ITV in the early 2000s. In September 2005 Thomson first showed its revolutionary Infinity camcorder. At the April 2006 launch this was not described as a camera at all but instead described as 'a new line of IT-based acquisition, recording and storage devices'.",[Q1365773],"[http://www.thomson.net/GlobalEnglish/Corporate/About/History/Pages/default.aspx, http://en.wikipedia.org/wiki/Thomson_SA]",[Q1365773]
19,E K Cole Limited,"ODNB: Rowland F. Pocock, ‘Cole, Eric Kirkham (1901–1966)’, Oxford Dictionary of National Biography, Oxford University Press, 2004 [http://www.oxforddnb.com/view/article/46833, accessed 31 Dec 2008]\nWIKI: http://en.wikipedia.org/wiki/Ekco 24 May 2009","[Q18528774, Q5323238]","[http://www.oxforddnb.com/view/article/46833, http://en.wikipedia.org/wiki/Ekco]",[Q5323238]
12187,New Zealand Government Railways,"The New Zealand Railways Department or New Zealand Government Railways, was a government department charged with owning and maintaining New Zealand's rail infrastructure, and operating the railway system. The Department was created in 1880 and in 1981 became the New Zealand Railways Corporation.",[Q538101],[http://en.wikipedia.org/wiki/New_Zealand_Railways_Department],[Q538101]
13735,J Lyons and Company Limited,"1894-1960s - Operated Lyons tea shops. Acquired many large American companies after WWII, including Baskin Robbins and Wimpy. However, the company broke up in the 1980s, and eventually folded completely in 1994;",[Q6106384],"[http://en.wikipedia.org/wiki/J._Lyons_and_Co., http://www.kzwp.com/lyons/]",[Q6106384]
4784,E Merck,MERCK: http://www.merckgroup.com/en/company/history/history.html; WIKI: https://en.wikipedia.org/wiki/Merck_Group,[Q156959],"[http://www.merckgroup.com/en/company/history/history.html, https://en.wikipedia.org/wiki/Merck_Group]",[Q156959]
944,West Japan Railways,"West Japan Railway Company, also referred to as JR West, is one of the Japan Railways Group (JR Group) companies and operates in western Honshū. It was founded in 1987 as part of the Japanese National Railway privatisation reforms.",[Q502125],"[http://en.wikipedia.org/wiki/West_Japan_Railway_Company, http://www.westjr.co.jp/english/company/]",[Q502125]
10813,Ward Lock & Company Limited,"Founded in 1854 by Ebenezer Ward and George Lock as Ward Lock. The company was based in Fleet Street. 1878 it moved to Salisbury square. James Bowden joined the company in the 1880s and the comapny became Ward, Lock and Bowden Company from 1891 to 1893 then Ward Lock and Bowden Ltd. from 1893 to 1897. In 1897 it became Ward Lock & Co. Ltd. This company is still active as an imprint in Penguin books.",[Q2100540],[http://en.wikipedia.org/wiki/Ward_Lock_&_Co],[Q2100540]
15552,Linea Aeropostal Venezuela,"1929 - the French company Aeropostale (known as Lignes Aeriennes Latécoère until 1927), then under the leadership of its owner Marcel Bouilloux-Lafont, arrived in Venezuela. 1933 - the Venezuelan government purchased the airline from the French. 1935 - name was changed to Línea Aeropostal Venezolana. 1937 - the government of Venezuela secured full ownership of the airline. 1939 - headquarters were moved from Maracay to Maiquetía because of its proximity to Caracas. 1945 - first international flights began. 1947 - introduced Lockheed Constellations to fly a new direct international route from Caracas to New York. 1961 - Douglas DC-8 jets were introduced to replace the Super Constellations. 1994 - commercial operations ceased, as part of a government effort to trim expenses. 1996 - company sold to the Corporacion Alas de Venezuela. 1997 - operations restarted as a private company. 2008 - sold to a group of entrepreneurs from the state of Carabobo, Venezuela. 2009 - the Venezuelan government announced its intention to nationalise Aeropostal, following the arrest of several owners and employees in 2008 on Interpol drug trafficking warrants.",[Q1826405],[http://en.wikipedia.org/wiki/Aeropostal_Alas_de_Venezuela],[Q1826405]
14276,Corgi Motorcycle Company Limited,"1946 - Corgi founded by John Dolphin. \n27,050 mororcycles were manufactured, some exported to the US, branded as the ‘Indian Papoose’\nOctober 1954 - production ended.",[Q5170499],[https://en.wikipedia.org/wiki/Corgi_Motorcycle_Co_Ltd.],[Q5170499]
16840,Marine Steam Turbine Company Limited,"1894 - Founded by Charles Algernon Parsons. This was the company that built the turbine-powered yacht, turbine-powered yacht, 'Turbinia'. June 1897 - 'Turbinia' was steamed at speed through the Spithead Diamond Jubilee Royal Navy fleet review off Portsmouth. 1897 - Charles Algernon Parsons founds Parsons Marine Steam Turbine Co, which bought the rights from Marine Steam Turbine Co Ltd.",[Q197494],"[http://en.wikipedia.org/wiki/Parsons_Marine_Steam_Turbine_Company, http://www.gracesguide.co.uk/Parsons_Marine_Steam_Turbine_Co, http://wck2.companieshouse.gov.uk/]",[Q197494]
