# Training Mechanism for Disambiguation
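Given a record that already has a confirmed Wikidata match, retrieve a table of information about candidate Wikidata entities; the candidates that are not the confirmed match act as negative samples for training. **This notebook searches Elasticsearch, unlike previous examples.**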

1. Search label on Elasticsearch
2. Fetch data from Wikidata

In [1]:
!pip install elastic_wikidata



In [1]:
# Reload imported modules automatically before each cell runs, so edits to library code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys
# Extend the import path three directories up (presumably the repository root) so that
# `heritageconnector` can be imported below; this must run before that import.
sys.path.append("../../..")

import pandas as pd
# Disable pandas display truncation for column width, number of columns and number of rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import elastic_wikidata
from heritageconnector import datastore

# Relative path of the local data directory read by the cells below
# (the name suggests it is excluded from version control — confirm).
data_folder = "../../../GITIGNORE_DATA"

In [2]:
# Load the people/organisations table that the rest of the notebook looks records up in.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle(data_folder + '/filtering_people_orgs_result.pkl')
# Preview the first two records.
df.head(2)

Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,NOTE,BIRTH_DATE,BIRTH_PLACE,DEATH_DATE,DEATH_PLACE,CAUSE_OF_DEATH,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE,res_ALL_NOTES,res_WIKIDATA_IDs,res_URLS,qcodes_filtered
1,10245,"Zenthon, Edward Rupert",,Edward Rupert,Zenthon,,,M,Y,REF: http://www.iwm.org.uk/collections/item/object/1030031461,,1920-07,"London, Greater London, England, United Kingdom",c. 2002,,,British,engineer,,,,,,N,28-JAN-98,05-AUG-15,REF: http://www.iwm.org.uk/collections/item/object/1030031461 --- nan,[],[http://www.iwm.org.uk/collections/item/object/1030031461],[]
2,10269,"Troughton, John",,John,Troughton,,,M,Y,"1739 - Born in Corney, Cumbria, England; Apprenticed to his Uncle John Troughton \n1764 - traded at Surrey St., Strand, London \n1768-71 - traded at Crown Court, Fleet St., London\n1771-78 - traded at 17 Dean St., Fetter Lane, London \n1778-82 - traded at 1 Queen's Sq., Bartholomew Close, London \n1782 - purchased the business of Benjamin Cole \n1782-1788 - traded at the sign of the Orrery, 136 Fleet St, London, England. \n1788-1804 - in partnership as J & E Troughton, with brother Edward Troughton (1756-1835)","ODNB: Anita McConnell, ‘Troughton, Edward (1753–1835)’, Oxford Dictionary of National Biography, Oxford University Press, 2004; online edn, May 2005 [http://www.oxforddnb.com/view/article/27767]\nREF: A. McConnell, Instrument makers to the world: a history of Cooke, Troughton & Simms (1992) · A. W. Skempton and J. Brown, ‘John and Edward Troughton’, Notes and Records of the Royal Society, 27 (1972–3), 233–62",1739,"Broughton in Furness, Cumbria, England, United Kingdom",1807,"London, Greater London, England, United Kingdom",,English; British,mathematical instrument maker,,,,,,N,28-JAN-98,06-NOV-18,"1739 - Born in Corney, Cumbria, England; Apprenticed to his Uncle John Troughton \n1764 - traded at Surrey St., Strand, London \n1768-71 - traded at Crown Court, Fleet St., London\n1771-78 - traded at 17 Dean St., Fetter Lane, London \n1778-82 - traded at 1 Queen's Sq., Bartholomew Close, London \n1782 - purchased the business of Benjamin Cole \n1782-1788 - traded at the sign of the Orrery, 136 Fleet St, London, England. \n1788-1804 - in partnership as J & E Troughton, with brother Edward Troughton (1756-1835) --- ODNB: Anita McConnell, ‘Troughton, Edward (1753–1835)’, Oxford Dictionary of National Biography, Oxford University Press, 2004; online edn, May 2005 [http://www.oxforddnb.com/view/article/27767]\nREF: A. McConnell, Instrument makers to the world: a history of Cooke, Troughton & Simms (1992) · A. W. 
Skempton and J. Brown, ‘John and Edward Troughton’, Notes and Records of the Royal Society, 27 (1972–3), 233–62",[Q1293897],[http://www.oxforddnb.com/view/article/27767],[]


## 1. Get Candidates (search label on ES)

In [3]:
from heritageconnector.disambiguation.search import es_text_search

In [4]:
# Pick one record by its LINK_ID and use its PREFERRED_NAME as the search text.
ID = 10269
row = df[df['LINK_ID'] == ID]
label = row["PREFERRED_NAME"].values[0]

# Search the 'wikidump_humans' index for up to 10 candidate QIDs, with aliases included in the search.
search = es_text_search(index='wikidump_humans')
qids = search.run_search(label, limit=10, include_aliases=True)

# Show the search text followed by the candidate QIDs (`qids` is reused by the cells in section 2).
print(f"Matches for {label}")
qids

Matches for Troughton, John


['Q18879367',
 'Q58453056',
 'Q28179162',
 'Q559430',
 'Q3525964',
 'Q55816361',
 'Q75987735',
 'Q6834961',
 'Q1293897',
 'Q7422845']

## 2. Get Wikidata info back

### 2.1 Using `wbgetentities` / elastic_wikidata

For properties whose values are Wikidata entities, this method returns only their QIDs (not their labels).

In [6]:
from elastic_wikidata.wd_entities import get_entities, simplify_wbgetentities_result

In [7]:
# Fetch the Wikidata documents for the candidate QIDs found above, requested in pages of 20
# (timeout=6 is presumably in seconds — TODO confirm against elastic_wikidata).
wd_res = get_entities.get_all_results(qids, page_limit=20, timeout=6)


Getting 10 wikidata documents in pages of 20


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
# Simplify the raw result (lang='en', only the four listed properties) and flatten it into one table row per entity.
# P569/P570 hold dates and P106/P21 hold QIDs in the output; presumably birth/death date, occupation and gender — confirm.
pd.json_normalize(simplify_wbgetentities_result(wd_res, lang='en', properties=['P106', 'P569', 'P570','P21']))

Unnamed: 0,id,labels,descriptions,aliases,claims.P106,claims.P569,claims.P570,claims.P21
0,Q18879367,John Troughton,English nonconformist minister and religious controversialist,[],[Q1423891],[+1637-00-00T00:00:00Z],[+1681-00-00T00:00:00Z],[Q6581097]
1,Q58453056,Michael John Troughton,researcher,[],[Q1650915],,,[Q6581097]
2,Q28179162,Medhurst Troughton,British cricketer (1839-1912),[Medhurst Albert Troughton],[Q12299841],[+1839-12-25T00:00:00Z],[+1912-01-01T00:00:00Z],[Q6581097]
3,Q559430,Patrick Troughton,English actor (1920-1987),[Patrick George Troughton],"[Q2259451, Q10800557, Q33999, Q10798782]",[+1920-03-25T00:00:00Z],[+1987-03-28T00:00:00Z],[Q6581097]
4,Q3525964,Jim Troughton,Cricket player of England. (born 1979),[Jamie Oliver Troughton],[Q12299841],[+1979-03-02T00:00:00Z],,[Q6581097]
5,Q55816361,Charles Troughton Clark,New Zealand commercial photographer (1890-1973),[C. Troughton Clark],[Q33231],[+1890-01-01T00:00:00Z],"[+1973-00-00T00:00:00Z, +1979-00-00T00:00:00Z]",[Q6581097]
6,Q75987735,Felix Troughton,Peerage person ID=450661,[Lieutenant Felix Troughton],,,,[Q6581097]
7,Q6834961,Michael Troughton,English actor,[],"[Q33999, Q36180, Q37226, Q10798782]",[+1955-03-02T00:00:00Z],,[Q6581097]
8,Q1293897,Edward Troughton,British telescope maker,[],"[Q11063, Q205375]",[+1753-01-01T00:00:00Z],[+1835-06-12T00:00:00Z],[Q6581097]
9,Q7422845,Sarah Troughton,Lord Lieutenant of Wiltshire,[Sarah Rose Colman],,[+1953-05-03T00:00:00Z],,[Q6581072]


### 2.2 Using SPARQL
Returns the property values themselves (as labels), with the option to return QIDs instead.


In [7]:
from heritageconnector.config import field_mapping
from heritageconnector.utils.wikidata import url_to_pid
from heritageconnector.disambiguation.retrieve import get_wikidata_fields

In [6]:
def get_results(ids, limit_per_id=10, label_col="PREFERRED_NAME"):
    """
    Search for Wikidata candidates for each record and fetch their field values.

    Relies on the notebook-level `df`, `es_text_search`, `field_mapping`,
    `url_to_pid` and `get_wikidata_fields`.

    Args:
        ids: iterable of LINK_ID values to look up in `df`.
        limit_per_id: maximum number of candidate QIDs requested per record.
        label_col: column of `df` whose value is used as the search text.

    Returns:
        The result of `get_wikidata_fields` for the collected candidates
        (the calling cell treats it as a DataFrame).

    Raises:
        IndexError: if one of `ids` has no row in `df`.
    """
    search = es_text_search(index='wikidump_humans')
    id_qid_mapping = {}

    # get QIDs from ES
    for _id in ids:
        matching_labels = df.loc[df['LINK_ID'] == _id, label_col]
        if matching_labels.empty:
            # Same exception type as the bare `.values[0]` lookup, but with an actionable message.
            raise IndexError(f"No record with LINK_ID {_id!r} found in df")
        label = matching_labels.values[0]
        id_qid_mapping[_id] = search.run_search(label, limit=limit_per_id, include_aliases=True)

    # get PIDs from field mapping ('label' and 'description' are not Wikidata properties)
    person_mapping = field_mapping.mapping['PERSON']
    col_pid_mapping = {
        column: url_to_pid(str(spec['PID']))
        for column, spec in person_mapping.items()
        if 'PID' in spec and spec['PID'] not in ('label', 'description')
    }
    # Pass a concrete list (as `train` does) rather than a live dict view.
    pids = list(col_pid_mapping.values())

    # return table of results
    return get_wikidata_fields(pids, id_qid_mapping=id_qid_mapping)


In [7]:
# Fetch the candidate table for two example LINK_IDs and preview its first rows.
resdf = get_results([10269, 10245])
resdf.head()

KeyError: "['P735Label'] not in index"

#### 2.2.1 Fill blanks using heuristics
Fill empty first name (`P735`) and last name (`P734`) fields with the first and last words of the item label.

In [12]:
# Heuristic fallbacks: first / last space-separated word of the label.
firstname_from_label = lambda l: l.split(" ")[0]
lastname_from_label = lambda l: l.split(" ")[-1]

# Rebuild each name column positionally in a single pass. Only empty (falsy) cells are replaced.
# Unlike per-row `.loc[idx, ...]` writes inside `iterrows`, this cannot overwrite other rows that
# share an index label and never re-assigns existing (possibly list-valued) cells.
resdf['P735'] = [
    given if given else firstname_from_label(item_label)
    for item_label, given in zip(resdf['itemLabel'], resdf['P735'])
]
resdf['P734'] = [
    family if family else lastname_from_label(item_label)
    for item_label, family in zip(resdf['itemLabel'], resdf['P734'])
]

resdf.head()

Unnamed: 0,id,item,itemLabel,itemDescription,altLabel,P735,P734,P21,P569,P570,P19,P20,P106
0,10245,Q1001190,Buddy Featherstonhaugh,English jazz saxophonist,Rupert Edward Lee Featherstonhaugh,Buddy,Featherstonhaugh,male,1909-10-04T00:00:00Z,1976-07-12T00:00:00Z,Paris,London,"[saxophonist, racing automobile driver, clarinetist, jazz musician]"
16,10245,Q75862568,Rupert Edward Bisgood,(1956-1956),,Rupert,Bisgood,male,1956-09-05T00:00:00Z,1956-09-08T00:00:00Z,,,
15,10245,Q75345427,Rupert Edward Nutting,born 1971,,Rupert,Nutting,male,1971-10-20T00:00:00Z,,,,
13,10245,Q7380311,Rupert Inglis,Rugby player & army chaplain,Rupert Edward Inglis,Rupert,Inglis,male,1863-05-17T00:00:00Z,1916-09-18T00:00:00Z,,,"[rugby union player, Anglican priest]"
11,10245,Q593618,Anton Edward Rupert,"Afrikaner South African billionaire entrepreneur, businessman and conservationist",,Anton,Rupert,male,1916-10-04T00:00:00Z,2006-01-18T00:00:00Z,Graaff-Reinet,Stellenbosch,entrepreneur


## 3. Training method

In [62]:
from elasticsearch import helpers
import rdflib
from rdflib import Graph
import json
from itertools import islice
import numpy as np
from tqdm.auto import tqdm

from heritageconnector.datastore import es
from heritageconnector.config import config
from heritageconnector.utils.wikidata import url_to_pid, url_to_qid
from heritageconnector.namespace import *
from heritageconnector.disambiguation import compare_fields as compare

# Module-level objects for the training cells below: a text-search client for the
# 'wikidump_humans' index and the PERSON entry of the field mapping.
search = es_text_search(index='wikidump_humans')
person_mapping = field_mapping.mapping['PERSON']

def _year_from_wiki_date(value):
    """
    Extract the year from a Wikidata timestamp.

    - a string such as '1909-10-04T00:00:00Z', '+1637-00-00T00:00:00Z' or '-0044-03-15T00:00:00Z'
      gives the year as a string ('1909', '1637', '-0044'); strings that do not look like a
      timestamp fall back to their first four characters (so '' stays '')
    - a missing scalar (None / NaN) is returned unchanged
    - a collection of timestamps gives the mean of their years
    """
    if isinstance(value, str):
        signed = value[:1] in ("+", "-")
        digits, dash, _ = (value[1:] if signed else value).partition("-")
        if dash and digits.isdecimal():
            # Drop a leading '+', keep a leading '-' so BCE years stay negative.
            return "-" + digits if value[:1] == "-" else digits
        return value[0:4]
    if pd.api.types.is_scalar(value):
        return value
    return np.mean([int(_year_from_wiki_date(i)) for i in value])


def _as_label_list(value):
    """Wrap a single label in a list, map a missing value (None / NaN) to [], and pass lists through."""
    if isinstance(value, str):
        return [value]
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return []
    return value


def process_wikidata_results(wikidata_results: pd.DataFrame) -> pd.DataFrame:
    """
    Clean up a table of Wikidata results. The DataFrame is modified in place and also returned.

    - fill empty firstname (P735) and lastname (P734) fields by taking the first and last words of the label field
    - convert birthdate & deathdate (P569 & P570) to years
    - add label column combining itemLabel and altLabel lists

    Args:
        wikidata_results: table with columns itemLabel, altLabel, P735, P734, P569 and P570.

    Returns:
        The same DataFrame object, with itemLabel/altLabel converted to lists and a new `label` column.
    """
    # Rebuild the name columns positionally: only falsy cells are replaced, and rows that share an
    # index label cannot overwrite each other (which per-row `.loc[idx, ...]` writes would do).
    item_labels = wikidata_results['itemLabel']
    for name_pid, word_position in (('P735', 0), ('P734', -1)):
        wikidata_results[name_pid] = [
            current if current else item_label.split(" ")[word_position]
            for item_label, current in zip(item_labels, wikidata_results[name_pid])
        ]

    for date_pid in ('P569', 'P570'):
        wikidata_results[date_pid] = wikidata_results[date_pid].apply(_year_from_wiki_date)

    for label_col in ('itemLabel', 'altLabel'):
        wikidata_results[label_col] = wikidata_results[label_col].apply(_as_label_list)

    # Row-wise list concatenation of the main label(s) and the alternative label(s).
    wikidata_results['label'] = [
        primary + alternatives
        for primary, alternatives in zip(wikidata_results['itemLabel'], wikidata_results['altLabel'])
    ]

    return wikidata_results

def train(wd_index: str, limit=1, search_limit=20):
    """
    Build a training set for disambiguation from records that already link to Wikidata.

    For every record in the configured Elasticsearch index that has an owl:sameAs link to a
    Wikidata entity, candidates are searched in `wd_index` by the record's label, their Wikidata
    fields are retrieved, and each candidate is compared with the record field by field.

    Relies on the notebook-level `person_mapping`, `es`, `config`, `es_text_search`,
    `get_wikidata_fields`, `process_wikidata_results` and `compare`.

    Args:
        wd_index: name of the Elasticsearch index holding the Wikidata dump to search.
        limit: maximum number of linked records to process; a falsy value processes all of them.
        search_limit: maximum number of candidates requested per record.

    Returns:
        (X, y): X is a float32 array with one row per retrieved candidate and one column per
        compared field; y is a boolean array, True where the candidate is the record's linked entity.
        Both are empty when no candidates were processed.
    """
    # Search the index the caller asked for rather than a module-level search object.
    text_search = es_text_search(index=wd_index)

    # Fields to compare: they need an RDF predicate and either a PID or the label predicate;
    # the 'description' pseudo-PID is excluded.
    filtered_mapping = {k:v for (k,v) in person_mapping.items() if (('PID' in v) or (v.get('RDF') == RDFS.label)) and ('RDF' in v) and (v.get('PID') != 'description')}
    pids_nolabel = [url_to_pid(v['PID']) for _, v in person_mapping.items() if v.get('wikidata_entity')]
    # Loop-invariant: PIDs to retrieve for every record (the label is not a Wikidata property).
    pids = [url_to_pid(v['PID']) for v in filtered_mapping.values() if v['RDF'] != RDFS.label]
    X_list = []
    y_list = []

    # get records with sameAs from Elasticsearch
    query = {"query": {"wildcard": {"graph.@owl:sameAs.@id.keyword": "@wd*"}}}
    search_res = helpers.scan(es, query=query, index=config.ELASTIC_SEARCH_INDEX)
    if limit:
        search_res = islice(search_res, limit)

    # for each record, get Wikidata results and create X: feature matrix and y: boolean vector (correct/incorrect match)
    for item in tqdm(search_res, total=limit):
        g = Graph().parse(data=json.dumps(item["_source"]["graph"]), format="json-ld")
        label = g.label(next(g.subjects()))
        qid_true = url_to_qid(next(g.objects(predicate=OWL.sameAs)))

        # search for Wikidata matches and retrieve information (batched)
        qids = text_search.run_search(label, limit=search_limit, include_aliases=True)
        if not qids:
            # Nothing to compare against for this record.
            continue

        # return and process table of results
        wikidata_results_df = get_wikidata_fields(pids=pids, qids=qids, pids_nolabel=pids_nolabel)
        wikidata_results_df = process_wikidata_results(wikidata_results_df)
        candidates = wikidata_results_df.loc[wikidata_results_df['item'].isin(qids)]

        # Derive y from the same rows (and row order) that the features are computed from, so that
        # X and y stay aligned even if the table is ordered differently from `qids` or lacks some of them.
        y_list += [candidate_qid == qid_true for candidate_qid in candidates['item']]

        # one similarity column per compared field
        feature_columns = []
        for value in filtered_mapping.values():
            pid = 'label' if value['RDF'] == RDFS.label else url_to_pid(value['PID'])
            rdf = value['RDF']
            val_type = value['type']

            vals_internal = [str(i) for i in g.objects(predicate=rdf)]
            vals_wikidata = candidates[pid]

            if val_type == 'string':
                sim_list = [compare.similarity_string(vals_internal, i) for i in vals_wikidata]

            elif val_type == 'numeric':
                # an empty string marks a missing value
                sim_list = [compare.similarity_numeric(vals_internal, i)  if str(i) != "" else 0 for i in vals_wikidata]

            elif val_type == 'categorical':
                sim_list = [compare.similarity_categorical(vals_internal, i, raise_on_diff_types=False) for i in vals_wikidata]

            else:
                # NOTE(review): no comparison exists for this type. Emit zeros so the column count is
                # unchanged, instead of silently reusing the previous field's similarities.
                sim_list = [0] * len(vals_wikidata)

            feature_columns.append(sim_list)

        X_list.append(np.asarray(feature_columns, dtype=np.float32).transpose())

    if not X_list:
        # np.vstack raises on an empty list; return empty arrays of consistent dtype instead.
        return np.empty((0, len(filtered_mapping)), dtype=np.float32), np.asarray([], dtype=bool)

    X = np.vstack(X_list)
    y = np.asarray(y_list, dtype=bool)

    return X, y

In [64]:
# Build features (X) and match labels (y) from at most 2 linked records, requesting up to 10 candidates per record.
X, y = train('wikidump_humans', limit=2, search_limit=10)
X, y

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




(array([[0.75      , 0.33      , 1.        , 1.        , 0.95879555,
         0.9544641 , 0.9544641 , 0.9544641 , 0.        ],
        [0.79      , 0.46      , 1.        , 0.        , 0.99175256,
         0.        , 0.        , 0.        , 0.        ],
        [0.75      , 0.        , 1.        , 1.        , 0.9827721 ,
         0.99044985, 0.99044985, 0.99044985, 0.        ],
        [0.82      , 0.36      , 1.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.13      , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 1.        , 0.        , 0.935346  ,
         0.94394445, 0.94394445, 0.94394445, 0.        ],
        [1.        , 1.        , 1.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.       

### 3.1 in heritage connector

In [66]:
from heritageconnector.disambiguation.pipelines import build_training_data


In [67]:
# Call the packaged pipeline function with the same index and a limit of 2 for the 'PERSON' table
# (presumably the library counterpart of the notebook's `train` — confirm).
build_training_data('wikidump_humans', 'PERSON', limit=2)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




(array([[0.75      , 0.33      , 1.        , 1.        , 0.95879555,
         0.9544641 , 0.9544641 , 0.9544641 , 0.        ],
        [0.79      , 0.46      , 1.        , 0.        , 0.99175256,
         0.        , 0.        , 0.        , 0.        ],
        [0.75      , 0.        , 1.        , 1.        , 0.9827721 ,
         0.99044985, 0.99044985, 0.99044985, 0.        ],
        [0.72      , 0.18      , 0.15      , 1.        , 0.9927798 ,
         0.997003  , 0.997003  , 0.997003  , 0.        ],
        [0.82      , 0.36      , 1.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.72      , 0.29      , 0.89      , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.75      , 0.33      , 0.94      , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.75      , 0.        , 0.89      , 0.        , 0.        ,
         0.        , 0.        , 0.       