# Reconciliation v2: using text search

In [5]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import os

from heritageconnector.config import config, field_mapping
from heritageconnector.disambiguation.search import wikidata_text_search
from heritageconnector.utils.wikidata import url_to_qid
from heritageconnector.utils.data_transformation import transform_series_str_to_list
from heritageconnector.entity_matching.reconciler import reconciler

import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

from collections import Counter
from tqdm import tqdm

tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. load data sample

In [6]:
sample_no = 100
random_state = 42

# load mimsy_people
df = pd.read_csv(config.MIMSY_PEOPLE_PATH)
for col in ['FIRSTMID_NAME', 'LASTSUFF_NAME']:
    df[col] = df[col].fillna("").astype(str)
    
df['FREETEXT'] = df['DESCRIPTION'].astype(str) + " " + df['NOTE'].astype(str)

# load people df 
people_df = df[df['GENDER'].isin(('M', 'F'))]#.sample(sample_no, random_state=random_state)
people_df.loc[:, 'JOINED_NAME'] = people_df['FIRSTMID_NAME'] + " " + people_df['LASTSUFF_NAME']


## 2. get subject items from PID
We need to get types of entities that populate the `OCCUPATION` field in Wikidata.

In [8]:
qcode_filter = reconciler.get_subject_items_from_pid(field_mapping.PEOPLE['OCCUPATION']['PID'])
qcode_filter
# qcodes for profession, occupation

['Q28640', 'Q12737077']

## 3. get matches for one occupation

In [9]:
search = wikidata_text_search()
search.run_search("captain", instanceof_filter=qcode_filter, include_class_tree=True)

Unnamed: 0,rank,item,itemLabel,score
0,1,http://www.wikidata.org/entity/Q163500,captain,0.4
1,2,http://www.wikidata.org/entity/Q715772,captain,0.3
2,3,http://www.wikidata.org/entity/Q5036514,captain,0.2
3,4,http://www.wikidata.org/entity/Q1146304,pilot in command,0.1
4,5,http://www.wikidata.org/entity/Q849424,ship captain,0.0


In [10]:
# not an occupation
search.run_search("conjoined twin", instanceof_filter=qcode_filter, include_class_tree=True)

## 4. get matches for all occupations
This will become a job in `smg_jobs`.

### 4.1. create mapping table

In [11]:
def str_col_to_list(series, separator=";"):
    return series.fillna("").astype(str).apply(lambda i: [x.strip().lower() for x in i.split(separator)])

people_df['OCCUPATION_list'] = str_col_to_list(people_df['OCCUPATION'])

all_names = people_df['OCCUPATION_list'].sum()
all_names = [i for i in all_names if i != ""]
series_count = pd.Series(Counter(all_names)).sort_values(ascending=False)

print(len(series_count))
series_count.head()

2216


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


artist           1097
photographer      918
physician         605
inventor          579
poster artist     569
dtype: int64

In [12]:
map_df = pd.DataFrame(series_count).rename(columns={0: 'count'})
map_df.head()

Unnamed: 0,count
artist,1097
photographer,918
physician,605
inventor,579
poster artist,569


In [56]:
def lookup_value(text):
    res_df = search.run_search(text, instanceof_filter=qcode_filter, include_class_tree=True)
    if len(res_df) == 0:
        return []
    else:
        return [url_to_qid(i) for i in res_df['item'].tolist()]

map_df['qid'] = map_df.index.to_series().progress_apply(lookup_value)

100%|██████████| 82/82 [00:57<00:00,  1.42it/s]


In [57]:
map_df

Unnamed: 0,count,qid
poster artist,9,[Q739437]
photographer,8,"[Q7187777, Q33231]"
artist,7,"[Q483501, Q706364, Q1320883, Q3391743, Q1797162, Q13381572, Q1630100]"
inventor,6,[Q205375]
scientist,5,[Q901]
...,...,...
apprentice,1,"[Q11353322, Q742585, Q253567]"
captain,1,"[Q163500, Q715772, Q5036514, Q1146304, Q849424]"
railway manager,1,[]
biologist,1,[Q864503]


In [49]:
map_df.loc[['scientist', 'captain'], 'qid'].values.sum()

['Q901', 'Q163500', 'Q715772', 'Q5036514', 'Q1146304', 'Q849424']

### 4.2 populate new field using mapping table

In [66]:
qid_col = "OCCUPATION" + "_qid"
list_col = "OCCUPATION_list"

people_df[qid_col] = people_df[list_col].progress_apply(lambda x: map_df.loc[x, 'qid'].values.sum() if x != [''] else [])

100%|██████████| 100/100 [00:00<00:00, 1532.23it/s]


In [52]:
people_df

Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,NOTE,BIRTH_DATE,BIRTH_PLACE,DEATH_DATE,DEATH_PLACE,CAUSE_OF_DEATH,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE,FREETEXT,JOINED_NAME,OCCUPATION_list,OCCUPATION_qid
1055,30775,"Dunlop, Gilbert",,Gilbert,Dunlop,,,M,N,,,,,,,,British,poster artist; illustrator,,,,,,N,14-FEB-01,19-MAR-10,nan nan,Gilbert Dunlop,"[poster artist, illustrator]","[Q739437, Q644687, Q1114448, Q3148760, Q19507792, Q15296811, Q27943388, Q60558844]"
7730,100294,"T, F",,F,T,,,M,Y,,,,,,,,French,medallist,,,,,,N,06-JAN-05,23-JUL-15,nan nan,F T,[medallist],[Q1708232]
16648,138234,"Watt, Brandon",,,"Watt, Brandon",,,M,N,,,,,,,,,,http://www.woi3d.com/thingiverse3,,,,,Y,20-JUN-13,03-OCT-17,nan nan,"Watt, Brandon",[],[]
9785,116857,"Cipriani, Galgano",,Galgano,Cipriani,,,M,Y,object records 1923 -667 & 1923-668,,,,,,,Italian,maker of telescope replica,,,,,,N,30-JAN-07,09-SEP-10,object records 1923 -667 & 1923-668 nan,Galgano Cipriani,[maker of telescope replica],[]
6500,92187,"Bodonius, Giambattista",,Giambattista,Bodonius,,,M,Y,WIKI:,,1740-02-16,Saluzzo,1813-11-29,"Parma, Parma province, Emilia-Romagna, Italy",,Italian,typographer; type-designer; printer; publisher,,,,,,N,06-JAN-05,23-JUL-15,WIKI: nan,Giambattista Bodonius,"[typographer, type-designer, printer, publisher]","[Q1229025, Q354034, Q40881196, Q175151, Q2516866, Q7258095]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11947,93651,"Dieulafoy, Paul Georges",,Paul Georges,Dieulafoy,,,M,Y,"http://cancerweb.ncl.ac.uk/; http://en.wikipedia.org/wiki/Paul_Georges_Dieulafoy; A Bibliography of Medical and Biomeidcal Biography http://books.google.co.uk/books?id=EM-UsLI0P4IC&pg=PA279&dq=o%27dwyer+1841-1898&as_brr=3&sig=XIl6i4o2NpkSy5nxCLbKwqvayZM#PPA96,M1","Physician best known for his study of acute appendicitis and his description of Dieulafoy's lesion, a rare cause of gastric bleeding. c.f. Dieulafoy's erosion; Dieulafoy's theory",1839,"Toulouse, Haute-Garonne, Midi-Pyrénées, France",1911,"Paris, Ville de Paris, Île-de-France, France",,French,physician; surgeon; pathologist,,,,,,N,06-JAN-05,14-DEC-15,"http://cancerweb.ncl.ac.uk/; http://en.wikipedia.org/wiki/Paul_Georges_Dieulafoy; A Bibliography of Medical and Biomeidcal Biography http://books.google.co.uk/books?id=EM-UsLI0P4IC&pg=PA279&dq=o%27dwyer+1841-1898&as_brr=3&sig=XIl6i4o2NpkSy5nxCLbKwqvayZM#PPA96,M1 Physician best known for his study of acute appendicitis and his description of Dieulafoy's lesion, a rare cause of gastric bleeding. c.f. Dieulafoy's erosion; Dieulafoy's theory",Paul Georges Dieulafoy,"[physician, surgeon, pathologist]","[Q39631, Q551835, Q15924224, Q7646178, Q774306, Q1360697, Q781850, Q3368718]"
499,2314,"Gilfillan, Tom",,Tom,Gilfillan,,,M,N,NRM Pictorial Coll,,,,,,,British,painter; poster artist,,,,,,N,21-AUG-96,08-MAR-11,NRM Pictorial Coll nan,Tom Gilfillan,"[painter, poster artist]","[Q288728, Q94320712, Q1028181, Q15301511, Q1630100, Q739437]"
13843,128523,"Wood, Edward George",,Edward George,Wood,,,M,Y,"Webster database – WOOD, EDWARD GEORGE [http://historydb.adlerplanetarium.org/signatures/]","Traded at 7 Shepperton Street, New North Road & 74 Cheapside (1851-93), both in London, England.",,,,,,English; British,optical & philosophical instrument maker,,,,,,N,05-JAN-10,25-OCT-11,"Webster database – WOOD, EDWARD GEORGE [http://historydb.adlerplanetarium.org/signatures/] Traded at 7 Shepperton Street, New North Road & 74 Cheapside (1851-93), both in London, England.",Edward George Wood,[optical & philosophical instrument maker],[]
5009,8041,"Short, J",,J,Short,,,M,Y,,"2 Gladstone Street, Southwark, London",,,,,,British,,,,,,,N,31-DEC-97,11-NOV-12,"nan 2 Gladstone Street, Southwark, London",J Short,[],[]


## 5. Heritage Connector implementation

In [20]:
people_df_processed = people_df.copy()
people_df_processed['OCCUPATION'] = transform_series_str_to_list(people_df_processed['OCCUPATION'], separator=";")
people_df_processed.head(2)

Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,NOTE,BIRTH_DATE,BIRTH_PLACE,DEATH_DATE,DEATH_PLACE,CAUSE_OF_DEATH,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE,FREETEXT,JOINED_NAME,OCCUPATION_list
1,10245,"Zenthon, Edward Rupert",,Edward Rupert,Zenthon,,,M,Y,REF: http://www.iwm.org.uk/collections/item/object/1030031461,,1920-07,"London, Greater London, England, United Kingdom",c. 2002,,,British,[engineer],,,,,,N,28-JAN-98,05-AUG-15,REF: http://www.iwm.org.uk/collections/item/object/1030031461 nan,Edward Rupert Zenthon,[engineer]
2,10269,"Troughton, John",,John,Troughton,,,M,Y,"1739 - Born in Corney, Cumbria, England; Apprenticed to his Uncle John Troughton \n1764 - traded at Surrey St., Strand, London \n1768-71 - traded at Crown Court, Fleet St., London\n1771-78 - traded at 17 Dean St., Fetter Lane, London \n1778-82 - traded at 1 Queen's Sq., Bartholomew Close, London \n1782 - purchased the business of Benjamin Cole \n1782-1788 - traded at the sign of the Orrery, 136 Fleet St, London, England. \n1788-1804 - in partnership as J & E Troughton, with brother Edward Troughton (1756-1835)","ODNB: Anita McConnell, ‘Troughton, Edward (1753–1835)’, Oxford Dictionary of National Biography, Oxford University Press, 2004; online edn, May 2005 [http://www.oxforddnb.com/view/article/27767]\nREF: A. McConnell, Instrument makers to the world: a history of Cooke, Troughton & Simms (1992) · A. W. Skempton and J. Brown, ‘John and Edward Troughton’, Notes and Records of the Royal Society, 27 (1972–3), 233–62",1739,"Broughton in Furness, Cumbria, England, United Kingdom",1807,"London, Greater London, England, United Kingdom",,English; British,[mathematical instrument maker],,,,,,N,28-JAN-98,06-NOV-18,"1739 - Born in Corney, Cumbria, England; Apprenticed to his Uncle John Troughton \n1764 - traded at Surrey St., Strand, London \n1768-71 - traded at Crown Court, Fleet St., London\n1771-78 - traded at 17 Dean St., Fetter Lane, London \n1778-82 - traded at 1 Queen's Sq., Bartholomew Close, London \n1782 - purchased the business of Benjamin Cole \n1782-1788 - traded at the sign of the Orrery, 136 Fleet St, London, England. \n1788-1804 - in partnership as J & E Troughton, with brother Edward Troughton (1756-1835) ODNB: Anita McConnell, ‘Troughton, Edward (1753–1835)’, Oxford Dictionary of National Biography, Oxford University Press, 2004; online edn, May 2005 [http://www.oxforddnb.com/view/article/27767]\nREF: A. McConnell, Instrument makers to the world: a history of Cooke, Troughton & Simms (1992) · A. W. Skempton and J. Brown, ‘John and Edward Troughton’, Notes and Records of the Royal Society, 27 (1972–3), 233–62",John Troughton,[mathematical instrument maker]


In [42]:
rec = reconciler(people_df_processed, table='people')
#people_df_processed['OCCUPATION_qids'] = 
map_df = rec.process_column('OCCUPATION', multiple_vals=True)
map_df

  0%|          | 0/2216 [00:00<?, ?it/s]

Looking up Wikidata qcodes for unique items..


100%|██████████| 2216/2216 [18:05<00:00,  2.04it/s]
100%|██████████| 10352/10352 [00:04<00:00, 2172.56it/s]


1                                                                          [Q151197, Q81096]
2                                                                                [Q66060335]
3                                                                         [Q7187777, Q33231]
4                                                                                         []
5                                                                                  [Q205375]
                                                ...                                         
18090                                                                    [Q157798, Q2700922]
18091                                                                    [Q157798, Q2700922]
18092    [Q482980, Q15296811, Q36180, Q901, Q947873, Q2722764, Q270389, Q1371925, Q13590141]
18093                  [Q483501, Q706364, Q1320883, Q3391743, Q1797162, Q13381572, Q1630100]
18094                          [Q7042855, Q589298, Q1607826, Q3024627,