# Complete search
Search Wikidata & Wikipedia -> rank results -> retrieve information based on config.

In [34]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../..")

import os
import re

from heritageconnector.config import config, field_mapping
from heritageconnector.disambiguation import search, retrieve

import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. load data sample

In [129]:
sample_no = 10
random_state = 42

# load mimsy_people
df = pd.read_csv(os.path.join("..", config.MIMSY_PEOPLE_PATH))
for col in ['FIRSTMID_NAME', 'LASTSUFF_NAME']:
    df[col] = df[col].fillna("").astype(str)
    
df['FREETEXT'] = df['DESCRIPTION'].astype(str) + " " + df['NOTE'].astype(str)

# load people df 
people_df = df[df['GENDER'].isin(('M', 'F'))].sample(sample_no, random_state=random_state)
people_df.loc[:, 'JOINED_NAME'] = people_df['FIRSTMID_NAME'] + " " + people_df['LASTSUFF_NAME']

# load org df
org_df = df[df['GENDER'] == "N"].sample(sample_no, random_state=random_state)

## 2. run search

In [130]:
row = people_df.iloc[6]
row

LINK_ID                                                                                                                                                                                                                                           38894
PREFERRED_NAME                                                                                                                                                                                                                         Stirling, Robert
TITLE_NAME                                                                                                                                                                                                                                          NaN
FIRSTMID_NAME                                                                                                                                                                                                                                    Robert
LASTSUFF

In [131]:
search_results = search.run(text=row["JOINED_NAME"], topn=10, limit=100, instanceof_filter="Q5")
search_results

item
http://www.wikidata.org/entity/Q366209      0.111111
http://www.wikidata.org/entity/Q16063040    0.097222
http://www.wikidata.org/entity/Q7350082     0.083333
http://www.wikidata.org/entity/Q76178526    0.069444
http://www.wikidata.org/entity/Q76340733    0.055556
http://www.wikidata.org/entity/Q2907840     0.045455
http://www.wikidata.org/entity/Q5561303     0.043290
http://www.wikidata.org/entity/Q89042134    0.041667
http://www.wikidata.org/entity/Q18810054    0.041126
http://www.wikidata.org/entity/Q6077165     0.038961
Name: score, dtype: float64

## 3. Create vectors of each Wikidata object

In [132]:
field_mapping.PEOPLE

{'LINK_ID': {'type': 'index'},
 'PREFERRED_NAME': {'PID': 'label', 'RDF': '', 'type': 'str'},
 'FIRSTMID_NAME': {'PID': 'P735', 'RDF': '', 'type': 'str'},
 'LASTSUFF_NAME': {'PID': 'P734', 'RDF': '', 'type': 'str'},
 'BIRTH_DATE': {'PID': 'P569', 'RDF': '', 'type': 'date'},
 'DEATH_DATE': {'PID': 'P570', 'RDF': '', 'type': 'date'},
 'BIRTH_PLACE': {'PID': 'P19', 'RDF': '', 'type': 'place'},
 'DEATH_PLACE': {'PID': 'P20', 'RDF': '', 'type': 'place'},
 'OCCUPATION': {'PID': 'P106', 'RDF': '', 'type': 'list (str)'},
 'DESCRIPTION': {'type': 'longstr'},
 'NOTE': {'type': 'longstr'}}

In [133]:
qcode_urls = search_results.index.tolist()
qcodes = [re.findall(r"(Q\d+)", u)[0] for u in qcode_urls]

pid_mapping = {k: v['PID'] for k, v in field_mapping.PEOPLE.items() if 'PID' in v and v['PID'].startswith('P')}
pid_mapping

{'FIRSTMID_NAME': 'P735',
 'LASTSUFF_NAME': 'P734',
 'BIRTH_DATE': 'P569',
 'DEATH_DATE': 'P570',
 'BIRTH_PLACE': 'P19',
 'DEATH_PLACE': 'P20',
 'OCCUPATION': 'P106'}

In [137]:
pd.DataFrame(row).T[['JOINED_NAME', 'FREETEXT'] + list(pid_mapping.keys())]

Unnamed: 0,JOINED_NAME,FREETEXT,FIRSTMID_NAME,LASTSUFF_NAME,BIRTH_DATE,DEATH_DATE,BIRTH_PLACE,DEATH_PLACE,OCCUPATION
13380,Robert Stirling,"ODNB: Ben Marsden, ‘Stirling, Robert (1790–1878)’, Oxford Dictionary of National Biography, Oxford University Press, 2004 [http://www.oxforddnb.com/view/article/26534] Robert Stirling (1790–1878): doi:10.1093/ref:odnb/26534 nan",Robert,Stirling,1790-10-25,1878-06-06,"Methven, Perth and Kinross, Scotland, United Kingdom",,inventor; clergyman


In [136]:
res = retrieve.get_wikidata_fields(qcodes, pid_mapping)
res

Unnamed: 0,item,itemLabel,itemDescription,altLabel,FIRSTMID_NAMELabel,LASTSUFF_NAMELabel,BIRTH_DATELabel,DEATH_DATELabel,BIRTH_PLACELabel,DEATH_PLACELabel,OCCUPATIONLabel
0,http://www.wikidata.org/entity/Q16063040,Robert Stirling Hore Anderson,Australian politician,Robert Anderson,Robert,,1821-01-01T00:00:00Z,1883-10-26T00:00:00Z,,,politician
1,http://www.wikidata.org/entity/Q18810054,Alfred Stirling,Australian diplomat,,Alfred,,1902-09-08T00:00:00Z,1981-07-03T00:00:00Z,Melbourne,Melbourne,diplomat
2,http://www.wikidata.org/entity/Q2907840,Bob Stirling,English rugby union footballer,,Bob,,1919-09-04T00:00:00Z,1991-01-15T00:00:00Z,Lichfield,Halton,rugby union player
3,http://www.wikidata.org/entity/Q366209,Robert Stirling,"clergyman, engineer, inventor of the Stirling engine",,Robert,Stirling,1790-10-25T00:00:00Z,1878-06-06T00:00:00Z,Methven,Galston,"[engineer, inventor, cleric]"
4,http://www.wikidata.org/entity/Q5561303,Gilbert de Stirling,Scottish bishop,,Gilbert,,,1247-01-01T00:00:00Z,,,priest
5,http://www.wikidata.org/entity/Q6077165,Isaac Stirling,Canadian politician,,Isaac,,1866-04-08T00:00:00Z,1935-10-01T00:00:00Z,,,politician
6,http://www.wikidata.org/entity/Q7350082,Robert Stirling Newall,Scottish engineer and astronomer,,Robert,,1812-05-27T00:00:00Z,1889-04-21T00:00:00Z,,Gateshead,"[engineer, astronomer]"
7,http://www.wikidata.org/entity/Q76178526,Robert Stirling,(1892-1915),,Robert,,1892-05-24T00:00:00Z,1915-02-19T00:00:00Z,,,
8,http://www.wikidata.org/entity/Q76340733,Robert Stirling,(1792-1860),,Robert,,1792-10-19T00:00:00Z,1860-12-09T00:00:00Z,,,
9,http://www.wikidata.org/entity/Q89042134,Robert Stirling,researcher,,Robert,,,,,,
