# Implementing Search in the Heritage Connector

In [97]:
# Enable IPython's autoreload extension in mode 2: all imported modules
# (including `heritageconnector`) are reloaded before each cell executes, so
# source edits are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Make the directory two levels above this notebook importable; the
# `heritageconnector` import below presumably relies on that being the
# repository root — confirm.
import sys
sys.path.append("../..")

from heritageconnector.disambiguation import search

import pandas as pd
# Display every row of a DataFrame instead of truncating long result tables.
pd.set_option('display.max_rows', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Wikidata search

In [87]:
# Property constraints used by the filtered Wikidata search further down.
property_filters = {"P106": "Q39631"}  # occupation: physician

wd_search = search.wikidata_text_search()

# Baseline query: restricted by instance-of ("Q5") only, no property filter.
# The result is kept in `wd_res` so it can be combined with other results later.
wd_res = wd_search.run_search(
    "John Snow",
    instanceof_filter="Q5",
    include_class_tree=True,
)
wd_res

Unnamed: 0,rank,item,itemLabel,score
0,1,http://www.wikidata.org/entity/Q356407,John Snow,0.133333
1,2,http://www.wikidata.org/entity/Q1701785,John Snowden,0.12381
2,3,http://www.wikidata.org/entity/Q3526167,John Snow,0.114286
3,4,http://www.wikidata.org/entity/Q6258450,John Snowdon Henry,0.104762
4,5,http://www.wikidata.org/entity/Q91260459,John Snowdon,0.095238
5,6,http://www.wikidata.org/entity/Q18530243,John Cecil,0.085714
6,7,http://www.wikidata.org/entity/Q53343743,John Snow,0.07619
7,8,http://www.wikidata.org/entity/Q75346361,John Snowdon Henry,0.066667
8,9,http://www.wikidata.org/entity/Q75600319,John Snow,0.057143
9,10,http://www.wikidata.org/entity/Q75631594,John Snow,0.047619


In [38]:
# Same Wikidata query as the baseline, additionally narrowed by
# `property_filters` (occupation: physician).
wd_search.run_search(
    "John Snow",
    instanceof_filter="Q5",
    include_class_tree=True,
    property_filters=property_filters,
)

Unnamed: 0,rank,item,itemLabel,score
0,1,http://www.wikidata.org/entity/Q356407,John Snow,1.0


### Wikipedia search
Results are ranked by a custom fuzzy search method, which:
1. excludes any text in brackets from the target string
2. sets the match score to 0 if there are no tokens in common between the source and target strings
3. calculates a Levenshtein-based similarity score between the token sets of the source and target strings (`fuzz.token_set_ratio`)

In [86]:
wk_search = search.wikipedia_text_search()

# Wikipedia-based query for the same name, restricted by instance-of ("Q5")
# and passing limit=100. The result is kept in `wk_res` for combining later.
wk_res = wk_search.run_search(
    "John Snow",
    instanceof_filter="Q5",
    limit=100,
)
wk_res

Unnamed: 0,rank,item,wikipedia_title,score
0,1,http://www.wikidata.org/entity/Q3526167,John Snow (cricketer),0.033898
1,2,http://www.wikidata.org/entity/Q6241280,John J. Snow Jr.,0.033314
2,3,http://www.wikidata.org/entity/Q20737768,John James Snow Jr.,0.032729
3,4,http://www.wikidata.org/entity/Q356407,John Snow,0.032145
4,5,http://www.wikidata.org/entity/Q449689,John W. Snow,0.03156
5,6,http://www.wikidata.org/entity/Q648366,Hank Snow,0.030976
6,7,http://www.wikidata.org/entity/Q15999980,Henry Snow,0.030392
7,8,http://www.wikidata.org/entity/Q7117330,P. J. Snow,0.029807
8,9,http://www.wikidata.org/entity/Q5442677,Felton Snow,0.029223
9,10,http://www.wikidata.org/entity/Q357301,John Newton,0.028638


In [84]:
# Wikidata search narrowed by `property_filters`.
# NOTE(review): this repeats the earlier filtered Wikidata query and uses
# `wd_search`, although it sits in the Wikipedia section — confirm whether a
# Wikipedia-based filtered search was intended here.
wd_search.run_search(
    "John Snow",
    instanceof_filter="Q5",
    include_class_tree=True,
    property_filters=property_filters,
)

Unnamed: 0,rank,item,itemLabel,score
0,1,http://www.wikidata.org/entity/Q356407,John Snow,1.0


### Combining results
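Taking the mean of each item's scores across the search methods. An item that is missing from one set of results contributes a score of 0 for that method (e.g. an item found only by Wikidata search ends up with half of its Wikidata score).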

In [112]:
# Merge the Wikipedia and Wikidata result tables into one score per item,
# requesting the top 20 entries.
search.combine_results(
    [wk_res, wd_res],
    topn=20,
)

item
http://www.wikidata.org/entity/Q356407      0.082739
http://www.wikidata.org/entity/Q3526167     0.074092
http://www.wikidata.org/entity/Q1701785     0.061905
http://www.wikidata.org/entity/Q6258450     0.052381
http://www.wikidata.org/entity/Q91260459    0.047619
http://www.wikidata.org/entity/Q18530243    0.042857
http://www.wikidata.org/entity/Q53343743    0.038095
http://www.wikidata.org/entity/Q75346361    0.033333
http://www.wikidata.org/entity/Q75600319    0.028571
http://www.wikidata.org/entity/Q75631594    0.023810
http://www.wikidata.org/entity/Q76007571    0.019048
http://www.wikidata.org/entity/Q6241280     0.016657
http://www.wikidata.org/entity/Q20737768    0.016365
http://www.wikidata.org/entity/Q449689      0.015780
http://www.wikidata.org/entity/Q648366      0.015488
http://www.wikidata.org/entity/Q15999980    0.015196
http://www.wikidata.org/entity/Q7117330     0.014904
http://www.wikidata.org/entity/Q5442677     0.014611
http://www.wikidata.org/entity/Q357301   

### Testing with Objects
Objects tend to have longer, more complex names, which are often missed by Wikidata search and the OpenRefine Wikidata Reconciliation Service.

In [113]:
# Wikipedia text search using a full, descriptive object title as the query.
wk_search.run_search(
    "Nokia 'N-Gage' mobile telephone, 2004-2007"
)

Unnamed: 0,rank,item,wikipedia_title,score
0,1,http://www.wikidata.org/entity/Q1418,Nokia,1.0
1,2,http://www.wikidata.org/entity/Q336434,N-Gage (device),0.0


In [114]:
# Wikidata text search with the same object title.
# NOTE(review): the saved notebook shows no output for this cell — presumably
# the query returned no matches; confirm on a fresh run.
wd_search.run_search(
    "Nokia 'N-Gage' mobile telephone, 2004-2007"
)

In [116]:
# Combined search through the module-level `search.run` entry point,
# requesting the top 20 items for the same object title.
search.run(
    "Nokia 'N-Gage' mobile telephone, 2004-2007",
    topn=20,
)

item
http://www.wikidata.org/entity/Q1418      0.5
http://www.wikidata.org/entity/Q336434    0.0
Name: score, dtype: float64