# Analysing Unlinked Entities

In [100]:
import sys
sys.path.append("../../..")

from heritageconnector.utils.sparql import get_sparql_results
from heritageconnector.datastore import es

import pandas as pd
from IPython.display import display

# Wikifier
import requests
from urllib.parse import quote
import json

# analysing description samples (s3)
from tqdm.auto import tqdm
from collections import Counter
from heritageconnector.utils.generic import flatten_list_of_lists

In [3]:
# SPARQL endpoint that run_query sends its queries to.
endpoint = "http://63.33.68.17:3030/heritage-connector/sparql"

In [21]:
# For every predicate whose URI starts with http://www.heritageconnector.org/RDF/entity
# (except those ending in DATE, LANGUAGE or NORP), count how often each object value occurs
# and return the 500 most frequent (entityName, predicate) pairs.
# Object values starting with "http" are excluded - presumably those are entities that are
# already linked to a URI; confirm against the graph schema.
query = """
PREFIX hc: <http://www.heritageconnector.org/RDF/>

SELECT ?entityName ?p (COUNT(?entityName) AS ?nameCount) WHERE {
  {SELECT DISTINCT ?p WHERE{
     FILTER(STRSTARTS(STR(?p), "http://www.heritageconnector.org/RDF/entity")).
     FILTER(!STRENDS(STR(?p), "DATE")).
     FILTER(!STRENDS(STR(?p), "LANGUAGE")).
     FILTER(!STRENDS(STR(?p), "NORP")).
     ?s ?p ?o.
    }
  }
  ?subject ?p ?entityName.
  FILTER(!STRSTARTS(STR(?entityName), "http")).

} GROUP BY ?entityName ?p ORDER BY DESC(?nameCount) LIMIT 500
"""

def run_query(query):
    """Run a SPARQL query against `endpoint` and return its bindings as a DataFrame.

    The bindings are flattened with `pd.json_normalize`; only the columns ending in
    "value" are kept, and ".value" is removed from their names.
    """
    bindings = get_sparql_results(endpoint, query)['results']['bindings']
    flat_bindings = pd.json_normalize(bindings)

    value_columns = [name for name in flat_bindings.columns if name.endswith("value")]
    values_only = flat_bindings[value_columns]

    return values_only.rename(columns=lambda name: name.replace(".value", ""))
     
# Strip the Heritage Connector namespace from the predicate URIs and make the counts numeric.
res = run_query(query).assign(
    p=lambda frame: frame['p'].map(
        lambda uri: uri.replace("http://www.heritageconnector.org/RDF/", "")
    ),
    nameCount=lambda frame: frame['nameCount'].astype(int),
)

res

Unnamed: 0,entityName,p,nameCount
0,London,entityLOC,17315
1,England,entityLOC,8809
2,Glass,entityPERSON,7178
3,John Pollock & Co.,entityORG,4008
4,Bronze,entityPERSON,3068
...,...,...,...
495,Siebe Gorman,entityPERSON,71
496,Sons Ltd.,entityORG,71
497,Uganda,entityLOC,71
498,the Common Cold Research Unit,entityEVENT,71


## 1. Get Top 30 Unlinked Entities for each type

In [22]:
n = 30

# One table per predicate (in order of first appearance in `res`), most frequent names first.
for p in res['p'].unique():
    rows_for_predicate = res.loc[res['p'] == p]
    ranked = rows_for_predicate.sort_values(by='nameCount', ascending=False)
    display(ranked.head(n))

Unnamed: 0,entityName,p,nameCount
0,London,entityLOC,17315
1,England,entityLOC,8809
5,"London, England",entityLOC,2863
7,Paris,entityLOC,2507
8,Germany,entityLOC,1759
9,France,entityLOC,1744
11,Sheffield,entityLOC,1685
12,Birmingham,entityLOC,1677
13,Manchester,entityLOC,1584
14,Europe,entityLOC,1537


Unnamed: 0,entityName,p,nameCount
2,Glass,entityPERSON,7178
4,Bronze,entityPERSON,3068
6,Bottle,entityPERSON,2516
19,Sarah Wilson,entityPERSON,910
28,Roman,entityPERSON,694
42,Eastman Kodak,entityPERSON,492
61,Patrick Stephens Limited,entityPERSON,390
64,Mayer,entityPERSON,381
65,Wellingborough,entityPERSON,378
67,Laurence Olivier,entityPERSON,354


Unnamed: 0,entityName,p,nameCount
3,John Pollock & Co.,entityORG,4008
16,London Midland & Scottish Railway,entityORG,1150
31,Lithograph,entityORG,638
35,the Colour Museum,entityORG,599
41,Manchester Sheffield & Lincolnshire Railway,entityORG,501
45,Lancet,entityORG,468
48,the Russian Imperial Royal Family,entityORG,464
50,Elkington & Co,entityORG,459
51,Ferranti Ltd,entityORG,439
52,North Eastern Railway,entityORG,436


Unnamed: 0,entityName,p,nameCount
10,Roman,entityOBJECT,1742
162,BR35014,entityOBJECT,169
195,Lomb,entityOBJECT,147
458,The Non-Conformists',entityOBJECT,76
468,Dimensions,entityOBJECT,74


Unnamed: 0,entityName,p,nameCount
17,Southern Railway,entityFAC,972
24,Great Western Railway,entityFAC,720
83,Great Northern Railway,entityFAC,269
100,Birmingham Railway,entityFAC,236
108,Midland Railway,entityFAC,224
120,Great Central Railway,entityFAC,212
129,Fleet Street,entityFAC,204
146,Manchester Railway,entityFAC,186
158,Great Eastern Railway,entityFAC,172
204,Belgrave Square,entityFAC,142


Unnamed: 0,entityName,p,nameCount
227,ICI Photography Awards (European Section,entityEVENT,134
258,the First World War,entityEVENT,122
354,World War II,entityEVENT,94
387,the Herschel Album,entityEVENT,88
397,the Second World War,entityEVENT,86
400,ICI Photography Awards,entityEVENT,85
498,the Common Cold Research Unit,entityEVENT,71


## 2. Try Wikifier for linking these entities

In [24]:
import os

wikifier_endpoint = "http://www.wikifier.org/annotate-article?"
# NOTE(review): an API key is committed in this notebook. Supply your own key through the
# WIKIFIER_USERKEY environment variable; the literal fallback only keeps existing runs
# working and should be removed once the key has been rotated.
wikifier_userkey = os.environ.get("WIKIFIER_USERKEY", "znmgsoqelkdxlhpznnyfplptrrjxnk")

In [124]:
def _mention_tokens(mention: str) -> set:
    """Return the lower-cased whitespace tokens of `mention`, with and without edge punctuation.

    Wikifier reports matched words without the punctuation that follows them (e.g. "cyanide"
    for "cyanide,"), so a raw token such as "london," or "co." could never match. The raw
    tokens are kept as well so that nothing that matched before is lost.
    """
    import string

    tokens = set()
    for raw_token in mention.lower().split():
        tokens.add(raw_token)
        stripped = raw_token.strip(string.punctuation)
        if stripped:
            tokens.add(stripped)
    return tokens


def get_wikifier_results(text, ent_text_filter: str = None, min_pagerank = 0.015, min_linkprob = 0.8, timeout: float = 60):
    """Annotate `text` with Wikifier and return the candidate pages that pass the filters.

    Args:
        text: the document to annotate.
        ent_text_filter: optional entity mention. When given, a candidate is kept only if at
            least one of the words Wikifier matched in `text` (its `wordsUsed`) is also a token
            of the mention. The comparison is case-insensitive and ignores punctuation attached
            to the mention's tokens.
        min_pagerank: minimum `pageRank` a candidate needs in order to be kept.
        min_linkprob: minimum `linkProb` a candidate needs in order to be kept.
        timeout: seconds to wait for Wikifier before giving up.

    Returns:
        A list of candidate dicts as returned by Wikifier, each extended with the `wFrom`,
        `wTo` and `wordsUsed` values of the range it was found in.

    Raises:
        requests.HTTPError: if Wikifier answers with an HTTP error status.
        requests.Timeout: if Wikifier does not answer within `timeout` seconds.
        KeyError: if the JSON response has no 'ranges' entry.
    """
    # The pageRankSqThreshold sent to the service is fixed; `min_pagerank` is a separate,
    # client-side filter applied to each candidate below.
    payload = (
        f'userKey={wikifier_userkey}&text={quote(text)}&lang=en'
        '&pageRankSqThreshold=0.015&applyPageRankSqThreshold=true'
        '&nWordsToIgnoreFromList=200&nTopDfValuesToIgnore=200'
    )
    headers = {
      'Content-Type': 'application/x-www-form-urlencoded'
    }

    # Without a timeout a stalled connection would block a whole batch run indefinitely.
    response = requests.request("POST", wikifier_endpoint, headers=headers, data=payload, timeout=timeout)
    response.raise_for_status()
    ranges = response.json()['ranges']

    filtered_wikifier_res = []

    for item in ranges:
        for candidate in item['candidates']:
            if candidate['pageRank'] >= min_pagerank and candidate['linkProb'] >= min_linkprob:
                candidate.update({"wFrom": item["wFrom"], "wTo": item["wTo"], "wordsUsed": item["wordsUsed"]})
                filtered_wikifier_res.append(candidate)

    # ent_text_filter only keeps candidates whose matched words (`wordsUsed`) share at least
    # one token with the filter text.
    if ent_text_filter is not None:
        filter_tokens = _mention_tokens(ent_text_filter)
        filtered_wikifier_res = [
            item for item in filtered_wikifier_res
            if filter_tokens.intersection(word.lower() for word in item['wordsUsed'])
        ]

    return filtered_wikifier_res

# Try get_wikifier_results on a single example description (default thresholds, no mention filter).
# Alternative example input:
# text = "Bowman's suction curette, for soft cataract, plated metal and glass, on leather case, by John Weiss and Son, 287 Oxford Street, London, England, 1901-1930."
text = "First World War dressing, treated with cyanide, German"

wikifier_res = get_wikifier_results(text)

wikifier_res


[{'title': 'World War I',
  'url': 'http://en.wikipedia.org/wiki/World_War_I',
  'cosine': 0.1618937172520808,
  'linkCount': 25623,
  'pageRank': 0.06862261463189895,
  'prbConfidence': 0.3274557393324211,
  'linPr': 1,
  'linkProb': 0.9960349854227405,
  'logLinkProb': 0.9996438531377602,
  'score2': 0.06862261463189895,
  'isBestCand': True,
  'wFrom': 0,
  'wTo': 2,
  'wordsUsed': ['First', 'World', 'War']},
 {'title': 'Cyanide',
  'url': 'http://en.wikipedia.org/wiki/Cyanide',
  'cosine': 0.5082312238087592,
  'linkCount': 668,
  'pageRank': 0.03730919440121017,
  'prbConfidence': 0.4171264401628262,
  'linPr': 0.9913793103448276,
  'linkProb': 0.9421720733427362,
  'logLinkProb': 0.9921247368109379,
  'score2': 0.03730919440121017,
  'isBestCand': True,
  'wFrom': 7,
  'wTo': 7,
  'wordsUsed': ['cyanide']}]

## 3. Sample descriptions for unlinked entities and test Wikifier results
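- How accurate are the Wikifier links for each mention?
- How homogeneous are they, i.e. does a given mention consistently resolve to the same Wikipedia page?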

In [103]:
def get_descriptions(text: str, entity_predicate: str, n: int, random_seed: int = 42):
    """Return up to `n` descriptions of documents whose `entity_predicate` value matches `text`.

    The search runs against the 'heritageconnector' index and scores the matching documents
    with a seeded `random_score`, so the same `random_seed` is sent with every identical call.

    Args:
        text: entity mention to look up.
        entity_predicate: predicate name used to build the field `graph.@hc:<predicate>.@value.keyword`.
        n: maximum number of hits requested.
        random_seed: seed passed to `random_score`.

    Returns:
        A list with the description string of each hit.
    """
    field_name = f"graph.@hc:{entity_predicate}.@value.keyword"
    description_key = 'http://www.w3.org/2001/XMLSchema#description'

    match_clause = {"match": {field_name: {"query": text}}}
    search_body = {
        "query": {
            "function_score": {
                "query": match_clause,
                "random_score": {"seed": random_seed, "field": "_seq_no"},
            }
        }
    }

    response = es.search(index='heritageconnector', body=search_body, size=n)

    descriptions = []
    for hit in response['hits']['hits']:
        descriptions.append(hit['_source']['data'][description_key])

    return descriptions

# Request up to 100 descriptions whose entityLOC value matches "London" and peek at the first.
descriptions = get_descriptions(text="London", entity_predicate="entityLOC", n=100)

(len(descriptions), descriptions[0])


(100,
 'Brass Gregorian reflecting telescope of 2 1/2-inch aperture by James Short, London [244/1063=9.6], on an altazimuth mounting with box foot stand made from telescope case containing 2 eyepieces, brass mirror box and accessories.')

In [112]:
def get_url_count_for_entity_mention(mention_text: str, entity_predicate: str, no_per_mention: int, random_seed: int = 42):
    """Count the Wikipedia URLs Wikifier proposes for a mention across sampled descriptions.

    Args:
        mention_text: the entity mention; also used as `ent_text_filter` for Wikifier.
        entity_predicate: predicate the mention was found under (e.g. "entityLOC").
        no_per_mention: number of descriptions to request for the mention.
        random_seed: seed forwarded to `get_descriptions`.

    Returns:
        A Counter mapping each URL to the number of distinct descriptions it was returned for.
        Each URL counts at most once per description, and identical descriptions count once.
    """
    mention_descriptions = get_descriptions(
        mention_text, entity_predicate, no_per_mention, random_seed=random_seed
    )

    mention_urls = {}

    # Results are keyed by description text, so a repeated description can only ever
    # contribute once - skip the repeats up front instead of querying Wikifier again.
    for desc in dict.fromkeys(mention_descriptions):
        wikifier_res = get_wikifier_results(desc, ent_text_filter=mention_text)

        if len(wikifier_res) > 0:
            mention_urls[desc] = list({item['url'] for item in wikifier_res})

    return Counter(flatten_list_of_lists(mention_urls.values()))

get_url_count_for_entity_mention("London", "entityLOC", no_per_mention=5)

Counter({'http://en.wikipedia.org/wiki/London': 4})

In [125]:
top_n_per_predicate = 10

wikifier_counts_data = {}

for p in res['p'].unique():
    print(f"---{p}---")
    # Register the per-predicate dict first so partial results survive an interrupted run.
    counts_for_predicate = wikifier_counts_data[p] = {}

    prop_df = (
        res.loc[res['p'] == p]
        .sort_values('nameCount', ascending=False)
        .head(top_n_per_predicate)
    )

    # prop_df only holds rows whose predicate is p, so p stands in for the row's own 'p' value.
    for entity_name in tqdm(prop_df['entityName'], total=len(prop_df)):
        counts_for_predicate[entity_name] = get_url_count_for_entity_mention(
            entity_name, p, no_per_mention=25
        )

---entityLOC---


  0%|          | 0/10 [00:00<?, ?it/s]

---entityPERSON---


  0%|          | 0/10 [00:00<?, ?it/s]

---entityORG---


  0%|          | 0/10 [00:00<?, ?it/s]

---entityOBJECT---


  0%|          | 0/5 [00:00<?, ?it/s]

---entityFAC---


  0%|          | 0/10 [00:00<?, ?it/s]

---entityEVENT---


  0%|          | 0/7 [00:00<?, ?it/s]

In [126]:
wikifier_counts_data

{'entityLOC': {'London': Counter({'http://en.wikipedia.org/wiki/London': 19,
           'http://en.wikipedia.org/wiki/London_and_North_Eastern_Railway': 1,
           'http://en.wikipedia.org/wiki/London_North_Eastern_Railway': 1,
           'http://en.wikipedia.org/wiki/University_College_London': 1,
           'http://en.wikipedia.org/wiki/Imperial_College_London': 1,
           'http://en.wikipedia.org/wiki/Grosvenor_Square': 1,
           'http://en.wikipedia.org/wiki/London_and_Birmingham_Railway': 1,
           'http://en.wikipedia.org/wiki/%2elondon': 1,
           'http://en.wikipedia.org/wiki/Strand,_London': 1}),
  'England': Counter({'http://en.wikipedia.org/wiki/England': 23,
           'http://en.wikipedia.org/wiki/Middlesex': 1,
           'http://en.wikipedia.org/wiki/London': 3,
           'http://en.wikipedia.org/wiki/Manchester': 6,
           'http://en.wikipedia.org/wiki/Warwickshire': 1,
           'http://en.wikipedia.org/wiki/Wembley': 1}),
  'London, England': C

In [128]:
import json

# with open('unlinked entities analysis.json', 'w') as f:
#     json.dump(wikifier_counts_data, f)

In [131]:
# Tuples are serialised as JSON arrays.
json.dumps({"text": "text_doc", "spans": [(41, 16)]})

'{"text": "text_doc", "spans": [[41, 16]]}'