# Find Similar Entities

We have now returned all of the entities we'll get from direct querying of the package. We must now use alternate measures to identify candidate entities and select from that pool.

#### Import Packages

In [1]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

In [2]:
# Package
from wikipedia2vec import Wikipedia2Vec

# Class to compare type
from wikipedia2vec.dictionary import Entity

In [10]:
%%time
# Load unzipped pkl file with word embeddings
w2v = Wikipedia2Vec.load("../../embeddings/enwiki_20180420_100d.pkl")

CPU times: user 97.8 ms, sys: 181 ms, total: 279 ms
Wall time: 463 ms


## Load previously made predictions

In [5]:
# Base path to input
preds_path = '../../predictions/'

# Load data
entity_disambiguation = pd.read_csv(os.path.join(preds_path, "wikipedia2vec_getentities.csv"), delimiter=",")
entity_disambiguation.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",
1,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,0,0,"['EU', 'German', 'British']",
2,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
3,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
4,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
5,B,German,http://en.wikipedia.org/wiki/Germany,11867,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
6,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
7,B,Germany,http://en.wikipedia.org/wiki/Germany,11867,2,0,"['Germany', 'European Union', 'Werner Zwingman...",Germany
8,B,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union
9,I,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union


In [7]:
# Print current status for missing estimate count
print("Mentions without Prediction: ",\
      round(entity_disambiguation['preds_w2v_getentity'].isnull().sum()/len(entity_disambiguation)*100,3),"%")

Mentions without Prediction:  27.362 %


## Find most similar entity using Wikipedia2vec

We now turn to using a variation on Wikipedia2vec's `most_similar()` function to find, for entered words, the most similar entity. We do this as an added-layer, meaning only for those without an estimate, and for all full mentions, to compare performance.

In [15]:
# Normalize full_mentions to lower case for entry into most_similar() function
full_mention_norm = np.array([x.lower() for x in entity_disambiguation['full_mention']])
entity_disambiguation['full_mention_norm'] = full_mention_norm
entity_disambiguation.head(1)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity,full_mention_norm
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",,german


In [16]:
### Test single full mention query time
start_time = time.time()

# Print word
search_word = entity_disambiguation['full_mention_norm'][0]
print("Search Word: ", search_word)

# Get most similar word
count_similar = 100
similar = w2v.most_similar(w2v.get_word(str(search_word)), count_similar)

# Retrieve only entities from word
entities = []
return_similar = 10
for i in similar:
#     print(type(i[0]))
    if isinstance(i[0], Entity):
        entities.append(i)
    if len(entities) == return_similar:
        break
end_time = time.time()
print(f"Single Word Query Time: {round(end_time - start_time, 2)}s")
display(entities)

Search Word:  german
Single Word Query Time: 27.12s


[(<Entity Culture of Germany>, 0.68680257),
 (<Entity 1860 in Germany>, 0.6840672),
 (<Entity 1866 in Germany>, 0.6836185),
 (<Entity Template:Focke-Wulf aircraft>, 0.68068546),
 (<Entity 1847 in Germany>, 0.680415),
 (<Entity File:Map-GermanConfederation.svg>, 0.67916125),
 (<Entity 1858 in Germany>, 0.6769093),
 (<Entity 1856 in Germany>, 0.67657095),
 (<Entity 1865 in Germany>, 0.6758945),
 (<Entity 1857 in Germany>, 0.67497927)]

In [38]:
%%time

# Prepare output array
most_similar_entities = []
most_similar_scores = []

# Track metrics
success_word_query = 0

# Provide filter ability
size = 10

for mention in tqdm(entity_disambiguation['full_mention_norm'][:size]):
    
    # Retrieve w2v word from mention
    word = w2v.get_word(str(mention))
    
    if word is not None:
        success_word_query += 1
        
        # Search most similar words/entities from found word
        # Retrieve 50 most similar to ensure at least one entity returned
        similars = w2v.most_similar(word, 50)

        # Retrieve most similar entity
        most_similar = None
        for s in similars:
            if isinstance(s[0], Entity):
                most_similar = s
            if most_similar is not None:
                break
    else:
        most_similar = (None, None)
    
    # Save most similar
    most_similar_entities.append(most_similar[0].title)
    most_similar_scores.append(most_similar[1])
    
print("Successfully Found Words: ", round(success_word_query/size*100,3),"%")

100%|██████████| 10/10 [02:40<00:00, 16.08s/it]

Successfully Found Words:  60.0 %
CPU times: user 27.2 s, sys: 39.9 s, total: 1min 7s
Wall time: 2min 40s





In [39]:
# Append to dataframe
mini_df = entity_disambiguation[:size].copy()
mini_df['preds_w2v_mostsimilar'] = most_similar_entities
mini_df['score_w2v_mostsimilar'] = most_similar_scores
mini_df.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity,full_mention_norm,preds_w2v_mostsimilar,score_w2v_mostsimilar
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",,german,<Entity Culture of Germany>,0.686803
1,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,0,0,"['EU', 'German', 'British']",,british,<Entity Russians in the United Kingdom>,0.615564
2,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,brussels,<Entity Brussels>,0.88707
