# Step Three (A): Find Similar Entities via Adapted Wikipedia2vec most_similar()

We have now returned all of the entities we'll get from direct querying of the package. We must now use alternate measures to identify candidate entities and select from that pool.

#### Import Packages

In [1]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

In [2]:
# Package
from wikipedia2vec import Wikipedia2Vec

# Class to compare type
from wikipedia2vec.dictionary import Entity

In [10]:
%%time
# Load unzipped pkl file with word embeddings
w2v = Wikipedia2Vec.load("../../embeddings/enwiki_20180420_100d.pkl")

CPU times: user 97.8 ms, sys: 181 ms, total: 279 ms
Wall time: 463 ms


## Load previously made predictions

In [99]:
# Base path to input
preds_path = '../../predictions/'

# Load data
entity_disambiguation = pd.read_csv(os.path.join(preds_path, "wikipedia2vec_getentities.csv"), delimiter=",")
entity_disambiguation.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
5,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708.0,Brussels,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
6,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
7,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
8,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
9,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",


In [103]:
# Print current status for missing estimate count
print("Mentions without Prediction: ",\
      round(entity_disambiguation['preds_w2v_getentity'].isnull().sum()/len(entity_disambiguation)*100,3),"%")
print("Non-None Mentions without Prediction: ",\
      round(len(entity_disambiguation[entity_disambiguation['wikipedia_page_ID'].notnull()\
                                  & entity_disambiguation['preds_w2v_getentity'].isnull()])\
            /len(entity_disambiguation[entity_disambiguation['wikipedia_page_ID'].notnull()])*100,3),"%")

Mentions without Prediction:  39.148 %
Non-None Mentions without Prediction:  27.362 %


## Find most similar entity using Wikipedia2vec

We now turn to using a variation on Wikipedia2vec's `most_similar()` function to find, for entered words, the most similar entity. We do this as an added-layer, meaning only for those without an estimate, and for all full mentions, to compare performance.

In [104]:
# Normalize full_mentions to lower case for entry into most_similar() function
full_mention_norm = np.array([x.lower() for x in entity_disambiguation['full_mention']])
entity_disambiguation['full_mention_norm'] = full_mention_norm
entity_disambiguation.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity,full_mention_norm
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union,eu
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",,german
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",,british


In [105]:
### Test single full mention query time
start_time = time.time()

# Print word
search_word = entity_disambiguation['full_mention_norm'][2]
print("Search Word: ", search_word)

# Get most similar word
count_similar = 100
similar = w2v.most_similar(w2v.get_word(str(search_word)), count_similar)

# Retrieve only entities from word
entities = []
return_similar = 10
for i in similar:
#     print(type(i[0]))
    if isinstance(i[0], Entity):
        entities.append(i)
    if len(entities) == return_similar:
        break
end_time = time.time()
print(f"Single Word Query Time: {round(end_time - start_time, 2)}s")
display(entities)

Search Word:  british
Single Word Query Time: 31.06s


[(<Entity Russians in the United Kingdom>, 0.6155642),
 (<Entity Henry Wood (naval officer)>, 0.60587233),
 (<Entity D.N. Penfold>, 0.6025652),
 (<Entity Christopher J. Burgess>, 0.5973172),
 (<Entity Commonwealth of Nations>, 0.59682524),
 (<Entity British Empire>, 0.59353995),
 (<Entity File:Flag of The Commonwealth.svg>, 0.59272784),
 (<Entity Dial 999 (1938 film)>, 0.592148),
 (<Entity Black British>, 0.58818066),
 (<Entity Numa François Henri Sadoul>, 0.5852482)]

In [132]:
# Prepare output array
most_similar_entities = []
most_similar_scores = []
get_similar_candidate_pool = []
get_similar_candidate_scores = []

# Track metrics
success_word_query = 0
start_time = time.time()

# Provide filter ability
size = 1000

for mention in tqdm(entity_disambiguation['full_mention_norm'][:size]):
    
    # Retrieve w2v word from mention
    word = w2v.get_word(str(mention))
    
    # Save candidate pool
    candidate_pool = []
    candidate_scores = []
    
    if word is not None:
        success_word_query += 1
        
        # Search most similar words/entities from found word
        # Retrieve 50 most similar to ensure at least one entity returned
        similars = w2v.most_similar(word, 50)

        # Retrieve most similar entity
        most_similar = None
        for s in similars:
            if isinstance(s[0], Entity):
                candidate_pool.append(s[0].title)
                candidate_scores.append(s[1])
                if most_similar is None:
                    most_similar = s
                
    # Save lists
    get_similar_candidate_pool.append(candidate_pool)
    get_similar_candidate_scores.append(candidate_scores)
    
    if most_similar is not None:
        most_similar_entities.append(most_similar[0].title)
        most_similar_scores.append(most_similar[1])
    else:
        most_similar_entities.append(None)
        most_similar_scores.append(None)

    
print("Successfully Found Words: ", round(success_word_query/size*100,3),"%")
execution_time = time.time() - start_time
print("Execution time: ", round(execution_time, 3),"s")

100%|██████████| 1000/1000 [1:07:27<00:00,  4.05s/it] 

Successfully Found Words:  44.7 %
Execution time:  4047.262 s





In [133]:
# Append to dataframe
mini_df = entity_disambiguation[:size].copy()
mini_df['preds_w2v_mostsimilar'] = most_similar_entities
mini_df['score_w2v_mostsimilar'] = most_similar_scores
mini_df['candidate_pool_mostsimilar'] = get_similar_candidate_pool
mini_df['candidate_scores_mostsimilar'] = get_similar_candidate_scores
mini_df.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity,full_mention_norm,preds_w2v_mostsimilar,score_w2v_mostsimilar,candidate_pool_mostsimilar,candidate_scores_mostsimilar
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union,eu,European Union,0.787421,"[European Union, European Free Trade Associati...","[0.7874206, 0.7662648, 0.76082164, 0.76052165,..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."


In [134]:
# Estimate length of time to run over full dataset
print("Estimated Duration for Full Dataset: ",\
     round((len(entity_disambiguation)/size)*execution_time/60/60,2), " hours")

Estimated Duration for Full Dataset:  32.95  hours


In [135]:
# Estimate length of time to run over just full_mentions without a guess from get_entity
print("Estimated Duration for Dataset without Prior Prediction: ",\
     round((len(entity_disambiguation[entity_disambiguation['preds_w2v_getentity'].isnull()])/size)*execution_time/60/60,2), " hours")

Estimated Duration for Dataset without Prior Prediction:  12.9  hours


## Calculate Accuracy of Most Similar Entity Predictions

In [136]:
# Calculate accuracy
accurate_predictions = (mini_df['preds_w2v_mostsimilar'] == mini_df['wikipedia_title']).sum()
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(mini_df) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 20.3%
****************************


In [137]:
# Calculate percentage of candidate pools with the correct answer present
# Use Wikipedia Title
# Necessary to determine if shuffling pool could even get the right answer
response_present = [mini_df['wikipedia_title'][i] in mini_df['candidate_pool_mostsimilar'][i] for i in range(len(mini_df))]
print(f"Correct answer is present in {round(sum(response_present) / len(mini_df) * 100, 3)}% of generated candidate pools via adapted Wikipedia2vec's most_similar() method.")

Correct answer is present in 25.6% of generated candidate pools via adapted Wikipedia2vec's most_similar() method.


## Save predictive dataframe for input to next step

In [138]:
#Final DF
mini_df.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity,full_mention_norm,preds_w2v_mostsimilar,score_w2v_mostsimilar,candidate_pool_mostsimilar,candidate_scores_mostsimilar
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union,eu,European Union,0.787421,"[European Union, European Free Trade Associati...","[0.7874206, 0.7662648, 0.76082164, 0.76052165,..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,peter blackburn,Russians in the United Kingdom,0.615564,[],[]
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,peter blackburn,Russians in the United Kingdom,0.615564,[],[]
5,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708.0,Brussels,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,brussels,Brussels,0.88707,"[Brussels, Ghent, Timeline of Brussels, Brusse...","[0.8870699, 0.7689268, 0.76867557, 0.7681168, ..."
6,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission,european commission,Brussels,0.88707,[],[]
7,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission,european commission,Brussels,0.88707,[],[]
8,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
9,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."


In [140]:
# Save dataframe
mini_df.to_csv(os.path.join(preds_path, "wikipedia2vec_most_similar_1000.csv"), index=False)