# Find Similar Entities

We have now returned all of the entities we'll get from direct querying of the package. We must now use alternate measures to identify candidate entities and select from that pool.

#### Import Packages

In [1]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

In [2]:
# Package
from wikipedia2vec import Wikipedia2Vec

# Class to compare type
from wikipedia2vec.dictionary import Entity

In [10]:
%%time
# Load unzipped pkl file with word embeddings
w2v = Wikipedia2Vec.load("../../embeddings/enwiki_20180420_100d.pkl")

CPU times: user 97.8 ms, sys: 181 ms, total: 279 ms
Wall time: 463 ms


## Load previously made predictions

In [5]:
# Base path to input
preds_path = '../../predictions/'

# Load data
entity_disambiguation = pd.read_csv(os.path.join(preds_path, "wikipedia2vec_getentities.csv"), delimiter=",")
entity_disambiguation.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",
1,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,0,0,"['EU', 'German', 'British']",
2,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
3,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
4,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
5,B,German,http://en.wikipedia.org/wiki/Germany,11867,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
6,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
7,B,Germany,http://en.wikipedia.org/wiki/Germany,11867,2,0,"['Germany', 'European Union', 'Werner Zwingman...",Germany
8,B,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union
9,I,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union


In [7]:
# Print current status for missing estimate count
print("Mentions without Prediction: ",\
      round(entity_disambiguation['preds_w2v_getentity'].isnull().sum()/len(entity_disambiguation)*100,3),"%")

Mentions without Prediction:  27.362 %


## Find most similar entity using Wikipedia2vec

We now turn to using a variation on Wikipedia2vec's `most_similar()` function to find, for entered words, the most similar entity. We do this as an added-layer, meaning only for those without an estimate, and for all full mentions, to compare performance.

In [15]:
# Normalize full_mentions to lower case for entry into most_similar() function
full_mention_norm = np.array([x.lower() for x in entity_disambiguation['full_mention']])
entity_disambiguation['full_mention_norm'] = full_mention_norm
entity_disambiguation.head(1)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity,full_mention_norm
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",,german


In [52]:
### Test single full mention query time
start_time = time.time()

# Print word
search_word = entity_disambiguation['full_mention_norm'][12]
print("Search Word: ", search_word)

# Get most similar word
count_similar = 100
similar = w2v.most_similar(w2v.get_word(str(search_word)), count_similar)

# Retrieve only entities from word
entities = []
return_similar = 10
for i in similar:
#     print(type(i[0]))
    if isinstance(i[0], Entity):
        entities.append(i)
    if len(entities) == return_similar:
        break
end_time = time.time()
print(f"Single Word Query Time: {round(end_time - start_time, 2)}s")
display(entities)

Search Word:  european union
Single Word Query Time: 11.48s


[]

In [86]:
# Prepare output array
most_similar_entities = []
most_similar_scores = []
get_similar_candidate_pool = []
get_similar_candidate_scores = []

# Track metrics
success_word_query = 0
start_time = time.time()

# Provide filter ability
size = 100

for mention in tqdm(entity_disambiguation['full_mention_norm'][:size]):
    
    # Retrieve w2v word from mention
    word = w2v.get_word(str(mention))
    
    # Save candidate pool
    candidate_pool = []
    candidate_scores = []
    
    if word is not None:
        success_word_query += 1
        
        # Search most similar words/entities from found word
        # Retrieve 50 most similar to ensure at least one entity returned
        similars = w2v.most_similar(word, 50)

        # Retrieve most similar entity
        most_similar = None
        for s in similars:
            if isinstance(s[0], Entity):
                candidate_pool.append(s[0].title)
                candidate_scores.append(s[1])
                if most_similar is None:
                    most_similar = s
                
    # Save lists
    get_similar_candidate_pool.append(candidate_pool)
    get_similar_candidate_scores.append(candidate_scores)
    
    if most_similar is not None:
        most_similar_entities.append(most_similar[0].title)
        most_similar_scores.append(most_similar[1])
    else:
        most_similar_entities.append(None)
        most_similar_scores.append(None)

    
print("Successfully Found Words: ", round(success_word_query/size*100,3),"%")
execution_time = time.time() - start_time
print("Execution time: ", round(execution_time, 3),"s")

100%|██████████| 100/100 [03:15<00:00,  1.96s/it]

Successfully Found Words:  65.0 %
Execution time:  195.931 s





In [87]:
# Append to dataframe
mini_df = entity_disambiguation[:size].copy()
mini_df['preds_w2v_mostsimilar'] = most_similar_entities
mini_df['score_w2v_mostsimilar'] = most_similar_scores
mini_df['candidate_pool_mostsimilar'] = get_similar_candidate_pool
mini_df['candidate_scores_mostsimilar'] = get_similar_candidate_scores
mini_df.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity,full_mention_norm,preds_w2v_mostsimilar,score_w2v_mostsimilar,candidate_pool_mostsimilar,candidate_scores_mostsimilar
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
1,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,0,0,"['EU', 'German', 'British']",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."
2,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,brussels,Brussels,0.88707,"[Brussels, Ghent, Timeline of Brussels, Brusse...","[0.8870699, 0.7689268, 0.76867557, 0.7681168, ..."


In [88]:
# Estimate length of time to run over full dataset
print("Estimated Duration for Full Dataset: ",\
     round((len(entity_disambiguation)/size)*execution_time/60/60,2), " hours")

Estimated Duration for Full Dataset:  12.11  hours


In [89]:
# Estimate length of time to run over just full_mentions without a guess from get_entity
print("Estimated Duration for Dataset without Prior Prediction: ",\
     round((len(entity_disambiguation[entity_disambiguation['preds_w2v_getentity'].isnull()])/size)*execution_time/60/60,2), " hours")

Estimated Duration for Dataset without Prior Prediction:  3.31  hours


## Calculate Accuracy of Most Similar Entity Predictions

In [90]:
# Define response variable
def replace_lines(text):
    return str(text).replace("_", " ")
response = [replace_lines(i.split("/")[-1]) if not isinstance(i, float) else None for i in entity_disambiguation['wikipedia_URL']]
response = response[:size] # todo delete for full run
response[:5]

['Germany',
 'United Kingdom',
 'Brussels',
 'European Commission',
 'European Commission']

In [91]:
# Calculate accuracy
accurate_predictions = (mini_df['preds_w2v_mostsimilar'] == response).sum()
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(mini_df) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 21.0%
****************************


## Save predictive dataframe for input to next step

In [92]:
#Final DF
mini_df.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_ID,sentence_id,doc_id,congruent_entities,preds_w2v_getentity,full_mention_norm,preds_w2v_mostsimilar,score_w2v_mostsimilar,candidate_pool_mostsimilar,candidate_scores_mostsimilar
0,B,German,http://en.wikipedia.org/wiki/Germany,11867,0,0,"['EU', 'German', 'British']",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
1,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,0,0,"['EU', 'German', 'British']",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."
2,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,brussels,Brussels,0.88707,"[Brussels, Ghent, Timeline of Brussels, Brusse...","[0.8870699, 0.7689268, 0.76867557, 0.7681168, ..."
3,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission,european commission,Brussels,0.88707,[],[]
4,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission,european commission,Brussels,0.88707,[],[]
5,B,German,http://en.wikipedia.org/wiki/Germany,11867,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,german,Culture of Germany,0.686803,"[Culture of Germany, 1860 in Germany, 1866 in ...","[0.68680257, 0.6840672, 0.6836185, 0.68068546,..."
6,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",,british,Russians in the United Kingdom,0.615564,"[Russians in the United Kingdom, Henry Wood (n...","[0.6155642, 0.60587233, 0.6025652, 0.5973172, ..."
7,B,Germany,http://en.wikipedia.org/wiki/Germany,11867,2,0,"['Germany', 'European Union', 'Werner Zwingman...",Germany,germany,Germany,0.737123,"[Germany, 2008-09 Biathlon World Cup – World C...","[0.7371232, 0.7318488, 0.7245289]"
8,B,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union,european union,Germany,0.737123,[],[]
9,I,European Union,http://en.wikipedia.org/wiki/European_Union,9317,2,0,"['Germany', 'European Union', 'Werner Zwingman...",European Union,european union,Germany,0.737123,[],[]


In [93]:
# Save dataframe
mini_df.to_csv(os.path.join(preds_path, "wikipedia2vec_most_similar.csv"), index=False)

In [95]:
mini_df['candidate_pool_mostsimilar'][51]

['Taipei',
 'Xinzhuang Baseball Stadium',
 'Department of Cultural Affairs, Taipei City Government',
 'Tianmu Baseball Stadium',
 'Taichung',
 'Taipei Gymnasium',
 'National Library of Public Information',
 'Hong-Gah Museum',
 'Taipei Dome',
 'Taipei City Arts Promotion Office',
 'Kaohsiung',
 'Tianmu, Shilin District',
 'Xinzhuang MRT station',
 'Kaohsiung Cultural Center',
 'Chung Cheng Martial Arts Stadium',
 'Sanduo Shopping District MRT station',
 'Zhongzheng District',
 'Xinzhuang District',
 'Jiantan MRT station',
 'Kuandu Museum of Fine Arts',
 'Taipei Expo Park',
 'Fisheries Agency',
 "Children's Art Museum in Taipei",
 'Taichung City Dadun Cultural Center',
 'Department of Rapid Transit Systems, Taipei City Government',
 'List of tourist attractions in Taipei',
 'Xinyi Special District',
 'Nangang District, Taipei',
 'Central District, Taichung']