# Part Two: Enter Full Mention into Wikipedia2vec `get_entity()` as Baseline

Our most direct step is to pass each `full_mention` straight into the `get_entity()` function of Wikipedia2vec's API. We do that as our first process step, under the assumption that any returned result has a near-100% chance of being the correct page. We test this hypothesis at the end of this notebook.

#### Import Packages

In [1]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

## Load Processed ACY Input

In [2]:
# Directory that holds the processed AIDA-CoNLL-YAGO input
acy_path = '../../data/aida-conll-yago-dataset/'

# Read the processed input table and preview its first rows
acy_input_file = os.path.join(acy_path, "Aida-Conll-Yago-Input.csv")
acy_input = pd.read_csv(acy_input_file, sep=",")
acy_input.head(n=3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions
0,B,EU,,,,0,0,"['EU', 'German', 'British']"
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']"
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']"


In [3]:
# Work on an independent copy for this predictive step. A plain assignment
# would only alias `acy_input`, so adding the prediction column further down
# would silently modify the loaded input table as well.
preds_w2v_getentities = acy_input.copy()

## Import Wikipedia2Vec Model

In [4]:
# Package
from wikipedia2vec import Wikipedia2Vec

In [5]:
%%time
# Load unzipped pkl file with word embeddings
w2v = Wikipedia2Vec.load("../../embeddings/enwiki_20180420_100d.pkl")

CPU times: user 94.3 ms, sys: 180 ms, total: 274 ms
Wall time: 326 ms


#### Query using `full_mention`

In [6]:
# Share of failed lookups kept for manual inspection, and the seed that makes
# that sample identical on every Restart & Run All
FAILURE_SAMPLE_RATE = 0.1
FAILURE_SAMPLE_SEED = 42
failure_rng = np.random.RandomState(FAILURE_SAMPLE_SEED)

# Track success rate for returned values
successes = 0
queries = 0
failed_searches = []
preds_w2v_getentity = []

# Run through each full_mention
for full_mention in tqdm(acy_input['full_mention']):

    # Query API; a None result means no entity was returned for this mention
    entity = w2v.get_entity(full_mention)

    # Increment count
    queries += 1
    if entity is None:
        # Save a random subset of failures to understand trends
        if failure_rng.uniform() <= FAILURE_SAMPLE_RATE:
            failed_searches.append(full_mention)
        # Keep one prediction per input row so the list lines up with the table
        preds_w2v_getentity.append(None)
    else:
        successes += 1
        # Save just title, not entity object
        preds_w2v_getentity.append(entity.title)

# Guard against an empty input table instead of dividing by zero
success_rate = successes / queries * 100 if queries else 0.0
print("Query Success Rate: ", round(success_rate, 3),"%")

100%|██████████| 29312/29312 [00:00<00:00, 59098.81it/s]

Query Success Rate:  60.852 %





In [7]:
# Display a random selection of failures.
# Draw without replacement so the same sampled failure is not shown twice, cap
# the size at what is available, and seed the draw so the output is reproducible.
n_failures_shown = min(100, len(failed_searches))
failure_sample = (
    np.random.RandomState(42).choice(failed_searches, n_failures_shown, replace=False)
    if n_failures_shown
    else np.array([], dtype=str)  # nothing failed (or nothing was sampled)
)
failure_sample

array(['Gencor', 'Francisco Cea', 'Aoki', '.Robbie McEwen',
       'Marion County Board of Education', 'Huracan-Corrientes',
       'Sergey Makarov', 'WESTERN DIVISION', 'PARIS', 'Spanish',
       'Michael Andersson', 'Chris Adams', 'Pietro T.',
       "Saturday'sWorld Cup", 'Alberto Costa', 'French', 'Grand Slam',
       'R. Fay', 'Zarak Jahan Khan', 'Footscray', 'ASIAN CUP',
       'Pauline Konga', 'CAMPESE', 'Ronaldo Gonzalez', 'St Helens',
       'New York', 'Orii Corp', 'Indonesian', 'Chelsea', 'Edna Fernandes',
       'KANSAS CITY', 'ENGLISH', 'Slight', 'Tom Watson', 'Li Feng',
       'WESTERN DIVISION', 'Simona de Logu', 'Grand Slam', 'R. Irani',
       'Japanese', 'UEFA Fair Play', 'DALGLISH', 'McGregor',
       'KANSAS CITY', 'New York', 'First Battle of the Newbury bypass',
       'Brian Homewood', 'Hatoyama', 'PARIS',
       'Security Dynamics Technologies Inc.', 'Duran', 'John Gorst',
       'Eric Bergoust', 'EUROPEAN CUP', 'State ( central ) Bank',
       'Terry Price', 'H

In [8]:
# Attach the get_entity() predictions as a new column, then preview the table
prediction_column = 'preds_w2v_getentity'
preds_w2v_getentities[prediction_column] = preds_w2v_getentity
preds_w2v_getentities.head(n=10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
5,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708.0,Brussels,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
6,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
7,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",European Commission
8,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",
9,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",


### Assess Accuracy of Predictions

In [9]:
# Calculate accuracy
def _accuracy_pct(frame, pred_col='preds_w2v_getentity', true_col='wikipedia_title'):
    """Count exact title matches in `frame` and express them as a percentage.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to score; must contain `pred_col` and `true_col`.
    pred_col, true_col : str
        Columns holding the predicted title and the true title.

    Returns
    -------
    tuple
        (number of rows where the prediction equals the true title, that count
        as a percentage of len(frame) rounded to 3 decimals). An empty frame
        gives 0.0 instead of dividing by zero.
    """
    n_correct = (frame[pred_col] == frame[true_col]).sum()
    pct = round(n_correct / len(frame) * 100, 3) if len(frame) else 0.0
    return n_correct, pct


# Row filters shared by the reports below
has_truth = preds_w2v_getentities['wikipedia_page_ID'].notnull()
has_pred = preds_w2v_getentities['preds_w2v_getentity'].notnull()

print("****************************")
overall_accuracy, overall_pct = _accuracy_pct(preds_w2v_getentities)
print(f"All-Inclusive Predictive Accuracy: {overall_pct}%")
print("Note: All Full Mentions, whether they have a known true value or not.")

print("****************************")
filtered_df = preds_w2v_getentities[has_truth]
mention_accuracy, mention_pct = _accuracy_pct(filtered_df)
print(f"Filter Empty Mentions Mention Predictive Accuracy: {mention_pct}%")
print("Note: Only Full Mentions with a known true value")


print("****************************")
filtered_df = preds_w2v_getentities[has_truth & has_pred]
result_accuracy, result_pct = _accuracy_pct(filtered_df)
print(f"Filter Empty Results Predictive Accuracy: {result_pct}%")
print("Note: Only full mentions with a known true value that returned a result from Wikipedia2vec")

****************************
All-Inclusive Predictive Accuracy: 42.218%
Note: All Full Mentions, whether they have a known true value or not.
****************************
Filter Empty Mentions Mention Predictive Accuracy: 55.6%
Note: Only Full Mentions with a known true value
****************************
Filter Empty Results Predictive Accuracy: 76.545%
Note: Only full mentions with a known true value that returned a result from Wikipedia2vec


In [10]:
# True Wikipedia titles for the rows that have one
has_known_title = preds_w2v_getentities['wikipedia_title'].notnull()
preds_w2v_getentities.loc[has_known_title, 'wikipedia_title']

1                               Germany
2                        United Kingdom
5                              Brussels
6                   European Commission
7                   European Commission
                      ...              
29307    England national football team
29308               1966 FIFA World Cup
29309               1966 FIFA World Cup
29310               1966 FIFA World Cup
29311                    Bobby Charlton
Name: wikipedia_title, Length: 22257, dtype: object

In [12]:
# Confirm effectiveness of Wikipedia2vec's get_entity() on known true answers (non-null)
# Basically, if we plug in the true Wikipedia title itself, will it return that same page
wikipedia_title = preds_w2v_getentities.loc[
    preds_w2v_getentities['wikipedia_title'].notnull(), 'wikipedia_title'
]

none_count = 0
answers = []
for r in tqdm(wikipedia_title):
    answer = w2v.get_entity(r)
    if answer is None:
        # No entity returned for this title
        none_count += 1
        answers.append(None)
    else:
        answers.append(answer.title)

# `answers` holds exactly one item per queried title, so compare them
# position by position. Filtering either side here would shift the pairs.
accurate_preds = sum(a == t for a, t in zip(answers, wikipedia_title))

# Guard against an empty set of known titles instead of dividing by zero
n_titles = len(wikipedia_title)
failed_pct = round(none_count / n_titles * 100, 3) if n_titles else 0.0
accuracy_pct = round(accurate_preds / n_titles * 100, 3) if n_titles else 0.0

print("****************************")
print("Wikipedia2vec failed to return {} searches, {}%".format(none_count, failed_pct))
print("****************************")
print(f"All-Inclusive Predictive Accuracy: {accuracy_pct}%")
print("****************************")
print("****************************")

100%|██████████| 22257/22257 [00:00<00:00, 127382.29it/s]

****************************
Wikipedia2vec failed to return 594 searches, 2.669%
****************************
All-Inclusive Predictive Accuracy: 91.374%
****************************





## Save predictive dataframe for input to next step

In [13]:
# Preview the first rows of the table that gets saved for the next step
preds_w2v_getentities.head(n=3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,preds_w2v_getentity
0,B,EU,,,,0,0,"['EU', 'German', 'British']",European Union
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",


In [14]:
# Save dataframe
preds_path = '../../predictions/'
# Create the output directory if it is missing (e.g. on a fresh checkout) so
# that to_csv does not fail on a non-existent path
os.makedirs(preds_path, exist_ok=True)
preds_w2v_getentities.to_csv(os.path.join(preds_path, "wikipedia2vec_getentities.csv"), index=False)