# Generate Candidate Pool via Anchor Links

This notebook uses anchor links on Wikipedia, or hyperlinks from a string to a Wikipedia page, to propose a candidate pool of possible entities/pages for each full mention. We propose two methods of using anchor links: one sorts by most popular or viewed pages and the other by the most linked or central pages. We output both dataframes for evaluation.

#### Import Packages

In [1]:
import os
import time
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

### Load Processed ACY Input

In [2]:
# Base path to input
acy_path = '../../data/aida-conll-yago-dataset/'

# Load data
acy_input = pd.read_csv(os.path.join(acy_path, "Aida-Conll-Yago-Input.csv"), delimiter=",")
acy_input.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions
0,B,EU,,,,0,0,"['EU', 'German', 'British']"
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']"
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']"
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
5,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708.0,Brussels,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
6,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
7,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
8,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."
9,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm..."


### Load Kensho Target Dataset

This dataset provides anchor linkage statistics for Wikipedia pages and is provided by Kensho Technologies.

In [3]:
# Base path to KWNLP
kwnlp_path = '../../data/kwnlp'

In [4]:
# Load article data
article_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-article.csv'))
article_df.head(3)

Unnamed: 0,page_id,item_id,page_title,views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,12,6199,Anarchism,35558,40449,409,3826,371,1,0,0,0,0,0,0
1,25,38404,Autism,40081,47659,419,2313,309,0,1,0,0,0,0,0
2,39,101038,Albedo,10770,18766,293,3090,115,0,0,0,0,0,0,0


In [5]:
# Load anchor target counts data
anchor_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-anchor-target-counts.csv'))
anchor_df.head(3)

Unnamed: 0,anchor_text,target_page_id,count
0,United States,3434750,152451
1,World War II,32927,133668
2,India,14533,112069


To access Wikipedia page from ID: `https://en.wikipedia.org/wiki?curid={page_id}`

### Process Target Data

We apply normalization to the anchor text to make for simpler matching.

In [26]:
# Copy to new dataframe for processing
anchor_texts = anchor_df.copy()

In [27]:
# Define text normalization function
def normalize_text(text):
    """
    We define normalized as:
    - lowercase
    - strip whitespace
    - Spaces, not underlines
    - Remove punctuation (todo decide&implement)
    """
    return str(text).strip().lower().replace("_", " ")

In [28]:
# Apply normalization to anchor text
anchor_texts['norm_anchor_text'] = anchor_texts['anchor_text'].apply(normalize_text)
anchor_texts.head(3)

Unnamed: 0,anchor_text,target_page_id,count,norm_anchor_text
0,United States,3434750,152451,united states
1,World War II,32927,133668,world war ii
2,India,14533,112069,india


In [29]:
# Assess presence of Null values in anchor_text
print(f"There are {anchor_texts['anchor_text'].isnull().sum():,} 'None' values in anchor_text.")

There are 3,581 'None' values in anchor_text.


In [30]:
# Filter out None values
print("Before: {}".format(len(anchor_texts)))
anchor_texts = anchor_texts[anchor_texts['anchor_text'].notnull()]
print("After: {}".format(len(anchor_texts)))

Before: 15269229
After: 15265648


#### Join Page Data to Anchor Text Data

This provides us with information on page views and links.

In [32]:
%%time
# Merge at_count and article stats dataframes
anchor_texts = pd.merge(
    anchor_texts,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")

# Rename columns for clarity
anchor_texts = anchor_texts.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})

# Specify column ordering
anchor_texts = anchor_texts[[
    "norm_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]

# Display preview
anchor_texts.head(3)

CPU times: user 34.2 s, sys: 15.4 s, total: 49.6 s
Wall time: 51.4 s


Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559


# Develop Anchor Link Candidate Generation Models

## Anchor Link Frequency

This model generates a candidate pool of Wikipedia pages for each full mention by looking at the pages that string links to the most number of times.

In [34]:
%%time
# Sort dataframe by anchor text and then most frequently linked page
anchor_texts = anchor_texts.sort_values(['norm_anchor_text', 'anchor_target_count'], ascending=False)

CPU times: user 1min 24s, sys: 3.72 s, total: 1min 28s
Wall time: 1min 28s


In [35]:
%%time
# Return just the top N most linked entities to create our candidate pool for each anchor link
top_N = 10
anchor_text_link_frequency = anchor_texts.groupby('norm_anchor_text').head(top_N).reset_index(drop=True)

CPU times: user 59.8 s, sys: 2.25 s, total: 1min 2s
Wall time: 1min 2s


In [36]:
# Manually test United States to assess resulting dataframe
anchor_text_link_frequency[anchor_text_link_frequency['norm_anchor_text'] == 'united states']

Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
950752,united states,3434750,30,United_States,460156,152451
950753,united states,582488,164134,United_States_men's_national_soccer_team,25804,1466
950754,united states,647757,334526,United_States_women's_national_soccer_team,12292,594
950755,united states,1145226,1143805,United_States_national_rugby_union_team,1165,352
950756,united states,945923,913651,United_States_men's_national_ice_hockey_team,2537,257
950757,united states,924170,279283,Elections_in_the_United_States,14936,243
950758,united states,378405,3054793,Secondary_education_in_the_United_States,6907,225
950759,united states,980450,2738955,United_States_national_cricket_team,2043,223
950760,united states,6311052,1389353,United_States_Davis_Cup_team,335,218
950761,united states,89611,244847,United_States_men's_national_basketball_team,10624,177


In [37]:
# Assess remaining rows
print("Unique anchor links numbered {:,}".format(len(anchor_texts)))
print("Remaining dataframe contains {:,} rows".format(len(anchor_text_link_frequency)))

Unique anchor links numbered 15,265,648
Remaining dataframe contains 14,009,323 rows


In [83]:
# Testing
test_usa = anchor_text_link_frequency[anchor_text_link_frequency['norm_anchor_text'] == 'united states']
test_usa['target_page_views_pct'] = test_usa.groupby('norm_anchor_text')['target_page_views'].apply(lambda x: x/x.sum()).copy()
test_usa['anchor_target_count_pct'] = test_usa.groupby('norm_anchor_text')['anchor_target_count'].apply(lambda x: x/x.sum())
test_usa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_usa['target_page_views_pct'] = test_usa.groupby('norm_anchor_text')['target_page_views'].apply(lambda x: x/x.sum()).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_usa['anchor_target_count_pct'] = test_usa.groupby('norm_anchor_text')['anchor_target_count'].apply(lambda x: x/x.sum())


Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count,target_page_views_pct,anchor_target_count_pct
950752,united states,3434750,30,United_States,460156,152451,0.857222,0.975961
950753,united states,582488,164134,United_States_men's_national_soccer_team,25804,1466,0.04807,0.009385
950754,united states,647757,334526,United_States_women's_national_soccer_team,12292,594,0.022899,0.003803
950755,united states,1145226,1143805,United_States_national_rugby_union_team,1165,352,0.00217,0.002253
950756,united states,945923,913651,United_States_men's_national_ice_hockey_team,2537,257,0.004726,0.001645
950757,united states,924170,279283,Elections_in_the_United_States,14936,243,0.027824,0.001556
950758,united states,378405,3054793,Secondary_education_in_the_United_States,6907,225,0.012867,0.00144
950759,united states,980450,2738955,United_States_national_cricket_team,2043,223,0.003806,0.001428
950760,united states,6311052,1389353,United_States_Davis_Cup_team,335,218,0.000624,0.001396
950761,united states,89611,244847,United_States_men's_national_basketball_team,10624,177,0.019791,0.001133


In [84]:
%%time
# Calculating probabilities from raw counts
anchor_text_link_frequency['target_page_views_pct'] = anchor_text_link_frequency.groupby('norm_anchor_text')['target_page_views'].apply(lambda x: x/x.sum())
anchor_text_link_frequency['anchor_target_count_pct'] = anchor_text_link_frequency.groupby('norm_anchor_text')['anchor_target_count'].apply(lambda x: x/x.sum())

CPU times: user 1h 41min 16s, sys: 11min 41s, total: 1h 52min 57s
Wall time: 1h 55min 47s


In [95]:
# Manual inspection
anchor_text_link_frequency[anchor_text_link_frequency['norm_anchor_text'] == 'british']

Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count,target_page_views_pct,anchor_target_count_pct
11808745,british,31717,145,United_Kingdom,244414,25655,0.523446,0.657804
11808746,british,19097669,842438,British_people,7312,4821,0.01566,0.123612
11808747,british,13530298,23666,Great_Britain,38869,2867,0.083243,0.073511
11808748,british,4721,8680,British_Empire,68297,1541,0.146267,0.039512
11808749,british,158019,161885,Kingdom_of_Great_Britain,20902,1167,0.044764,0.029922
11808750,british,152256,174193,United_Kingdom_of_Great_Britain_and_Ireland,16838,1066,0.036061,0.027333
11808751,british,182410,3546736,Television_in_the_United_Kingdom,3376,910,0.00723,0.023333
11808752,british,4208015,129286,British_Raj,41304,377,0.088458,0.009666
11808753,british,4887,222595,British_Army,25213,299,0.053997,0.007666
11808754,british,1474098,4969584,British_Boxing_Board_of_Control,408,298,0.000874,0.007641


In [86]:
# # save dataframe in a pickle file
# anchor_text_link_frequency.to_pickle("../../predictions/anchor_text_link_frequency.pkl")

# # load saved file
# anchor_text_link_frequency = pd.read_pickle("../../predictions/anchor_text_link_frequency.pkl")

We did not reduce the dataframe by much, suggesting only a few anchor texts have more than our selected N number of distinct links. To append to our ACY Input data, we produce a dictionary of anchor text to its candidate pool.

In [19]:
# # In case of prior road, load saved json file before re-running the whole thing
# # Load dictionary
# with open('../../predictions/dict_anchor_pool_frequency.json', 'r') as filepath:
#     dict_anchor_pool_frequency = json.load(filepath)

In [90]:
%%time
# Group by anchor text to produce list of item IDs, page IDs and page titles (our candidate pools)
anchor_text_candidate_pools = anchor_text_link_frequency.groupby('norm_anchor_text')\
                                    [['target_page_id', 'target_page_title', 'target_item_id', 
                                      'target_page_views_pct', 'anchor_target_count_pct']]\
                                    .agg(lambda x: list(x)).reset_index()

CPU times: user 27min 8s, sys: 23.3 s, total: 27min 31s
Wall time: 27min 44s


In [96]:
# Manual inspection
anchor_text_candidate_pools[anchor_text_candidate_pools['norm_anchor_text'] == 'british']

Unnamed: 0,norm_anchor_text,target_page_id,target_page_title,target_item_id,target_page_views_pct,anchor_target_count_pct
1804575,british,"[31717, 19097669, 13530298, 4721, 158019, 1522...","[United_Kingdom, British_people, Great_Britain...","[145, 842438, 23666, 8680, 161885, 174193, 354...","[0.5234455478623271, 0.015659634251594983, 0.0...","[0.6578036460603575, 0.12361221507140843, 0.07..."


In [91]:
# # save dataframe in a pickle file
# anchor_text_candidate_pools.to_pickle("../../predictions/anchor_text_candidate_pools.pkl")

# # load saved file
# anchor_text_candidate_pools = pd.read_pickle("../../predictions/anchor_text_candidate_pools.pkl")

In [101]:
%%time
# Add to dictionary for faster searching later in the pipeline

# Create dictionary
dict_anchor_pool_frequency = {}

# Add lists to dictionary with anchor text as search term
# This should match the full mention search when measuring accuracy later
for i in tqdm(range(len(anchor_text_candidate_pools))):
    row = anchor_text_candidate_pools.loc[i]
    dict_anchor_pool_frequency[row['norm_anchor_text']] = [row['target_page_id'], row['target_page_title'], 
                                                           row['target_item_id'], row['target_page_views_pct'], 
                                                           row['anchor_target_count_pct']]

100%|██████████| 11327029/11327029 [23:04<00:00, 8178.49it/s]

CPU times: user 22min 20s, sys: 34.5 s, total: 22min 54s
Wall time: 23min 4s





#### Demonstrate search performance boost of dictionary

In [102]:
%%time
# Demonstrate search benefit of storing as dictionary
o = dict_anchor_pool_frequency['united states']

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.82 µs


In [103]:
%%time
# Compare Pandas dataframe search to dictionary search
o = anchor_text_candidate_pools[anchor_text_candidate_pools['norm_anchor_text'] == 'united states']

CPU times: user 2.14 s, sys: 81.1 ms, total: 2.22 s
Wall time: 2.28 s


In [104]:
# Save dictionary
with open('../../predictions/dict_anchor_pool_frequency.json', 'w') as filepath:
    json.dump(dict_anchor_pool_frequency, filepath)

## Anchor Link Popularity

This model generates a candidate pool of Wikipedia pages for each full mention by looking at the popularity of pages that string has linked to and sorting by the pages with the most views.

In [105]:
%%time
# Sort dataframe by anchor text and then most frequently linked page
anchor_texts = anchor_texts.sort_values(['norm_anchor_text', 'target_page_views'], ascending=False)

CPU times: user 1min 10s, sys: 5.97 s, total: 1min 16s
Wall time: 1min 18s


In [106]:
%%time
# Return just the top N most viewed entities to create our candidate pool for each anchor link
top_N = 10
anchor_text_link_popularity = anchor_texts.groupby('norm_anchor_text').head(top_N).reset_index(drop=True)

CPU times: user 56.1 s, sys: 2.24 s, total: 58.3 s
Wall time: 58.4 s


In [107]:
# Manually test United States to assess resulting dataframe
anchor_text_link_popularity[anchor_text_link_popularity['norm_anchor_text'] == 'united states']

Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
950752,united states,3434750,30,United_States,460156,152451
950753,united states,3434750,30,United_States,460156,5
950754,united states,63136490,83873577,COVID-19_pandemic_in_the_United_States,428030,31
950755,united states,58993617,41174436,2020_Formula_One_World_Championship,343066,1
950756,united states,44751865,19600530,Black_Lives_Matter,250974,1
950757,united states,12610470,1682357,List_of_states_and_territories_of_the_United_S...,185044,1
950758,united states,54803678,37093861,Antifa_(United_States),183516,1
950759,united states,1649321,131079,List_of_United_States_cities_by_population,163698,1
950760,united states,18618239,35657,U.S._state,155646,3
950761,united states,3356,1124,Bill_Clinton,155162,1


In [108]:
# Assess remaining rows
print("Unique anchor links numbered {:,}".format(len(anchor_texts)))
print("Remaining dataframe contains {:,} rows".format(len(anchor_text_link_popularity)))

Unique anchor links numbered 15,265,648
Remaining dataframe contains 14,009,323 rows


In [109]:
%%time
# Calculating probabilities from raw counts
anchor_text_link_popularity['target_page_views_pct'] = anchor_text_link_popularity.groupby('norm_anchor_text')['target_page_views'].apply(lambda x: x/x.sum())
anchor_text_link_popularity['anchor_target_count_pct'] = anchor_text_link_popularity.groupby('norm_anchor_text')['anchor_target_count'].apply(lambda x: x/x.sum())

CPU times: user 1h 41min 27s, sys: 21min 8s, total: 2h 2min 35s
Wall time: 2h 9min 49s


In [113]:
# # save dataframe in a pickle file
# anchor_text_link_popularity.to_pickle("../../predictions/anchor_text_link_popularity.pkl")

# # load saved file
# anchor_text_link_popularity = pd.read_pickle("../../predictions/anchor_text_link_popularity.pkl")

In [2]:
# # In case of prior road, load saved json file before re-running the whole thing
# # Load dictionary
# with open('../../predictions/dict_anchor_pool_popularity.json', 'r') as filepath:
#     dict_anchor_pool_popularity = json.load(filepath)

In [110]:
%%time
# Group by anchor text to produce list of item IDs, page IDs and page titles (our candidate pools)
anchor_text_candidate_pools = anchor_text_link_popularity.groupby('norm_anchor_text')\
                                    [['target_page_id', 'target_page_title', 'target_item_id',
                                      'target_page_views_pct', 'anchor_target_count_pct']]\
                                    .agg(lambda x: list(x)).reset_index()

CPU times: user 27min 20s, sys: 1min 52s, total: 29min 12s
Wall time: 30min 4s


In [111]:
%%time
## Add to dictionary for faster searching later in the pipeline

# Create dictionary
dict_anchor_pool_popularity = {}

# Add lists to dictionary with anchor text as search term
# This should match the full mention search when measuring accuracy later
for i in tqdm(range(len(anchor_text_candidate_pools))):
    row = anchor_text_candidate_pools.loc[i]
    dict_anchor_pool_popularity[row['norm_anchor_text']] = [row['target_page_id'], row['target_page_title'], 
                                                            row['target_item_id'], row['target_page_views_pct'], 
                                                            row['anchor_target_count_pct']]

100%|██████████| 11327029/11327029 [28:59<00:00, 6512.09it/s]


CPU times: user 27min 49s, sys: 47.1 s, total: 28min 36s
Wall time: 28min 59s


In [112]:
# Save dictionary
with open('../../predictions/dict_anchor_pool_popularity.json', 'w') as filepath:
    json.dump(dict_anchor_pool_popularity, filepath)

# Assess Accuracy of Anchor Link Models without Congruence

For each full mention in our ACY input dataset, we now append the generated candidate pool as a column and save our predictions.

In [117]:
# Normalize full mentions for direct comparison with normalized anchor texts
acy_input['norm_full_mention'] = acy_input['full_mention'].apply(normalize_text)
acy_input.head()

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",peter blackburn
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",peter blackburn


## Anchor Link Frequency

In [130]:
# Copy input dataframe
preds_anchor_frequency = acy_input.copy()

In [131]:
# For each full mention, retrieve the candidate pool generated by the model
mention_candidate_pools_page_ids = []
mention_candidate_pools_titles = []
mention_candidate_pools_item_ids = []
mention_candidate_pools_page_views_pct = []
mention_candidate_pools_target_count_pct = []

# Track metrics
oov_error = 0

for i in tqdm(range(len(acy_input))):
    
    # Retrieve normalized full mention
    full_mention = acy_input['norm_full_mention'][i]
    
    # Retrieve candidate pools for full mention
    try:
        dicts = dict_anchor_pool_frequency[full_mention]
    except KeyError:
        oov_error += 1
        dicts = (None, None, None, None, None)
        
    candidate_pool_page_ids = dicts[0]
    candidate_pool_titles = dicts[1]
    candidate_pool_item_ids = dicts[2]
    candidate_pools_page_views_pct = dicts[3]
    candidate_pools_target_count_pct = dicts[4]
    
    # Save candidate pools
    mention_candidate_pools_page_ids.append(candidate_pool_page_ids)
    mention_candidate_pools_titles.append(candidate_pool_titles)
    mention_candidate_pools_item_ids.append(candidate_pool_item_ids)
    mention_candidate_pools_page_views_pct.append(candidate_pools_page_views_pct)
    mention_candidate_pools_target_count_pct.append(candidate_pools_target_count_pct)
    
preds_anchor_frequency['mention_candidate_pools_page_ids'] = mention_candidate_pools_page_ids
preds_anchor_frequency['mention_candidate_pools_titles'] = mention_candidate_pools_titles
preds_anchor_frequency['mention_candidate_pools_item_ids'] = mention_candidate_pools_item_ids
preds_anchor_frequency['mention_candidate_pools_page_views_pct'] = mention_candidate_pools_page_views_pct
preds_anchor_frequency['mention_candidate_pools_target_count_pct'] = mention_candidate_pools_target_count_pct

100%|██████████| 29312/29312 [00:00<00:00, 104006.29it/s]


In [132]:
print(f"We received {oov_error:,} Out-of-Vocabulary Errors.")

We received 4,625 Out-of-Vocabulary Errors.


In [133]:
# Preview dataframe
preds_anchor_frequency.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention,mention_candidate_pools_page_ids,mention_candidate_pools_titles,mention_candidate_pools_item_ids,mention_candidate_pools_page_views_pct,mention_candidate_pools_target_count_pct
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu,"[9317, 9239, 21347120, 9477, 1882861, 3261189,...","[European_Union, Europe, Eu,_Seine-Maritime, E...","[458, 46, 211593, 1396, 363404, 3327447, 40537...","[0.4089645768445524, 0.3670737522912537, 0.001...","[0.940090771558245, 0.025113464447806353, 0.02..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german,"[11867, 11884, 152735, 21212, 12674, 290327, 1...","[Germany, German_language, Germans, Nazi_Germa...","[183, 188, 42884, 7318, 43287, 141817, 181287,...","[0.40794106453816165, 0.10446023292915596, 0.0...","[0.4395598840735998, 0.3033295140527061, 0.154..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british,"[31717, 19097669, 13530298, 4721, 158019, 1522...","[United_Kingdom, British_people, Great_Britain...","[145, 842438, 23666, 8680, 161885, 174193, 354...","[0.5234455478623271, 0.015659634251594983, 0.0...","[0.6578036460603575, 0.12361221507140843, 0.07..."


In [134]:
# Calculate accuracy
accurate_predictions = 0
for i in range(len(preds_anchor_frequency)):
    try:
        if preds_anchor_frequency['wikipedia_page_ID'][i] == preds_anchor_frequency['mention_candidate_pools_page_ids'][i][0]:
            accurate_predictions += 1
    except TypeError:
        pass
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(preds_anchor_frequency) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 54.609%
****************************


In [135]:
# Calculate percentage of candidate pools with the correct answer present
# Necessary to determine if shuffling pool could even get the right answer
response_present = 0
for i in range(len(preds_anchor_frequency)):
    try:
        if preds_anchor_frequency['wikipedia_page_ID'][i] in preds_anchor_frequency['mention_candidate_pools_page_ids'][i]:
            response_present += 1
    except TypeError:
        pass
print(f"Correct answer is present in {round(response_present / len(preds_anchor_frequency) * 100, 3)}% of generated candidate pools via Anchor Links Frequency method.")

Correct answer is present in 68.661% of generated candidate pools via Anchor Links Frequency method.


In [136]:
# Base path to input
preds_path = '../../predictions/'

# Save candidate pools dataframe
preds_anchor_frequency.to_csv(os.path.join(preds_path, "anchortext_frequency.csv"), index=False)

## Anchor Link Popularity

In [137]:
# Copy input dataframe
preds_anchor_popularity = acy_input.copy()

In [138]:
preds_anchor_popularity.head(10)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british
3,B,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",peter blackburn
4,I,Peter Blackburn,,,,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",peter blackburn
5,B,BRUSSELS,http://en.wikipedia.org/wiki/Brussels,3708.0,Brussels,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",brussels
6,B,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",european commission
7,I,European Commission,http://en.wikipedia.org/wiki/European_Commission,9974.0,European Commission,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",european commission
8,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",german
9,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,1,0,"['Peter Blackburn', 'BRUSSELS', 'European Comm...",british


In [139]:
# For each full mention, retrieve the candidate pool generated by the model
mention_candidate_pools_page_ids = []
mention_candidate_pools_titles = []
mention_candidate_pools_item_ids = []
mention_candidate_pools_page_views_pct = []
mention_candidate_pools_target_count_pct = []

# Track metrics
oov_error = 0

for i in tqdm(range(len(acy_input))):
    
    # Retrieve normalized full mention
    full_mention = acy_input['norm_full_mention'][i]
    
    # Retrieve candidate pools for full mention
    try:
        dicts = dict_anchor_pool_popularity[full_mention]
    except KeyError:
        oov_error += 1
        dicts = (None, None, None, None, None)
        
    candidate_pool_page_ids = dicts[0]
    candidate_pool_titles = dicts[1]
    candidate_pool_item_ids = dicts[2]
    candidate_pools_page_views_pct = dicts[3]
    candidate_pools_target_count_pct = dicts[4]
    
    # Save candidate pools
    mention_candidate_pools_page_ids.append(candidate_pool_page_ids)
    mention_candidate_pools_titles.append(candidate_pool_titles)
    mention_candidate_pools_item_ids.append(candidate_pool_item_ids)
    mention_candidate_pools_page_views_pct.append(candidate_pools_page_views_pct)
    mention_candidate_pools_target_count_pct.append(candidate_pools_target_count_pct)
    
preds_anchor_popularity['mention_candidate_pools_page_ids'] = mention_candidate_pools_page_ids
preds_anchor_popularity['mention_candidate_pools_titles'] = mention_candidate_pools_titles
preds_anchor_popularity['mention_candidate_pools_item_ids'] = mention_candidate_pools_item_ids
preds_anchor_popularity['mention_candidate_pools_page_views_pct'] = mention_candidate_pools_page_views_pct
preds_anchor_popularity['mention_candidate_pools_target_count_pct'] = mention_candidate_pools_target_count_pct

100%|██████████| 29312/29312 [00:05<00:00, 5647.90it/s] 


In [140]:
print(f"We received {oov_error:,} Out-of-Vocabulary Errors.")

We received 4,625 Out-of-Vocabulary Errors.


In [141]:
# Preview dataframe
preds_anchor_popularity.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention,mention_candidate_pools_page_ids,mention_candidate_pools_titles,mention_candidate_pools_item_ids,mention_candidate_pools_page_views_pct,mention_candidate_pools_target_count_pct
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu,"[9317, 9239, 9891, 9472, 10890716, 2780146, 18...","[European_Union, Europe, Entropy, Euro, Member...","[458, 46, 45003, 4916, 185441, 932442, 8268, 8...","[0.2821412415438077, 0.25324111200213845, 0.12...","[0.9694227769110765, 0.025897035881435257, 0.0..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german,"[11867, 11867, 27318, 21148, 21212, 21212, 269...","[Germany, Germany, Singapore, Netherlands, Naz...","[183, 183, 334, 55, 7318, 7318, 40, 12548, 825...","[0.15982921005701556, 0.15982921005701556, 0.1...","[0.9138583339171863, 0.00014012471099278357, 3..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british,"[3434750, 31717, 31717, 19344654, 26061, 85699...","[United_States, United_Kingdom, United_Kingdom...","[30, 145, 145, 9531, 172771, 1860, 21, 22, 868...","[0.2915145444950621, 0.1548393064052541, 0.154...","[0.00010882576994232234, 0.9306417092900933, 7..."


In [142]:
# Calculate accuracy
accurate_predictions = 0
for i in range(len(preds_anchor_popularity)):
    try:
        if preds_anchor_popularity['wikipedia_page_ID'][i] == preds_anchor_popularity['mention_candidate_pools_page_ids'][i][0]:
            accurate_predictions += 1
    except TypeError:
        pass
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(preds_anchor_popularity) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 46.571%
****************************


In [143]:
# Calculate percentage of candidate pools with the correct answer present
# Necessary to determine if shuffling pool could even get the right answer
response_present = 0
for i in range(len(preds_anchor_popularity)):
    try:
        if preds_anchor_popularity['wikipedia_page_ID'][i] in preds_anchor_popularity['mention_candidate_pools_page_ids'][i]:
            response_present += 1
    except TypeError:
        pass
print(f"Correct answer is present in {round(response_present / len(preds_anchor_popularity) * 100, 3)}% of generated candidate pools via Anchor Links popularity method.")

Correct answer is present in 67.535% of generated candidate pools via Anchor Links popularity method.


In [144]:
# Save candidate pools dataframe
preds_anchor_popularity.to_csv(os.path.join(preds_path, "anchortext_popularity.csv"), index=False)