# Generate Candidate Pool via Anchor Links

This notebook uses Wikipedia anchor links (hyperlinks from a string of text to a Wikipedia page) to propose a candidate pool of possible entities/pages for each full mention. We propose two methods of using anchor links: one ranks candidates by popularity (the most viewed pages) and the other by link frequency (the pages the string links to most often). We output both dataframes for evaluation.

#### Import Packages

In [1]:
import os
import time
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Progress bar
from tqdm import tqdm

### Load Processed ACY Input

In [2]:
# Directory holding the processed AIDA-CoNLL-YAGO input
acy_path = '../../data/aida-conll-yago-dataset/'

# Read the processed input CSV and preview its first rows
acy_input_file = os.path.join(acy_path, "Aida-Conll-Yago-Input.csv")
acy_input = pd.read_csv(acy_input_file, sep=",")
acy_input.head(3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions
0,B,EU,,,,0,0,"['EU', 'German', 'British']"
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']"
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']"


### Load Kensho Target Dataset

This dataset provides anchor linkage statistics for Wikipedia pages and is provided by Kensho Technologies.

In [3]:
# Base path to the KWNLP data files; joined with each CSV filename when loading below
kwnlp_path = '../../data/kwnlp'

In [4]:
# Read per-article statistics (views, link counts, ...) and preview the first rows
article_csv = os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-article.csv')
article_df = pd.read_csv(article_csv)
article_df.head(3)

Unnamed: 0,page_id,item_id,page_title,views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,12,6199,Anarchism,35558,40449,409,3826,371,1,0,0,0,0,0,0
1,25,38404,Autism,40081,47659,419,2313,309,0,1,0,0,0,0,0
2,39,101038,Albedo,10770,18766,293,3090,115,0,0,0,0,0,0,0


In [5]:
# Read the (anchor text, target page, count) table and preview the first rows
# NOTE(review): read_csv's default NA parsing may turn literal anchor strings
# such as "NA" or "null" into NaN -- confirm whether keep_default_na=False is wanted.
anchor_csv = os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-anchor-target-counts.csv')
anchor_df = pd.read_csv(anchor_csv)
anchor_df.head(3)

Unnamed: 0,anchor_text,target_page_id,count
0,United States,3434750,152451
1,World War II,32927,133668
2,India,14533,112069


### Process Target Data

We apply normalization to the anchor text to make for simpler matching.

In [6]:
# Work on an independent copy so the loaded anchor_df stays untouched
anchor_texts = anchor_df.copy(deep=True)

In [7]:
# Define text normalization function
def normalize_text(text):
    """
    Return the normalized form of ``text`` used for matching.

    The input is converted with ``str()`` and then, in order:
    - leading/trailing whitespace is stripped
    - the result is lowercased
    - underscores are replaced with spaces

    Punctuation removal is not implemented (todo decide&implement).
    """
    as_string = str(text)
    trimmed = as_string.strip()
    lowered = trimmed.lower()
    return lowered.replace("_", " ")

In [8]:
# Add a normalized version of every anchor text for simpler matching
raw_anchor_text = anchor_texts['anchor_text']
anchor_texts['norm_anchor_text'] = raw_anchor_text.map(normalize_text)

In [9]:
# Count null anchor_text entries before filtering them out
null_anchor_count = anchor_texts['anchor_text'].isna().sum()
print(f"There are {null_anchor_count:,} 'None' values in anchor_text.")

There are 3,581 'None' values in anchor_text.


In [10]:
# Drop rows whose anchor_text is null, reporting the row count before and after
rows_before_filter = len(anchor_texts)
print("Before: {}".format(rows_before_filter))
anchor_texts = anchor_texts.dropna(subset=['anchor_text'])
print("After: {}".format(len(anchor_texts)))

Before: 15269229
After: 15265648


#### Join Page Data to Anchor Text Data

This provides us with information on page views and links.

In [11]:
%%time
# Merge at_count and article stats dataframes
anchor_texts = pd.merge(
    anchor_texts,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")

# Rename columns for clarity
anchor_texts = anchor_texts.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})

# Specify column ordering
anchor_texts = anchor_texts[[
    "norm_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]

# Display preview
anchor_texts.head(3)

CPU times: user 36.6 s, sys: 30.1 s, total: 1min 6s
Wall time: 1min 20s


Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559


# Develop Anchor Link Candidate Generation Models

## Anchor Link Frequency

This model generates a candidate pool of Wikipedia pages for each full mention by ranking the pages that the mention's string links to most frequently when used as anchor text.

In [12]:
%%time
# Sort dataframe by anchor text and then most frequently linked page
anchor_texts = anchor_texts.sort_values(['norm_anchor_text', 'anchor_target_count'], ascending=False)

CPU times: user 1min 21s, sys: 5 s, total: 1min 26s
Wall time: 1min 27s


In [13]:
%%time
# Return just the top N most linked entities to create our candidate pool for each anchor link
top_N = 10
anchor_text_link_frequency = anchor_texts.groupby('norm_anchor_text').head(top_N).reset_index(drop=True)

CPU times: user 58.3 s, sys: 5.38 s, total: 1min 3s
Wall time: 1min 6s


In [14]:
# Spot-check the candidate pool generated for the anchor text 'united states'
is_united_states = anchor_text_link_frequency['norm_anchor_text'] == 'united states'
anchor_text_link_frequency[is_united_states]

Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
950752,united states,3434750,30,United_States,460156,152451
950753,united states,582488,164134,United_States_men's_national_soccer_team,25804,1466
950754,united states,647757,334526,United_States_women's_national_soccer_team,12292,594
950755,united states,1145226,1143805,United_States_national_rugby_union_team,1165,352
950756,united states,945923,913651,United_States_men's_national_ice_hockey_team,2537,257
950757,united states,924170,279283,Elections_in_the_United_States,14936,243
950758,united states,378405,3054793,Secondary_education_in_the_United_States,6907,225
950759,united states,980450,2738955,United_States_national_cricket_team,2043,223
950760,united states,6311052,1389353,United_States_Davis_Cup_team,335,218
950761,united states,89611,244847,United_States_men's_national_basketball_team,10624,177


In [15]:
# Compare row counts before and after keeping the top N targets per anchor text
# (len(anchor_texts) counts anchor/target rows, not distinct anchor texts)
rows_before_top_n = len(anchor_texts)
rows_after_top_n = len(anchor_text_link_frequency)
print("Unique anchor links numbered {:,}".format(rows_before_top_n))
print("Remaining dataframe contains {:,} rows".format(rows_after_top_n))

Unique anchor links numbered 15,265,648
Remaining dataframe contains 14,009,323 rows


We did not reduce the dataframe by much, suggesting only a few anchor texts have more than our selected N number of distinct links. To append to our ACY Input data, we produce a dictionary of anchor text to its candidate pool.

In [16]:
# # In case of a prior run, load the saved JSON file instead of re-running the whole thing
# # Load dictionary
# with open('../../predictions/dict_anchor_pool_frequency.json', 'r') as filepath:
#     dict_anchor_pool_frequency = json.load(filepath)

In [17]:
%%time
# Group by anchor text to produce list of item IDs, page IDs and page titles (our candidate pools)
anchor_text_candidate_pools = anchor_text_link_frequency.groupby('norm_anchor_text')\
                                    [['target_page_id', 'target_page_title', 'target_item_id']]\
                                    .agg(lambda x: list(x)).reset_index()

CPU times: user 10min 52s, sys: 21.8 s, total: 11min 14s
Wall time: 11min 23s


In [18]:
%%time
## Add to dictionary for faster searching later in the pipeline

# Create dictionary
dict_anchor_pool_frequency = {}

# Add lists to dictionary with anchor text as search term
# This should match the full mention search when measuring accuracy later
for i in tqdm(range(len(anchor_text_candidate_pools))):
    row = anchor_text_candidate_pools.loc[i]
    dict_anchor_pool_frequency[row['norm_anchor_text']] = [row['target_page_id'], row['target_page_title'], row['target_item_id']]

100%|██████████| 11327029/11327029 [19:56<00:00, 9464.89it/s] 

CPU times: user 18min 16s, sys: 1min 11s, total: 19min 27s
Wall time: 19min 56s





#### Demonstrate search performance boost of dictionary

In [19]:
%%time
# Demonstrate search benefit of storing as dictionary:
# a single key lookup returns the [page_ids, page_titles, item_ids] lists stored for this anchor text
o = dict_anchor_pool_frequency['united states']

CPU times: user 4 µs, sys: 7 µs, total: 11 µs
Wall time: 12.9 µs


In [20]:
%%time
# Compare Pandas dataframe search to dictionary search:
# the boolean mask compares every row's norm_anchor_text against the query string
o = anchor_text_candidate_pools[anchor_text_candidate_pools['norm_anchor_text'] == 'united states']

CPU times: user 5.31 s, sys: 7.88 s, total: 13.2 s
Wall time: 16.3 s


In [21]:
# Persist the frequency dictionary as JSON so it can be reloaded without rebuilding
frequency_json_path = '../../predictions/dict_anchor_pool_frequency.json'
with open(frequency_json_path, 'w') as json_file:
    json.dump(dict_anchor_pool_frequency, json_file)

## Anchor Link Popularity

This model generates a candidate pool of Wikipedia pages for each full mention by taking the pages that the mention's string has linked to as anchor text and ranking them by page views.

In [22]:
%%time
# Sort dataframe by anchor text and then most frequently linked page
anchor_texts = anchor_texts.sort_values(['norm_anchor_text', 'target_page_views'], ascending=False)

CPU times: user 1min 7s, sys: 8.52 s, total: 1min 15s
Wall time: 1min 18s


In [23]:
%%time
# Return just the top N most viewed entities to create our candidate pool for each anchor link
top_N = 10
anchor_text_link_popularity = anchor_texts.groupby('norm_anchor_text').head(top_N).reset_index(drop=True)

CPU times: user 55.7 s, sys: 4.26 s, total: 59.9 s
Wall time: 1min


In [24]:
# Spot-check the candidate pool generated for the anchor text 'united states'
is_united_states = anchor_text_link_popularity['norm_anchor_text'] == 'united states'
anchor_text_link_popularity[is_united_states]

Unnamed: 0,norm_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
950752,united states,3434750,30,United_States,460156,152451
950753,united states,3434750,30,United_States,460156,5
950754,united states,63136490,83873577,COVID-19_pandemic_in_the_United_States,428030,31
950755,united states,58993617,41174436,2020_Formula_One_World_Championship,343066,1
950756,united states,44751865,19600530,Black_Lives_Matter,250974,1
950757,united states,12610470,1682357,List_of_states_and_territories_of_the_United_S...,185044,1
950758,united states,54803678,37093861,Antifa_(United_States),183516,1
950759,united states,1649321,131079,List_of_United_States_cities_by_population,163698,1
950760,united states,18618239,35657,U.S._state,155646,3
950761,united states,3356,1124,Bill_Clinton,155162,1


In [25]:
# Compare row counts before and after keeping the top N targets per anchor text
# (len(anchor_texts) counts anchor/target rows, not distinct anchor texts)
rows_before_top_n = len(anchor_texts)
rows_after_top_n = len(anchor_text_link_popularity)
print("Unique anchor links numbered {:,}".format(rows_before_top_n))
print("Remaining dataframe contains {:,} rows".format(rows_after_top_n))

Unique anchor links numbered 15,265,648
Remaining dataframe contains 14,009,323 rows


In [26]:
# # In case of a prior run, load the saved JSON file instead of re-running the whole thing
# # Load dictionary
# with open('../../predictions/dict_anchor_pool_popularity.json', 'r') as filepath:
#     dict_anchor_pool_popularity = json.load(filepath)

In [27]:
%%time
# Group by anchor text to produce list of item IDs, page IDs and page titles (our candidate pools)
anchor_text_candidate_pools = anchor_text_link_popularity.groupby('norm_anchor_text')\
                                    [['target_page_id', 'target_page_title', 'target_item_id']]\
                                    .agg(lambda x: list(x)).reset_index()

CPU times: user 11min 7s, sys: 2min 11s, total: 13min 18s
Wall time: 14min 26s


In [28]:
%%time
## Add to dictionary for faster searching later in the pipeline

# Create dictionary
dict_anchor_pool_popularity = {}

# Add lists to dictionary with anchor text as search term
# This should match the full mention search when measuring accuracy later
for i in tqdm(range(len(anchor_text_candidate_pools))):
    row = anchor_text_candidate_pools.loc[i]
    dict_anchor_pool_popularity[row['norm_anchor_text']] = [row['target_page_id'], row['target_page_title'], row['target_item_id']]

100%|██████████| 11327029/11327029 [20:57<00:00, 9009.42it/s] 


CPU times: user 18min 18s, sys: 1min 48s, total: 20min 6s
Wall time: 20min 57s


In [29]:
# Persist the popularity dictionary as JSON so it can be reloaded without rebuilding
popularity_json_path = '../../predictions/dict_anchor_pool_popularity.json'
with open(popularity_json_path, 'w') as json_file:
    json.dump(dict_anchor_pool_popularity, json_file)

# Assess Accuracy of Anchor Link Models without Congruence

For each full mention in our ACY input dataset, we now append the generated candidate pool as a column and save our predictions.

In [30]:
# Apply the anchor-text normalization to full mentions so they can be used as dictionary keys
raw_full_mentions = acy_input['full_mention']
acy_input['norm_full_mention'] = raw_full_mentions.map(normalize_text)

## Anchor Link Frequency

In [49]:
# Start the frequency predictions from an independent copy of the input
preds_anchor_frequency = acy_input.copy(deep=True)

In [50]:
# Look up the frequency-based candidate pool for every full mention
mention_candidate_pools_page_ids = []
mention_candidate_pools_item_ids = []
mention_candidate_pools_titles = []

# Number of mentions whose normalized text is not a key in the dictionary
oov_error = 0

for full_mention in tqdm(acy_input['norm_full_mention']):

    # Dictionary values are [page_ids, page_titles, item_ids]; a missing key
    # counts as out-of-vocabulary and yields None for all three pools
    pools = dict_anchor_pool_frequency.get(full_mention)
    if pools is None:
        oov_error += 1
        pools = (None, None, None)
    page_ids, titles, item_ids = pools

    # Collect the pools in input order
    mention_candidate_pools_page_ids.append(page_ids)
    mention_candidate_pools_item_ids.append(item_ids)
    mention_candidate_pools_titles.append(titles)

# Attach the pools as new columns of the predictions frame
preds_anchor_frequency['mention_candidate_pools_page_ids'] = mention_candidate_pools_page_ids
preds_anchor_frequency['mention_candidate_pools_item_ids'] = mention_candidate_pools_item_ids
preds_anchor_frequency['candidate_pools_titles'] = mention_candidate_pools_titles

100%|██████████| 29312/29312 [00:00<00:00, 86908.55it/s]


In [35]:
# Report how many mentions had no entry in the anchor-text dictionary
oov_message = f"We received {oov_error:,} Out-of-Vocabulary Errors."
print(oov_message)

We received 4,625 Out-of-Vocabulary Errors.


In [36]:
# Show the first three rows, including the newly attached candidate pool columns
preds_anchor_frequency.head(n=3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention,mention_candidate_pools_page_ids,mention_candidate_pools_item_ids,candidate_pools_titles
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu,"[9317, 9239, 21347120, 9477, 1882861, 3261189,...","[458, 46, 211593, 1396, 363404, 3327447, 40537...","[European_Union, Europe, Eu,_Seine-Maritime, E..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german,"[11867, 11884, 152735, 21212, 12674, 290327, 1...","[183, 188, 42884, 7318, 43287, 141817, 181287,...","[Germany, German_language, Germans, Nazi_Germa..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british,"[31717, 19097669, 13530298, 4721, 158019, 1522...","[145, 842438, 23666, 8680, 161885, 174193, 354...","[United_Kingdom, British_people, Great_Britain..."


In [39]:
# Calculate accuracy: share of mentions whose first candidate equals the gold page ID
# NOTE(review): the denominator is every row, including rows whose
# wikipedia_page_ID is missing and therefore can never match -- confirm this is intended.
accurate_predictions = 0
gold_and_pools = zip(
    preds_anchor_frequency['wikipedia_page_ID'],
    preds_anchor_frequency['mention_candidate_pools_page_ids'])
for gold_page_id, candidate_page_ids in gold_and_pools:
    # Mentions without a candidate pool cannot be correct; skip them explicitly
    # rather than relying on a swallowed TypeError
    if not isinstance(candidate_page_ids, (list, tuple)) or not candidate_page_ids:
        continue
    if gold_page_id == candidate_page_ids[0]:
        accurate_predictions += 1
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(preds_anchor_frequency) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 54.609%
****************************


In [40]:
# Calculate percentage of candidate pools with the correct answer present
# Necessary to determine if shuffling pool could even get the right answer
response_present = 0
gold_and_pools = zip(
    preds_anchor_frequency['wikipedia_page_ID'],
    preds_anchor_frequency['mention_candidate_pools_page_ids'])
for gold_page_id, candidate_page_ids in gold_and_pools:
    # Mentions without a candidate pool cannot contain the answer; skip them
    # explicitly rather than relying on a swallowed TypeError
    if not isinstance(candidate_page_ids, (list, tuple)):
        continue
    if gold_page_id in candidate_page_ids:
        response_present += 1
print(f"Correct answer is present in {round(response_present / len(preds_anchor_frequency) * 100, 3)}% of generated candidate pools via Anchor Links Frequency method.")

Correct answer is present in 68.661% of generated candidate pools via Anchor Links Frequency method.


In [42]:
# Directory where prediction files are written
preds_path = '../../predictions/'

# Write the frequency candidate pools to CSV without the index column
frequency_csv = os.path.join(preds_path, "anchortext_frequency.csv")
preds_anchor_frequency.to_csv(frequency_csv, index=False)

## Anchor Link Popularity

In [43]:
# Start the popularity predictions from an independent copy of the input
preds_anchor_popularity = acy_input.copy(deep=True)

In [51]:
# Look up the popularity-based candidate pool for every full mention
mention_candidate_pools_page_ids = []
mention_candidate_pools_item_ids = []
mention_candidate_pools_titles = []

# Number of mentions whose normalized text is not a key in the dictionary
oov_error = 0

for full_mention in tqdm(acy_input['norm_full_mention']):

    # Dictionary values are [page_ids, page_titles, item_ids]; a missing key
    # counts as out-of-vocabulary and yields None for all three pools
    pools = dict_anchor_pool_popularity.get(full_mention)
    if pools is None:
        oov_error += 1
        pools = (None, None, None)
    page_ids, titles, item_ids = pools

    # Collect the pools in input order
    mention_candidate_pools_page_ids.append(page_ids)
    mention_candidate_pools_item_ids.append(item_ids)
    mention_candidate_pools_titles.append(titles)

# Attach the pools as new columns of the predictions frame
preds_anchor_popularity['mention_candidate_pools_page_ids'] = mention_candidate_pools_page_ids
preds_anchor_popularity['mention_candidate_pools_item_ids'] = mention_candidate_pools_item_ids
preds_anchor_popularity['candidate_pools_titles'] = mention_candidate_pools_titles

100%|██████████| 29312/29312 [00:00<00:00, 159631.97it/s]


In [52]:
# Report how many mentions had no entry in the anchor-text dictionary
oov_message = f"We received {oov_error:,} Out-of-Vocabulary Errors."
print(oov_message)

We received 4,625 Out-of-Vocabulary Errors.


In [53]:
# Show the first three rows, including the newly attached candidate pool columns
preds_anchor_popularity.head(n=3)

Unnamed: 0,mention,full_mention,wikipedia_URL,wikipedia_page_ID,wikipedia_title,sentence_id,doc_id,congruent_mentions,norm_full_mention,mention_candidate_pools_page_ids,mention_candidate_pools_item_ids,candidate_pools_titles
0,B,EU,,,,0,0,"['EU', 'German', 'British']",eu,"[9317, 9239, 9891, 9472, 10890716, 2780146, 18...","[458, 46, 45003, 4916, 185441, 932442, 8268, 8...","[European_Union, Europe, Entropy, Euro, Member..."
1,B,German,http://en.wikipedia.org/wiki/Germany,11867.0,Germany,0,0,"['EU', 'German', 'British']",german,"[11867, 11867, 27318, 21148, 21212, 21212, 269...","[183, 183, 334, 55, 7318, 7318, 40, 12548, 825...","[Germany, Germany, Singapore, Netherlands, Naz..."
2,B,British,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,United Kingdom,0,0,"['EU', 'German', 'British']",british,"[3434750, 31717, 31717, 19344654, 26061, 85699...","[30, 145, 145, 9531, 172771, 1860, 21, 22, 868...","[United_States, United_Kingdom, United_Kingdom..."


In [54]:
# Calculate accuracy: share of mentions whose first candidate equals the gold page ID
# NOTE(review): the denominator is every row, including rows whose
# wikipedia_page_ID is missing and therefore can never match -- confirm this is intended.
accurate_predictions = 0
gold_and_pools = zip(
    preds_anchor_popularity['wikipedia_page_ID'],
    preds_anchor_popularity['mention_candidate_pools_page_ids'])
for gold_page_id, candidate_page_ids in gold_and_pools:
    # Mentions without a candidate pool cannot be correct; skip them explicitly
    # rather than relying on a swallowed TypeError
    if not isinstance(candidate_page_ids, (list, tuple)) or not candidate_page_ids:
        continue
    if gold_page_id == candidate_page_ids[0]:
        accurate_predictions += 1
print("****************************")
print(f"Predictive Accuracy: {round(accurate_predictions / len(preds_anchor_popularity) * 100, 3)}%")
print("****************************")

****************************
Predictive Accuracy: 46.571%
****************************


In [55]:
# Calculate percentage of candidate pools with the correct answer present
# Necessary to determine if shuffling pool could even get the right answer
response_present = 0
gold_and_pools = zip(
    preds_anchor_popularity['wikipedia_page_ID'],
    preds_anchor_popularity['mention_candidate_pools_page_ids'])
for gold_page_id, candidate_page_ids in gold_and_pools:
    # Mentions without a candidate pool cannot contain the answer; skip them
    # explicitly rather than relying on a swallowed TypeError
    if not isinstance(candidate_page_ids, (list, tuple)):
        continue
    if gold_page_id in candidate_page_ids:
        response_present += 1
print(f"Correct answer is present in {round(response_present / len(preds_anchor_popularity) * 100, 3)}% of generated candidate pools via Anchor Links popularity method.")

Correct answer is present in 67.535% of generated candidate pools via Anchor Links popularity method.


In [56]:
# Write the popularity candidate pools to CSV without the index column
popularity_csv = os.path.join(preds_path, "anchortext_popularity.csv")
preds_anchor_popularity.to_csv(popularity_csv, index=False)