# Basic Pipeline

Our basic pipeline consists of using **Anchor Link** statistics to link an entity mention in the dataset ACY to a Wikipedia page ID in the Kensho-derived Knowledge Graph.

In [1]:
# Import necessary packages
# Ensure installation of nltk package in conda environment
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Add directory above current directory to path so we can import our pre-built package
import sys; sys.path.insert(0, '../..')
from entity_disambiguation.preprocessing import process_input, normalize_text

from nltk.corpus import stopwords

from tqdm import tqdm

In [2]:
try: 
    stop = stopwords.words('english')
except LookupError:
    # Download stopwords if it's your first time
    import nltk
    nltk.download('stopwords')

## 1. Process Aida-Conll-Yago (ACY) dataset into train/test split

We import ACY, provide the ability to split it into train and test (though don't use this yet) and normalize full_mention to lower cases with spaces.

In [31]:
# relative path to ACY dataset, should be the same for all users within `entity-disambiguation` repository
acy_path = '../../data/aida-conll-yago-dataset/'

# process_input() takes path location where you've stored AIDA-YAGO2-DATASET.tsv file
# Transforms tsv file into train, test split
# x are words, y are indices
train_x, train_y, test_x, test_y = process_input(acy_path, match="full_mention", train = 1.)

In [32]:
# Display shape of output train data
display((train_x.shape, train_y.shape))
display((test_x.shape, test_y.shape))

((22257, 2), (22257,))

((0, 2), (0,))

In [33]:
# Display x preview
train_x.head(3)

Unnamed: 0,full_mention,in_between_word_count
0,German,2
1,British,3
2,BRUSSELS,6


In [34]:
# Display y preview
train_y[0:3]

array(['11867', '31717', '3708'], dtype='<U8')

In [39]:
# Save train x/y into numpy arrays
x_np = train_x.full_mention.values
x_np = np.array([x.lower() if x is not None else x for x in x_np])
y_np = np.array(train_y, dtype = 'int64')

In [42]:
# Preview x_np
x_np[:5], y_np[:5]

(array(['german', 'british', 'brussels', 'european commission',
        'european commission'], dtype='<U51'),
 array([11867, 31717,  3708,  9974,  9974]))

## Build Kensho Target Dataset

Load `KWNLP` dataset to create our baseline model, i.e. calculate the anchor linkage statistics for Wikipedia pages. Then we apply that to the ACY dataset in the next section.

These baselines so far only consider *exact* matches between a full mention in ACY and an anchor text in KWNLP.

In [8]:
# Provide directory path for KWNLP data
# Should be same for all users of `entity-disambiguation` repository
kwnlp_path = '../../data/kwnlp'

# Load article data
article_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-article.csv'))

# Load anchor target counts data
anchor_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-anchor-target-counts.csv'))

In [9]:
# Display article preview
article_df.head()

Unnamed: 0,page_id,item_id,page_title,views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,12,6199,Anarchism,35558,40449,409,3826,371,1,0,0,0,0,0,0
1,25,38404,Autism,40081,47659,419,2313,309,0,1,0,0,0,0,0
2,39,101038,Albedo,10770,18766,293,3090,115,0,0,0,0,0,0,0
3,290,9659,A,29398,9538,609,173,149,0,0,0,0,0,0,0
4,303,173,Alabama,46680,74276,369,11864,744,0,0,0,0,0,0,0


In [10]:
# Display anchor preview
anchor_df.head()

Unnamed: 0,anchor_text,target_page_id,count
0,United States,3434750,152451
1,World War II,32927,133668
2,India,14533,112069
3,France,5843419,109669
4,footballer,10568,101027


In [12]:
# Copy anchor_df to new dataframe
at_count_df = anchor_df.copy()

# Normalize anchor_text (lower-case, strip whitespace)
at_count_df["normalized_anchor_text"] = at_count_df["anchor_text"].apply(normalize_text)

# Return all anchor_texts that are non-zero, non-null
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'].str.len() > 0, :]

print(len(at_count_df))
at_count_df.head(3)

15269229


Unnamed: 0,anchor_text,target_page_id,count,normalized_anchor_text
0,United States,3434750,152451,united states
1,World War II,32927,133668,world war ii
2,India,14533,112069,india


Inner join anchor-target data (mention to linked entity) with Wikipedia page article data. This lets us collate stats like page views with the target (entity) of mentions. Page views serves as another baseline model by selecting the page/entity that is most viewed for that anchor text.

In [13]:
# Merge at_count and article stats dataframes
at_count_df = pd.merge(
    at_count_df,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")

# Rename columns for clarity
at_count_df = at_count_df.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})

# Specify column ordering
at_count_df = at_count_df[[
    "normalized_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]

# Display preview
at_count_df.head(3)

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559


Drop NaNs. These have been encoded as `string` through the text normalisation previously.

In [14]:
# Drop NaNs
len_orig = len(at_count_df)
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'] != 'nan']
print('Dropped rows:', len_orig - len(at_count_df))

Dropped rows: 3596


This leaves us with our final target baseline. We will now select a page_id for each normalized_anchor_text based on target link count or page views and then try to join full_mention with normalized_anchor_text.

# Develop Two Baseline Models

### i. Anchor Link Count
`pandas` `merge` and `join` can't be used with indices with duplicate values, as it automatically sorts and that can't be undone confidently with duplicates.

In [15]:
%%time
# Sort all anchor links by anchor text and then target count
max_anchor_links = at_count_df.sort_values(['normalized_anchor_text', 'anchor_target_count'], ascending = False)
# Keep just most common value (top value after sort)
max_anchor_links.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)
# Update index after drops
max_anchor_links.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_anchor_links.head(3))
print('Removed {} mentions with targets with lower anchor counts'.format(len(at_count_df)-len(max_anchor_links)))
assert len(max_anchor_links) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


Removed 3936693 mentions with targets with lower anchor counts
CPU times: user 1min 49s, sys: 10.3 s, total: 2min
Wall time: 2min 3s


In [19]:
# Preview 10 random rows
random_int = np.random.randint(len(max_anchor_links)) # Useful example: 4395177
display(max_anchor_links[random_int:random_int+10][['target_page_id', 'target_page_title', 'anchor_target_count']])

Unnamed: 0_level_0,target_page_id,target_page_title,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
morgan simon,60680011,Morgan_Simon,1
morgan simmons,26555995,Morgan_Simmons,1
morgan silver dollars,2068555,Morgan_dollar,1
morgan silver dollar,2068555,Morgan_dollar,4
morgan shuster,1570302,William_Morgan_Shuster,11
morgan sheppard,8967390,W._Morgan_Sheppard,4
morgan shepherd,1201512,Morgan_Shepherd,541
morgan shepard,32884535,John_Martin's_Book,4
morgan seth earp,812231,Morgan_Earp,1
morgan scroggy,28458683,Morgan_Scroggy,2


In [20]:
# Print previous random_int in case you want to explore further
print(random_int)

1256827


The above is our linkage of text to Wikipedia page based on link count.

To create our predictions, we initialize a new DataFrame with our inputs x_np, then for each input (full_mention), search max_anchor_links for matching anchor text string and add the associated page_id and target_page_title. 

In [44]:
# Create text-to-page-to-title dataframe (as predictions)
preds_anchor = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

# tqdm is a progress bar package
# For all anchor texts, update with predictions
for i in tqdm(range(len(preds_anchor))):
    try:
        preds_anchor.iloc[i, 1:3] = max_anchor_links.loc[preds_anchor.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 22257/22257 [00:20<00:00, 1079.35it/s]


In [45]:
# Display entity disambiguation predictions using anchor link statistics
preds_anchor

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,31717,United_Kingdom
2,brussels,3708,Brussels
3,european commission,9974,European_Commission
4,european commission,9974,European_Commission
...,...,...,...
22252,england,9316,England
22253,1966 world cup,61629,1966_FIFA_World_Cup
22254,1966 world cup,61629,1966_FIFA_World_Cup
22255,1966 world cup,61629,1966_FIFA_World_Cup


### ii. Page Views


In [23]:
# Display dataframe preview
at_count_df

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559
3,u.s.,3434750,30,United_States,460156,7633
4,us,3434750,30,United_States,460156,5288
...,...,...,...,...,...,...
15269224,garfield lake,47208504,20707378,Garfield_Lake,10,1
15269225,aert van der goes,38402950,381796,Aert_van_der_Goes,18,1
15269226,chimanbhai mehta,65368350,-1,Chimanbhai_Mehta,0,1
15269227,urmilaben chimanbhai patel,55927372,44128896,Urmilaben_Chimanbhai_Patel,9,1


Starting with the same dataframe as before, we now sort by page views and take the top result by page views.

In [24]:
%%time
# Sort dataframe by anchor text and page views, remove duplicates except top (most popular) and update index
max_page_views = at_count_df.sort_values(['normalized_anchor_text', 'target_page_views'], ascending = False)
max_page_views.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)
max_page_views.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_page_views.head(3))
assert len(max_page_views) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


CPU times: user 1min 56s, sys: 14.8 s, total: 2min 11s
Wall time: 2min 17s


In [46]:
# Create prediction dataframe with most popular views
preds_page = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

# tqdm is progress bar
# For every anchor text, append prediction using page views
for i in tqdm(range(len(preds_page))):
    try:
        preds_page.iloc[i, 1:3] = max_page_views.loc[preds_page.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 22257/22257 [00:20<00:00, 1108.20it/s]


In [47]:
# Display predictions preview
preds_page

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,3434750,United_States
2,brussels,5843419,France
3,european commission,9974,European_Commission
4,european commission,9974,European_Commission
...,...,...,...
22252,england,31717,United_Kingdom
22253,1966 world cup,61629,1966_FIFA_World_Cup
22254,1966 world cup,61629,1966_FIFA_World_Cup
22255,1966 world cup,61629,1966_FIFA_World_Cup


## Scoring

In [48]:
# Accuracy matching full mention with anchor text
print('-- Accuracy on entire ACY dataset using KWNLP --')
print('Anchor Linking: {}%'.format(np.round(100*np.mean(preds_anchor.entity_page_id == y_np), 2)))
print('Page Views: {}%'.format(np.round(100*np.mean(preds_page.entity_page_id == y_np), 2)))

-- Accuracy on entire ACY dataset using KWNLP --
Anchor Linking: 71.92%
Page Views: 61.33%


In [27]:
# Accuracy matching single words
print('-- Accuracy on entire ACY dataset using KWNLP --')
print('Anchor Linking: {}%'.format(np.round(100*np.mean(preds_anchor.entity_page_id == y_np), 2)))
print('Page Views: {}%'.format(np.round(100*np.mean(preds_page.entity_page_id == y_np), 2)))


-- Accuracy on entire ACY dataset using KWNLP --
Anchor Linking: 33.14%
Page Views: 29.31%


If we match just tokens, we see:

Anchor Linking: 33.14%

Page Views: 29.31%

If we match full_mention, we see:

Anchor Linking: 71.92%

Page Views: 61.33%