In [1]:
# Import necessary packages
# Ensure installation of nltk package in conda environment
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Add directory above current directory to path so we can import our pre-built package
import sys; sys.path.insert(0, '../..')
from entity_disambiguation.preprocessing import process_input, normalize_text

from nltk.corpus import stopwords

from tqdm import tqdm

In [2]:
try: 
    stop = stopwords.words('english')
except LookupError:
    # Download stopwords if it's your first time
    import nltk
    nltk.download('stopwords')

## 1. Process Aida-Conll-Yago (ACY) dataset into train/test split

In [3]:
# relative path to ACY dataset, should be the same for all users within `entity-disambiguation` repository
acy_path = '../../data/aida-conll-yago-dataset/'

# process_input() takes path location where you've stored AIDA-YAGO2-DATASET.tsv file
# Transforms tsv file into train, test split
# x are words, y are indices
train_x, train_y, test_x, test_y = process_input(acy_path, train = .8)

In [4]:
# Display shape of output train data
display((train_x.shape, train_y.shape))
display((test_x.shape, test_y.shape))

((17805, 2), (17805,))

((4452, 2), (4452,))

In [5]:
# Display x preview
train_x.head(3)

Unnamed: 0,token,in_between_word_count
0,german,2
1,british,3
2,brussels,6


In [6]:
# Display y preview
train_y[0:3]

array(['11867', '31717', '3708'], dtype='<U8')

In [7]:
# Save train x/y into numpy arrays
x_np = train_x.token.values
y_np = np.array(train_y, dtype = 'int64')

## 2. Develop Baseline models:
### (i) anchor links and (ii) page views
Load `KWNLP` dataset to create our baseline model, i.e. calculate the anchor linkage statistics for Wikipedia pages. Then we apply that to the ACY dataset in the next section.

**TODO Note:** These baselines so far only consider *exact* matches between a mention in ACY and an anchor text in KWNLP.

In [8]:
# Provide directory path for KWNLP data
# Should be same for all users of `entity-disambiguation` repository
kwnlp_path = '../../data/kwnlp'

# Load article data
article_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-article.csv'))

# Load anchor target counts data
anchor_df = pd.read_csv(os.path.join(kwnlp_path, 'kwnlp-enwiki-20200920-anchor-target-counts.csv'))

In [9]:
# Display article preview
article_df.head()

Unnamed: 0,page_id,item_id,page_title,views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,12,6199,Anarchism,35558,40449,409,3826,371,1,0,0,0,0,0,0
1,25,38404,Autism,40081,47659,419,2313,309,0,1,0,0,0,0,0
2,39,101038,Albedo,10770,18766,293,3090,115,0,0,0,0,0,0,0
3,290,9659,A,29398,9538,609,173,149,0,0,0,0,0,0,0
4,303,173,Alabama,46680,74276,369,11864,744,0,0,0,0,0,0,0


In [10]:
# Display anchor preview
anchor_df.head()

Unnamed: 0,anchor_text,target_page_id,count
0,United States,3434750,152451
1,World War II,32927,133668
2,India,14533,112069
3,France,5843419,109669
4,footballer,10568,101027


In [11]:
# Copy anchor_df to new dataframe
at_count_df = anchor_df.copy()

# Normalize anchor_text (lower-case, strip whitespace)
at_count_df["normalized_anchor_text"] = at_count_df["anchor_text"].apply(normalize_text)

# Return all anchor_texts that are non-zero, non-null
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'].str.len() > 0, :]

print(len(at_count_df))
at_count_df.head(3)

15269229


Unnamed: 0,anchor_text,target_page_id,count,normalized_anchor_text
0,United States,3434750,152451,united states
1,World War II,32927,133668,world war ii
2,India,14533,112069,india


Inner join anchor-target data (mention to linked entity) with Wikipedia page article data. This lets us collate stats like page views with the target (entity) of mentions. Page views serves as another baseline model by selecting the page/entity that is most viewed for that anchor text.

In [12]:
# Merge at_count and article stats dataframes
at_count_df = pd.merge(
    at_count_df,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")

# Rename columns for clarity
at_count_df = at_count_df.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})

# Specify column ordering
at_count_df = at_count_df[[
    "normalized_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]

# Display preview
at_count_df.head(3)

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559


Drop NaNs. These have been encoded as `string` through the text normalisation previously.

In [13]:
# Drop NaNs
len_orig = len(at_count_df)
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'] != 'nan']
print('Dropped rows:', len_orig - len(at_count_df))

Dropped rows: 3596


### i. Anchor Links
`pandas` `merge` and `join` can't be used with indices with duplicate values, as it automatically sorts and that can't be undone confidently with duplicates.

In [14]:
%%time
# Sort all anchor links by anchor text and then target count
max_anchor_links = at_count_df.sort_values(['normalized_anchor_text', 'anchor_target_count'], ascending = False)
# Keep just most common value (top value after sort)
max_anchor_links.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)
# Update index after drops
max_anchor_links.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_anchor_links.head(3))
print('Removed {} mentions with targets with lower anchor counts'.format(len(at_count_df)-len(max_anchor_links)))
assert len(max_anchor_links) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


Removed 3936693 mentions with targets with lower anchor counts
CPU times: user 1min 47s, sys: 35.5 s, total: 2min 22s
Wall time: 2min 54s


In [15]:
# Create text-to-page-to-title dataframe (as predictions)
preds_anchor = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

# tqdm is a progress bar package
# For all anchor texts, update with predictions
for i in tqdm(range(len(preds_anchor))):
    try:
        preds_anchor.iloc[i, 1:3] = max_anchor_links.loc[preds_anchor.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 17805/17805 [00:37<00:00, 479.23it/s] 


In [16]:
# Display entity disambiguation predictions using anchor link statistics
preds_anchor

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,31717,United_Kingdom
2,brussels,3708,Brussels
3,european,13279542,Ethnic_groups_in_Europe
4,commission,60925,Ship_commissioning
...,...,...,...
17800,michael,318621,Michael_(archangel)
17801,johnson,2327783,Johnson_(composer)
17802,u.s.,3434750,United_States
17803,ato,1769301,Automatic_train_operation


### ii. Page Views


In [17]:
# Display dataframe preview
at_count_df

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559
3,u.s.,3434750,30,United_States,460156,7633
4,us,3434750,30,United_States,460156,5288
...,...,...,...,...,...,...
15269224,garfield lake,47208504,20707378,Garfield_Lake,10,1
15269225,aert van der goes,38402950,381796,Aert_van_der_Goes,18,1
15269226,chimanbhai mehta,65368350,-1,Chimanbhai_Mehta,0,1
15269227,urmilaben chimanbhai patel,55927372,44128896,Urmilaben_Chimanbhai_Patel,9,1


In [18]:
%%time
# Sort dataframe by anchor text and page views, remove duplicates except top (most popular) and update index
max_page_views = at_count_df.sort_values(['normalized_anchor_text', 'target_page_views'], ascending = False)
max_page_views.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)
max_page_views.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_page_views.head(3))
assert len(max_page_views) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


CPU times: user 1min 42s, sys: 24.4 s, total: 2min 7s
Wall time: 2min 25s


In [19]:
# Create prediction dataframe with most popular views
preds_page = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

# tqdm is progress bar
# For every anchor text, append prediction using page views
for i in tqdm(range(len(preds_page))):
    try:
        preds_page.iloc[i, 1:3] = max_page_views.loc[preds_page.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 17805/17805 [00:41<00:00, 430.79it/s] 


In [20]:
# Display predictions preview
preds_page

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,3434750,United_States
2,brussels,5843419,France
3,european,44220,UEFA_Champions_League
4,commission,21875,Nuremberg_trials
...,...,...,...
17800,michael,20455,Michael_Jordan
17801,johnson,54533,Lyndon_B._Johnson
17802,u.s.,3434750,United_States
17803,ato,42563745,War_in_Donbass


## Scoring

In [21]:
print('-- Accuracy on entire ACY dataset using KWNLP --')
print('Anchor Linking: {}%'.format(np.round(100*np.mean(preds_anchor.entity_page_id == y_np), 2)))
print('Page Views: {}%'.format(np.round(100*np.mean(preds_page.entity_page_id == y_np), 2)))


-- Accuracy on entire ACY dataset using KWNLP --
Anchor Linking: 33.14%
Page Views: 29.31%


**TODO:** Currently composite mentions in ACY are split into separate mentions as part of `preprocessing`. This probably contributes to lowering accuracy, as e.g. "United States" becomes "united" and "states", two separate mentions to be linked.