In [1]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Add directory above current directory to path so we can import our pre-built package
import sys; sys.path.insert(0, '../..')
from entity_disambiguation.preprocessing import process_input, normalize_text

from nltk.corpus import stopwords

from tqdm import tqdm

In [2]:
try: 
    stop = stopwords.words('english')
except LookupError:
    # Download stopwords if it's your first time
    import nltk
    nltk.download('stopwords')

## 1. Process input

In [3]:
# relative path to ACY dataset, should be the same for all users
acy_path = '../../data/aida-conll-yago-dataset/'

train_x, train_y, test_x, test_y = process_input(acy_path, train = 1.0)

In [4]:
display((train_x.shape, train_y.shape))
display((test_x.shape, test_y.shape))

((22257, 2), (22257,))

((0, 2), (0,))

In [5]:
train_x.head(3)

Unnamed: 0,token,in_between_word_count
0,german,2
1,british,3
2,brussels,6


In [6]:
train_y[0:3]

array(['11867', '31717', '3708'], dtype='<U8')

In [7]:
x_np = train_x.token.values
y_np = np.array(train_y, dtype = 'int64')

## 2. Baseline models: (i) anchor links and (ii) page views
Load `KWNLP` dataset to create our baseline model, i.e. calculate the anchor linkage statistics for Wikipedia pages. Then we apply that to the ACY dataset in the next section.

**Note:** These baselines so far only consider *exact* matches between a mention in ACY and an anchor text in KWNLP.

In [8]:
kwnlp_path = '../../data/kwnlp'

article_df = pd.read_csv(os.path.join(kwnlp_path, 'article.csv'))

anchor_df = pd.read_csv(os.path.join(kwnlp_path, 'anchor-target-counts.csv'))

In [9]:
at_count_df = anchor_df.copy()

at_count_df["normalized_anchor_text"] = at_count_df["anchor_text"].apply(normalize_text)

at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'].str.len() > 0, :]

print(len(at_count_df))
at_count_df.head(3)

15269229


Unnamed: 0,anchor_text,target_page_id,count,normalized_anchor_text
0,United States,3434750,152451,united states
1,World War II,32927,133668,world war ii
2,India,14533,112069,india


Inner join anchor-target data (mention to linked entity) with Wikipedia page article data. This lets us collate stats like page views with the target (entity) of mentions.

In [10]:
at_count_df = pd.merge(
    at_count_df,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")

at_count_df = at_count_df.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})

at_count_df = at_count_df[[
    "normalized_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]

at_count_df.head(3)

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559


Drop NaNs. These have been encoded as `string` through the text normalisation previously.

In [11]:
len_orig = len(at_count_df)
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'] != 'nan']
print('Dropped rows:', len_orig - len(at_count_df))

Dropped rows: 3596


### i. Anchor Links
`pandas` `merge` and `join` can't be used with indices with duplicate values, as it automatically sorts and that can't be undone confidently with duplicates.

In [12]:
%%time
max_anchor_links = at_count_df.sort_values(['normalized_anchor_text', 'anchor_target_count'], ascending = False)
max_anchor_links.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)

max_anchor_links.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_anchor_links.head(3))
print('Removed {} mentions with targets with lower anchor counts'.format(len(at_count_df)-len(max_anchor_links)))
assert len(max_anchor_links) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


Removed 3936693 mentions with targets with lower anchor counts
CPU times: user 2min, sys: 4.67 s, total: 2min 5s
Wall time: 2min 6s


In [13]:
preds_anchor = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

for i in tqdm(range(len(preds_anchor))):
    try:
        preds_anchor.iloc[i, 1:3] = max_anchor_links.loc[preds_anchor.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 22257/22257 [00:26<00:00, 834.42it/s] 


In [14]:
preds_anchor

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,31717,United_Kingdom
2,brussels,3708,Brussels
3,european,13279542,Ethnic_groups_in_Europe
4,commission,60925,Ship_commissioning
...,...,...,...
22252,england,9316,England
22253,1966,34691,1966
22254,world,38714,World
22255,cup,603997,Norwegian_Football_Cup


### ii. Page Views


In [15]:
at_count_df

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559
3,u.s.,3434750,30,United_States,460156,7633
4,us,3434750,30,United_States,460156,5288
...,...,...,...,...,...,...
15269224,garfield lake,47208504,20707378,Garfield_Lake,10,1
15269225,aert van der goes,38402950,381796,Aert_van_der_Goes,18,1
15269226,chimanbhai mehta,65368350,-1,Chimanbhai_Mehta,0,1
15269227,urmilaben chimanbhai patel,55927372,44128896,Urmilaben_Chimanbhai_Patel,9,1


In [16]:
%%time
max_page_views = at_count_df.sort_values(['normalized_anchor_text', 'target_page_views'], ascending = False)
max_page_views.drop_duplicates('normalized_anchor_text', keep = 'first', inplace = True)

max_page_views.set_index('normalized_anchor_text', inplace = True)

# Show top rows
display(max_page_views.head(3))
assert len(max_page_views) == len(set(at_count_df.normalized_anchor_text.values))


Unnamed: 0_level_0,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
normalized_anchor_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𨳊,12274636,837751,Cantonese_profanity,4897,1
🧀,11749910,10943,Cheese,22110,1
🤙,6633641,1703272,List_of_gestures,12235,2


CPU times: user 1min 54s, sys: 4.27 s, total: 1min 58s
Wall time: 1min 58s


In [17]:
preds_page = pd.DataFrame({'mention': x_np, 
                             'entity_page_id': None,
                             'target_page_title': None
                            })

for i in tqdm(range(len(preds_page))):
    try:
        preds_page.iloc[i, 1:3] = max_page_views.loc[preds_page.iloc[i, 0], ['target_page_id', 'target_page_title']].values
    except KeyError:
        # Leave at default None values
        continue

100%|██████████| 22257/22257 [00:26<00:00, 834.85it/s] 


In [18]:
preds_page

Unnamed: 0,mention,entity_page_id,target_page_title
0,german,11867,Germany
1,british,3434750,United_States
2,brussels,5843419,France
3,european,44220,UEFA_Champions_League
4,commission,21875,Nuremberg_trials
...,...,...,...
22252,england,31717,United_Kingdom
22253,1966,298705,Batman_(TV_series)
22254,world,32927,World_War_II
22255,cup,11237,FA_Cup


## Scoring

In [19]:
print('-- Accuracy on entire ACY dataset using KWNLP --')
print('Anchor Linking: {}%'.format(np.round(100*np.mean(preds_anchor.entity_page_id == y_np), 2)))
print('Page Views: {}%'.format(np.round(100*np.mean(preds_page.entity_page_id == y_np), 2)))


-- Accuracy on entire ACY dataset using KWNLP --
Anchor Linking: 33.03%
Page Views: 29.49%


**TODO:** Currently composite mentions in ACY are split into separate mentions as part of `preprocessing`. This probably contributes to lowering accuracy, as e.g. "United States" becomes "united" and "states", two separate mentions to be linked.