In [31]:
import sys

import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
%matplotlib inline
#downloading wikipedia2vec failed in my case
from nltk.corpus import stopwords

## load in anchor stats

In [2]:
my_path = '../../kwnlp/'
article_df = pd.read_csv(os.path.join(my_path, 'article.csv'))
article_df.head()

Unnamed: 0,page_id,item_id,page_title,views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,12,6199,Anarchism,35558,40449,409,3826,371,1,0,0,0,0,0,0
1,25,38404,Autism,40081,47659,419,2313,309,0,1,0,0,0,0,0
2,39,101038,Albedo,10770,18766,293,3090,115,0,0,0,0,0,0,0
3,290,9659,A,29398,9538,609,173,149,0,0,0,0,0,0,0
4,303,173,Alabama,46680,74276,369,11864,744,0,0,0,0,0,0,0


In [3]:
anchor_df = pd.read_csv(os.path.join(my_path, 'anchor_target_counts.csv'))
anchor_df.head()

Unnamed: 0,anchor_text,target_page_id,count
0,United States,3434750,152451
1,World War II,32927,133668
2,India,14533,112069
3,France,5843419,109669
4,footballer,10568,101027


In [4]:
at_count_df = anchor_df.copy()
def text_normalizer(text):                              
    """Return text after stripping external whitespace and lower casing."""   
    return str(text).strip().lower()

at_count_df["normalized_anchor_text"] = at_count_df["anchor_text"].apply(text_normalizer)
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'].str.len() > 0, :]
at_count_df

Unnamed: 0,anchor_text,target_page_id,count,normalized_anchor_text
0,United States,3434750,152451,united states
1,World War II,32927,133668,world war ii
2,India,14533,112069,india
3,France,5843419,109669,france
4,footballer,10568,101027,footballer
...,...,...,...,...
15269224,Sheldon,1299850,1,sheldon
15269225,Korunamoyee Rani Rashmoni,59216176,1,korunamoyee rani rashmoni
15269226,Musik-Lexikon,64672172,1,musik-lexikon
15269227,Su-27,2733709,1,su-27


In [5]:
at_count_df = pd.merge(
    at_count_df,
    article_df,
    how="inner",
    left_on="target_page_id",
    right_on="page_id")
at_count_df = at_count_df.rename(columns={
    'title': 'target_page_title',
    'item_id': 'target_item_id',
    'views': 'target_page_views',
    'count': 'anchor_target_count',
    'page_title': 'target_page_title'})
at_count_df.head()

Unnamed: 0,anchor_text,target_page_id,anchor_target_count,normalized_anchor_text,page_id,target_item_id,target_page_title,target_page_views,len_article_chars,len_intro_chars,in_link_count,out_link_count,tmpl_good_article,tmpl_featured_article,tmpl_pseudoscience,tmpl_conspiracy_theories,isa_Q17442446,isa_Q14795564,isa_Q18340514
0,United States,3434750,152451,united states,3434750,30,United_States,460156,78654,527,250765,1144,0,0,0,0,0,0,0
1,American,3434750,65722,american,3434750,30,United_States,460156,78654,527,250765,1144,0,0,0,0,0,0,0
2,USA,3434750,8559,usa,3434750,30,United_States,460156,78654,527,250765,1144,0,0,0,0,0,0,0
3,U.S.,3434750,7633,u.s.,3434750,30,United_States,460156,78654,527,250765,1144,0,0,0,0,0,0,0
4,US,3434750,5288,us,3434750,30,United_States,460156,78654,527,250765,1144,0,0,0,0,0,0,0


In [6]:
at_count_df = at_count_df[[
    "normalized_anchor_text",
    "target_page_id",
    "target_item_id",
    "target_page_title",
    "target_page_views",
    "anchor_target_count"]]
at_count_df.head()

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count
0,united states,3434750,30,United_States,460156,152451
1,american,3434750,30,United_States,460156,65722
2,usa,3434750,30,United_States,460156,8559
3,u.s.,3434750,30,United_States,460156,7633
4,us,3434750,30,United_States,460156,5288


In [7]:
#Save this csv for future use
at_count_df.to_csv(os.path.join(my_path, 'count.csv'), index = False)

In [21]:
at_count_df = pd.read_csv(os.path.join(my_path, 'count.csv'))

## Load in test data

In [8]:
import csv
# df_acy = dd.read_csv('../../aida-conll-yago-dataset/AIDA-YAGO2-DATASET.tsv', sep='\t',dtype='object').compute()
# res = df.infer_objects()

tsv_file = open('../../aida-conll-yago-dataset/AIDA-YAGO2-DATASET.tsv')
read_tsv = csv.reader(tsv_file, delimiter="\t")
df = []
for row in read_tsv:
    df.append(row)

In [9]:
acy_df = pd.DataFrame(data = df[1:])
new = ['token', 'mention', 'full_mention', 'YAGO2', 'wikipedia_URL', 'wikipedia_ID', 'freebase']
acy_df = acy_df.rename(columns = dict(zip(range(7), new)))
acy_df.head()

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase
0,EU,B,EU,--NME--,,,
1,rejects,,,,,,
2,German,B,German,Germany,http://en.wikipedia.org/wiki/Germany,11867.0,/m/0345h
3,call,,,,,,
4,to,,,,,,


In [10]:
#split into 80% train and 20% test
#test has to be the last 20% since text data is consistent
train = acy_df.copy()[:int(0.8*len(acy_df))]
test = acy_df.copy()[int(0.8*len(acy_df)):]
test['token'] = [str(i) for i in test.token]
test = test[test.token!='None']

In [19]:
X_train = list(train['token'])
y_train = list(train['wikipedia_ID']) #same as target_page_id

X_test = list(test['token'])
y_test = list(test['wikipedia_ID'])

In [14]:
len(test)

33172

## Baseline Model

Basic idea: If this mention has occured in normalized_anchor_text, we map it to the wikipedia entity with highest anchor_target_count

In [16]:
#create some lists so that we don't have to reconstruct them again throughout the loops
anchor_texts = list(at_count_df['normalized_anchor_text'])
anchor_texts2 = ' '.join(anchor_texts).split(' ')
anchor_target_count = list(at_count_df['anchor_target_count'])
page_id = list(at_count_df['target_page_id'])

In [35]:
#We need to remove stopwords and some common punctuations
stop = stopwords.words('english')
print(stopwords.words('english'))
punc = ['!', '"', '#', '%', '&', '(', ')', '*', '+', ',', ' ', '-', '.', '/', ':', ';', '<', '=', 
        '>', '?', '@', '[', '\\', ']', '^', '_', '`', '\{', '\|', '\}', '\~']

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [36]:
def baseline_predict(tokens):
    predictions = []
    for i in range(len(tokens)):
        if i%100 == 0:
            print('at {}..'.format(i))
        #first normalize
        t = tokens[i].lower()
        #check if it's in stopwords
        if t in stop or t in punc:
            predictions.append(None)
        #match to anchor texts
        elif t in anchor_texts:
            target_df = at_count_df[at_count_df['normalized_anchor_text']==t]
            if (len(target_df)>1):
                #pick the one with highest anchor_target_count
                am = np.argmax(np.array(target_df['anchor_target_count']))
                result = list(target_df['target_page_id'])[am]
                predictions.append(result)
            else:
                predictions.append(list(target_df.target_page_id)[0])
        #if it doesn't match, check if it's a beginning of mention
        elif t in anchor_texts2:
            #some candidates
            can_i = []
            len_i = []
            for j in range(len(anchor_texts)):
                if anchor_texts[j].split(' ')[0] == t:
                    #check if it matches full mention
                    if anchor_texts[j]==' '.join(tokens[i:i+len(anchor_texts[j].split(' '))]).lower().strip():
                        #if so, append to candidates
                        can_i.append(j)
                        len_i.append(len(anchor_texts[j].split(' ')))
            if len(can_i)==0:
                predictions.append(None)
            else:
                #these candidates' anchor_target_count
                tar_i = [anchor_target_count[x] for x in can_i]
                am = np.argmax(np.array(tar_i))
                #get target_page_id
                res = page_id[can_i[am]]
                #append to predictions
                predictions += [res]*len_i[am]
                #update i
                i += len_i[am]
        else:
            predictions.append(None)
    return predictions

In [37]:
#only 1000 because it takes a long time
y_pred = baseline_predict(X_test[:1000])

at 0..
at 100..
at 200..
at 300..
at 400..
at 500..
at 600..
at 700..
at 800..
at 900..


In [39]:
len(y_pred)

1000

In [38]:
#accuracy
acc = sum([y_pred[i]==y_test[i] for i in range(len(y_pred))])/len(y_pred)
acc

0.446

In [45]:
num_nones = sum([(y_pred[i]==None and y_test[i]!=None) for i in range(len(y_pred))])
print('Should have some value but was classified as NULL: ', num_nones)
num_unnones = sum([(y_pred[i]!=None and y_test[i]==None) for i in range(len(y_pred))])
print('Should be NULL but was misclassified: ', num_unnones)

Should have some value but was classified as NULL:  1
Should be NULL but was misclassified:  494
