In [1]:
import sys

import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
%matplotlib inline

## aida-conll-yago-dataset

Data Description

A dataset for named entity recognition and disambiguation (NERD). Its file format is described as follows:

> File Format
> -----------
>
> The format of the final file is the following:
>
> - Each document starts with a line: -DOCSTART- (<docid>)
> - Each following line represents a single token, sentences are separated by an empty line
>   
> Lines with tabs are tokens that are part of a mention:
> - column 1 is the token
> - column 2 is either B (beginning of a mention) or I (continuation of a mention)
> - column 3 is the full mention used to find entity candidates
> - column 4 is the corresponding YAGO2 entity (in YAGO encoding, i.e. unicode characters are backslash encoded and spaces are replaced by underscores, see also the tools on the YAGO2 website), OR --NME--, denoting that there is no matching entity in YAGO2 for this particular mention, or that we are missing the connection between the mention string and the YAGO2 entity.
> - column 5 is the corresponding Wikipedia URL of the entity (added for convenience when evaluating against a Wikipedia based method)
> - column 6 is the corresponding Wikipedia ID of the entity (added for convenience when evaluating against a Wikipedia based method - the ID refers to the dump used for annotation, 2010-08-17)
> - column 7 is the corresponding Freebase mid, if there is one (thanks to Massimiliano Ciaramita from Google Zürich for creating the mapping and making it available to us)



In [3]:
import csv

# Parse the TSV row by row with csv.reader: per the format description, a row
# is either a lone token or a token followed by mention annotations, so rows
# have a varying number of fields. `df` ends up as a plain list of lists
# (one list of strings per parsed row), not a DataFrame.
#
# NOTE(review): csv.reader treats '"' as a quote character by default. If the
# file contains bare '"' tokens, consecutive rows may get merged into one
# field -- confirm against the data and consider quoting=csv.QUOTE_NONE.
#
# newline='' is what the csv module expects for file objects it reads, and the
# context manager guarantees the handle is closed once parsing is done.
with open('../../data/aida-conll-yago-dataset/AIDA-YAGO2-DATASET.tsv', newline='') as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    df = list(read_tsv)

In [4]:
# Number of tab-separated fields in the second parsed row (index 1 of `df`).
len(df[1])

4

**Note:** `wikipedia_ID` in ACY corresponds to `page_id` in KWNLP.

In [41]:
# Names for the seven positional columns, in the order given by the file
# format description above.
new = ['token', 'mention', 'full_mention', 'YAGO2', 'wikipedia_URL', 'wikipedia_ID', 'freebase']

# Build the frame from every parsed row except the first one, then map the
# default integer column labels (0..6) onto the descriptive names.
acy_df = pd.DataFrame(data=df[1:]).rename(columns=dict(enumerate(new)))

acy_df.head(50)

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase
0,EU,B,EU,--NME--,,,
1,rejects,,,,,,
2,German,B,German,Germany,http://en.wikipedia.org/wiki/Germany,11867.0,/m/0345h
3,call,,,,,,
4,to,,,,,,
5,boycott,,,,,,
6,British,B,British,United_Kingdom,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,/m/07ssc
7,lamb,,,,,,
8,.,,,,,,
9,,,,,,,


In [39]:
# Show only the rows whose full_mention is present (not missing).
has_full_mention = acy_df['full_mention'].notna()
acy_df.loc[has_full_mention]

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase
0,EU,B,EU,--NME--,,,
2,German,B,German,Germany,http://en.wikipedia.org/wiki/Germany,11867,/m/0345h
6,British,B,British,United_Kingdom,http://en.wikipedia.org/wiki/United_Kingdom,31717,/m/07ssc
10,Peter,B,Peter Blackburn,--NME--,,,
11,Blackburn,I,Peter Blackburn,--NME--,,,
...,...,...,...,...,...,...,...
176583,England,B,England,England_national_football_team,http://en.wikipedia.org/wiki/England_national_...,9904,/m/02pp1
176602,1966,B,1966 World Cup,1966_FIFA_World_Cup,http://en.wikipedia.org/wiki/1966_FIFA_World_Cup,61629,/m/0gpjr
176603,World,I,1966 World Cup,1966_FIFA_World_Cup,http://en.wikipedia.org/wiki/1966_FIFA_World_Cup,61629,/m/0gpjr
176604,Cup,I,1966 World Cup,1966_FIFA_World_Cup,http://en.wikipedia.org/wiki/1966_FIFA_World_Cup,61629,/m/0gpjr


In [40]:
# Show only the rows whose full_mention is missing (None/NaN), i.e. the rows
# that are not part of any mention.
acy_df[acy_df['full_mention'].isna()]

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase
1,rejects,,,,,,
3,call,,,,,,
4,to,,,,,,
5,boycott,,,,,,
7,lamb,,,,,,
...,...,...,...,...,...,...,...
176609,younger,,,,,,
176610,brother,,,,,,
176611,",",,,,,,
176613,.,,,,,,


In [7]:
# Total number of rows in acy_df.
acy_df.shape[0]

176615

In [8]:
# Share of all rows in acy_df whose full_mention is present.
# notna() is the pandas null check (treats both None and NaN as missing), and
# the mean of a boolean Series is the fraction of True values.
# Note: the denominator is every row of acy_df, including the all-empty rows
# that come from blank sentence-separator lines.
full_mention_share = acy_df['full_mention'].notna().mean()
print('{:.2f}% of them had a full mention matched'.format(full_mention_share * 100))


16.60% of them had a full mention matched


In [9]:
# Share of all rows in acy_df that carry a real YAGO2 entity: the value must be
# present and must not be the '--NME--' placeholder, which the format
# description defines as "no matching entity".
yago2_col = acy_df['YAGO2']
yago2_share = (yago2_col.notna() & yago2_col.ne('--NME--')).mean()
print('{:.2f}% of them had a yago2 entity matched'.format(yago2_share * 100))

12.60% of them had a yago2 entity matched


In [10]:
# NOTE(review): this cell repeats the YAGO2 computation of the preceding cell
# verbatim. It may have been meant to report a different column (wikipedia_ID
# is not reported by any of the neighbouring cells) -- confirm or delete.
#
# Share of all rows in acy_df that carry a real YAGO2 entity: the value must be
# present and must not be the '--NME--' placeholder.
yago2_col = acy_df['YAGO2']
yago2_share = (yago2_col.notna() & yago2_col.ne('--NME--')).mean()
print('{:.2f}% of them had a yago2 entity matched'.format(yago2_share * 100))

12.60% of them had a yago2 entity matched


In [11]:
# Share of all rows in acy_df whose wikipedia_URL is present.
# notna() is the pandas null check; the mean of the boolean mask is the
# fraction of rows that have a value.
wikipedia_url_share = acy_df['wikipedia_URL'].notna().mean()
print('{:.2f}% of them had a wikipedia page matched'.format(wikipedia_url_share * 100))

12.60% of them had a wikipedia page matched


In [12]:
# Share of all rows in acy_df whose freebase field is present.
# NOTE(review): this only checks for a missing value. If the file encodes
# "no Freebase mid" as an empty string rather than an absent field, such rows
# are still counted as matched -- verify against the data.
freebase_share = acy_df['freebase'].notna().mean()
print('{:.2f}% of them had a freebase mid matched'.format(freebase_share * 100))

12.60% of them had a freebase mid matched
