In [1]:
import pandas as pd
import numpy as np
import csv
import os

Charles University master thesis on using the dataset for Named Entity Recognition and Linking: [link](https://is.cuni.cz/webapps/zzp/download/120258144/?lang=cs)

In [2]:
mydir = '../../data/aida-conll-yago-dataset/'

In [3]:
tsv_file = open(os.path.join(mydir, "AIDA-YAGO2-DATASET.tsv"))
read_tsv = csv.reader(tsv_file, delimiter="\t")
df = []
for row in read_tsv:
    df.append(row)
acy_df = pd.DataFrame(data=df[1:])
new = ["token", "mention", "full_mention", "YAGO2", "wikipedia_URL", "wikipedia_ID", "freebase"]
acy_df = acy_df.rename(columns=dict(zip(range(7), new)))

In [4]:
acy_df.head(15)

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase
0,EU,B,EU,--NME--,,,
1,rejects,,,,,,
2,German,B,German,Germany,http://en.wikipedia.org/wiki/Germany,11867.0,/m/0345h
3,call,,,,,,
4,to,,,,,,
5,boycott,,,,,,
6,British,B,British,United_Kingdom,http://en.wikipedia.org/wiki/United_Kingdom,31717.0,/m/07ssc
7,lamb,,,,,,
8,.,,,,,,
9,,,,,,,


In [5]:
len(acy_df)

176615

Get the sentence number (textual unit, e.g. sentence, paragraph) for each row, using the fact that the token `None` is (nulltype) used as end of sentence marker

In [6]:
acy_df['unit_number'] = acy_df.token.isnull().cumsum()

In [7]:
acy_df[['token', 'full_mention', 'unit_number']].head(50)

Unnamed: 0,token,full_mention,unit_number
0,EU,EU,0
1,rejects,,0
2,German,German,0
3,call,,0
4,to,,0
5,boycott,,0
6,British,British,0
7,lamb,,0
8,.,,0
9,,,1


Drop `None` tokens now, as we no longer need them

In [8]:
acy_df = acy_df[acy_df.token.notnull()]
len(acy_df)

164758

Create single column that holds the full sentence for later use

In [9]:
acy_df['full_sentence'] = acy_df.groupby('unit_number')['token'].transform(' '.join)
acy_df.head(3)

Unnamed: 0,token,mention,full_mention,YAGO2,wikipedia_URL,wikipedia_ID,freebase,unit_number,full_sentence
0,EU,B,EU,--NME--,,,,0,EU rejects German call to boycott British lamb .
1,rejects,,,,,,,0,EU rejects German call to boycott British lamb .
2,German,B,German,Germany,http://en.wikipedia.org/wiki/Germany,11867.0,/m/0345h,0,EU rejects German call to boycott British lamb .


Function to spread rolling window of trailing and leading $W$ mentions and their linked entities *within* the same unit of text. I.e. the first mention in sentence two does not include the last mention in sentence one as its trailing mention (rather, its trailing is `None`). 

In [10]:
def spread_unit_neighbours(raw_df, window_size=1):
    
    df = raw_df.copy()

    for w in range(window_size, 0, -1):
        df['m-'+str(w)] = None
        df.loc[df.full_mention.notnull(), 'm-'+str(w)] = df.loc[df.full_mention.notnull(), 'full_mention'].groupby(df['unit_number']).shift(+w)
        df['e-'+str(w)] = None
        df.loc[df.full_mention.notnull(), 'e-'+str(w)] = df.loc[df.full_mention.notnull(), 'wikipedia_ID'].groupby(df['unit_number']).shift(+w)
    for w in range(1, window_size+1):
        df['m+'+str(w)] = None
        df.loc[df.full_mention.notnull(), 'm+'+str(w)] = df.loc[df.full_mention.notnull(), 'full_mention'].groupby(df['unit_number']).shift(-w)
        df['e+'+str(w)] = None
        df.loc[df.full_mention.notnull(), 'e+'+str(w)] = df.loc[df.full_mention.notnull(), 'wikipedia_ID'].groupby(df['unit_number']).shift(-w) 
        
    drop_columns = ['mention', 'YAGO2', 'wikipedia_URL', 'freebase']
    df.drop(drop_columns, axis=1, inplace=True)
    
    return df

In [11]:
acy_df_neighbours = spread_unit_neighbours(acy_df, 2)
acy_df_neighbours.head(50)

Unnamed: 0,token,full_mention,wikipedia_ID,unit_number,full_sentence,m-2,e-2,m-1,e-1,m+1,e+1,m+2,e+2
0,EU,EU,,0,EU rejects German call to boycott British lamb .,,,,,German,11867.0,British,31717.0
1,rejects,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
2,German,German,11867.0,0,EU rejects German call to boycott British lamb .,,,EU,,British,31717.0,,
3,call,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
4,to,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
5,boycott,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
6,British,British,31717.0,0,EU rejects German call to boycott British lamb .,EU,,German,11867.0,,,,
7,lamb,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
8,.,,,0,EU rejects German call to boycott British lamb .,,,,,,,,
10,Peter,Peter Blackburn,,1,Peter Blackburn,,,,,Peter Blackburn,,,
