# Named Entity Disambiguation
## 1. Preparation
First we import everything that we need.

In [1]:
# !pip install -U numpy
import numpy as np
import math
import pandas as pd
import os
import spacy



We then define all file names used for input and output.

In [2]:
# --- directory layout of the Kaggle environment ---
MAIN_DIR = '/kaggle'
INPUT_DIR = f'{MAIN_DIR}/input'
WORKING_DIR = f'{MAIN_DIR}/working'
DATA_DIR = f'{INPUT_DIR}/dis-project-3-named-entity-disambiguation'
WIKI_LITE = f'{DATA_DIR}/wiki_lite'

# --- competition input files (directly inside DATA_DIR) ---
TEST_DATA = f'{DATA_DIR}/test.csv'
TRAIN_DATA = f'{DATA_DIR}/train.csv'
SAMPLE_SUBMISSION = f'{DATA_DIR}/sample_submission.csv'

# intermediate file kept in the writable working directory
PREPROCESSED_LABELS = f'{WORKING_DIR}/preprocessed_labels.txt'

# --- wiki-lite input files (inside WIKI_LITE) ---
REDIRECTS = f'{WIKI_LITE}/enwiki_redirects.tsv'
ALIASES = f'{WIKI_LITE}/item_aliases.csv'
PROPERTIES = f'{WIKI_LITE}/property.csv'
STATEMENTS = f'{WIKI_LITE}/statements.csv'
ITEMS = f'{WIKI_LITE}/wiki_items.csv'

# --- output files (written to WORKING_DIR) ---
EMBEDDINGS = f'{WORKING_DIR}/embeddings.csv'
SUBMISSION = f'{WORKING_DIR}/submission.csv'

Next, we read all data from the input files and parse it into a usable format.

In [3]:
%%time
# Load every input table into a DataFrame (same read order as listed below).

# competition files: plain comma-separated CSVs
testdata_raw, traindata_raw, samplesubmission_raw = map(
    pd.read_csv, (TEST_DATA, TRAIN_DATA, SAMPLE_SUBMISSION)
)

# wiki-lite redirects: the only tab-separated file
redirect_raw = pd.read_csv(REDIRECTS, sep='\t')

# remaining wiki-lite tables: comma-separated
alias_raw, properties_raw, statements_raw, items_raw = map(
    pd.read_csv, (ALIASES, PROPERTIES, STATEMENTS, ITEMS)
)

# Note: takes around 55s

CPU times: user 28.8 s, sys: 2.73 s, total: 31.5 s
Wall time: 45.6 s


In [4]:
%%time
# parse inputs into better formats

# First column of the redirects table -> second column.
# Used by redirect() to map a title to its redirect target.
redirect_data = {source: target for source, target in redirect_raw.values.tolist()}

# Lower-cased name -> list of item ids that carry this name
# (as alias, label or title).
alias_data = dict()


def _alias_key(value):
    """Return the lower-cased lookup key for *value*, or None if it is missing.

    pandas represents an empty CSV cell as float NaN. Converting that with
    ``str()`` would yield the literal text 'nan' and lump every item with a
    missing name under one bogus alias, so missing values are reported as
    None and must be skipped by the caller.
    """
    if pd.isna(value):
        return None
    return str(value).lower()


for idx, name in alias_raw.values.tolist():
    key = _alias_key(name)
    if key is not None:
        alias_data.setdefault(key, []).append(idx)

# Item id -> {'label', 'title', 'description'}, all stored as strings.
item_data = dict()
for idx, label, desc, title in items_raw.values.tolist():
    # add to item data
    item_data[idx] = {'label': str(label), 'title': str(title), 'description': str(desc)}

    # add label alias (appended after any existing entries)
    label_key = _alias_key(label)
    if label_key is not None:
        alias_data.setdefault(label_key, []).append(idx)

    # add title alias: prepended, so items matched by title come before
    # items matched only by label/alias. insert(0, ...) avoids copying the
    # whole list on every prepend.
    title_key = _alias_key(title)
    if title_key is not None:
        alias_data.setdefault(title_key, []).insert(0, idx)

# Note: takes around 54s

CPU times: user 42.4 s, sys: 3 s, total: 45.4 s
Wall time: 45.3 s


## 3. Testing & Creating Submission File

---

### Idea 2 - with Train Data as input

In [5]:
%%time
# Additionally reads and processes the train data.
# It is then used with a higher priority (i.e. if a mention appears in the train data,
#  its URL is chosen over a random title).

from tqdm import tqdm
from tqdm.auto import tqdm # For the progress bar

# Lower-cased full mention -> wiki URL, taken from the train rows.
# When the same mention occurs several times, the last row wins.
mention_to_url = dict()
for _, __, ___, full_mention, wiki_url in tqdm(traindata_raw.values.tolist()):
    # Rows where either field is a float are ignored
    # (presumably NaN from empty CSV cells - TODO confirm).
    both_present = not isinstance(full_mention, float) and not isinstance(wiki_url, float)
    if both_present:
        mention_to_url[full_mention.lower()] = wiki_url

print(len(mention_to_url))
# _ = [print(key, ':', value) for idx, (key, value) in enumerate(mention_to_url.items()) if idx < 100]

  0%|          | 0/218505 [00:00<?, ?it/s]

7550
CPU times: user 296 ms, sys: 21.1 ms, total: 317 ms
Wall time: 314 ms


In [6]:
%%time
def redirect(s):
    """Return the entry stored for *s* in ``redirect_data``, or *s* itself if there is none."""
    return redirect_data.get(s, s)

def wiki_url_from_title(s):
    """Build the English-Wikipedia URL for title *s* (spaces become underscores)."""
    page_name = '_'.join(s.split(' '))
    return f'http://en.wikipedia.org/wiki/{page_name}'


def getUrl(token, mention):
    """Resolve *mention* to a Wikipedia URL, or return ``"NOT_FOUND"``.

    Lookup order (all lookups use the lower-cased mention):

    1. ``mention_to_url`` (built from the train data): the stored URL is
       returned unchanged.
    2. ``alias_data``: among the candidate items, the one with the shortest
       title is taken (first candidate wins on ties), passed through
       ``redirect()`` and turned into a URL.

    Args:
        token: Unused; kept so existing callers keep working.
        mention: The full mention text. A non-string value (e.g. the float
            NaN pandas produces for an empty cell) cannot be looked up and
            yields ``"NOT_FOUND"`` instead of raising ``AttributeError``.

    Returns:
        The URL string, or ``"NOT_FOUND"`` when nothing matches.
    """
    if not isinstance(mention, str):
        return "NOT_FOUND"

    key = mention.lower()

    # Checks the train data
    if key in mention_to_url:
        return mention_to_url[key]

    possible_indices = alias_data.get(key, [])
    if not possible_indices:
        return "NOT_FOUND"

    # TODO: which index is "the best"?
    # min() returns the first minimal element, i.e. the same title a stable
    # sort by length would put at position 0, without sorting everything.
    title = min((item_data[idx]['title'] for idx in possible_indices), key=len)

    return wiki_url_from_title(redirect(title))

# Build the (id, url) pairs for the submission file.
# Only rows whose url column is '?' are looked up; every other row is
# recorded as "NOT_FOUND".
submission = []
for idx, token, tag, mention, url in testdata_raw.values.tolist():
    predicted = getUrl(token, mention) if url == '?' else "NOT_FOUND"
    submission.append((idx, predicted))

print(f"{len(testdata_raw.values) = }")

len(testdata_raw.values) = 104890
CPU times: user 124 ms, sys: 11.9 ms, total: 136 ms
Wall time: 135 ms


In [7]:
# create submission
def generate_submission(submission_data) -> None:
    """Write the predictions to ``SUBMISSION`` as a two-column CSV.

    The file starts with the header ``id,wiki_url``; each following line is
    ``<id>,"<url>"``.

    Args:
        submission_data: Iterable of ``(id, url)`` pairs.

    Notes:
        * A double quote inside a URL is doubled (``"`` -> ``""``), the
          standard CSV escape, so such a URL no longer breaks the row.
        * The file is written as UTF-8 regardless of the locale, because
          URLs built from titles may contain non-ASCII characters.
        * ``with`` guarantees the file is closed even if a write fails.
    """
    with open(SUBMISSION, 'w', encoding='utf-8') as f:
        f.write("id,wiki_url\n")
        for idx, url in submission_data:
            escaped_url = str(url).replace('"', '""')
            f.write(f'{idx},"{escaped_url}"\n')
# Write the (id, url) pairs collected in `submission` to the SUBMISSION file.
generate_submission(submission)