This notebook prepares named entity data used for training translation models. The source and target datasets are identical, so that the models learn that names of places, people etc should be passed through to the output unchanged.

In [93]:
import numpy as np
import os
from IPython import display

In [70]:
!rm -f wikigold.conll.txt wnut17train.conll emerging.test.annotated
!wget https://github.com/juand-r/entity-recognition-datasets/raw/master/data/wikigold/CONLL-format/data/wikigold.conll.txt
!wget https://github.com/juand-r/entity-recognition-datasets/raw/master/data/WNUT17/CONLL-format/data/train/wnut17train.conll
!wget https://github.com/juand-r/entity-recognition-datasets/raw/master/data/WNUT17/CONLL-format/data/test/emerging.test.annotated
display.clear_output()

In [71]:
def file_to_list(path):
    with open(path) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        return lines
    
lines = (file_to_list('wikigold.conll.txt')
         + file_to_list('wnut17train.conll')
         + file_to_list('emerging.test.annotated')
        )
lines = [l.replace('\t', ' ') for l in lines]

In [76]:
entities = []
current_entity = []

for line in lines:
    if line:
        text, label = line.split(' ')
        if label == 'O':
            if current_entity:
                entities.append(' '.join(current_entity))
            current_entity = []
        else:
            current_entity.append(text)

# De-duplicate
entities = list(set(entities))      

# Randomly lower-case some of the examples
indices = np.random.choice(
    np.arange(len(entities)), int(len(entities) * 0.25), replace=False)
for i in indices:
    entities[i] = entities[i].lower()

print(f'{len(entities)} distinct named entities.')

4713 distinct named entities.


In [112]:
en_mul_dir = 'v7-dataset/v7.0/supervised/en-mul'
mul_en_dir = 'v7-dataset/v7.0/supervised/mul-en'

entities_no_prefix = []
entities_with_prefix = []
for code in language_codes:
    entities_with_prefix += [f'>>{code}<< {text}' for text in entities]
    entities_no_prefix += entities

# mul-en: source and target files are the same.
src_path = os.path.join(mul_en_dir, 'named_entities.src')
tgt_path = os.path.join(mul_en_dir, 'named_entities.tgt')

with open(src_path, "w") as f:
    f.writelines('\n'.join(entities_no_prefix))
with open(tgt_path, "w") as f:
    f.writelines('\n'.join(entities_no_prefix))
    
# en-mul: source files have language prefix codes.
src_path = os.path.join(en_mul_dir, 'named_entities.src')
tgt_path = os.path.join(en_mul_dir, 'named_entities.tgt')

with open(src_path, "w") as f:
    f.writelines('\n'.join(entities_with_prefix))
with open(tgt_path, "w") as f:
    f.writelines('\n'.join(entities_no_prefix))    