In [31]:
! pip install nltk



In [1]:
import nltk
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9 and later load it from
# the 'punkt_tab' resource instead of 'punkt', so fetch both to keep the notebook
# working regardless of which NLTK version pip installed.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\odaim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
import json
import os
from unidecode import unidecode

from nltk.tokenize import word_tokenize

In [29]:
# Abbreviations whose trailing period is removed before tokenizing. The same
# replacements are applied to the sentence and to every mention so that both
# sides tokenize the same way (presumably to keep the tokenizer from splitting
# the final period off differently in the two contexts).
_ABBREVIATION_FIXES = (('Jr.', 'Jr'), ('U.S.A.', 'U.S.A'), ('P.M.', 'P.M'))


def _strip_abbreviation_periods(text):
    """Apply every replacement in ``_ABBREVIATION_FIXES`` to ``text``."""
    for old, new in _ABBREVIATION_FIXES:
        text = text.replace(old, new)
    return text


def _find_span(tokens, mention):
    """Return the inclusive ``(start, end)`` of the first contiguous run of
    ``mention`` inside ``tokens``, or ``None`` when there is no such run."""
    width = len(mention)
    if width == 0:
        return None
    for start in range(len(tokens) - width + 1):
        if tokens[start:start + width] == mention:
            return start, start + width - 1
    return None


def _locate_mention(text, tokens, ascii_tokens):
    """Find the token span of the mention ``text`` in the sentence.

    The mention is tokenized both as written and after ``unidecode``; each
    form is searched in the sentence tokens and in their transliterated
    counterparts, so an accent present on only one side does not prevent a
    match. Raises ``ValueError`` when no contiguous match exists.
    """
    cleaned = _strip_abbreviation_periods(text)
    candidates = [word_tokenize(cleaned)]
    transliterated = word_tokenize(unidecode(cleaned))
    if transliterated != candidates[0]:
        candidates.append(transliterated)
    for mention in candidates:
        for haystack in (tokens, ascii_tokens):
            span = _find_span(haystack, mention)
            if span is not None:
                return span
    raise ValueError(
        'mention {!r} not found as a contiguous span in {!r}'.format(text, tokens)
    )


def normalize_nyt_sample(sample):
    """Convert one raw NYT record into the span-based format.

    Args:
        sample: JSON text of a single record. The keys read are ``sentText``,
            ``articleId``, ``entityMentions`` (items with ``text`` and
            ``label``) and ``relationMentions`` (items with ``em1Text``,
            ``em2Text`` and ``label``).

    Returns:
        A dict with the keys
        ``doc_key``   - the record's ``articleId``;
        ``sentences`` - a one-element list holding the sentence tokens;
        ``ner``       - ``[start, end, Label]`` per entity mention;
        ``relations`` - ``[src_start, src_end, tgt_start, tgt_end, LABEL]``
                        per relation mention;
        ``clusters``  - always an empty list.
        All indices are inclusive positions in the token list.

    Raises:
        ValueError: a mention cannot be located as a contiguous token span
            (``json.JSONDecodeError`` for malformed JSON is a subclass).
        KeyError: a required key is missing from the record.
    """
    data = json.loads(sample)
    tokens = word_tokenize(_strip_abbreviation_periods(data['sentText']))
    # Transliterated per token (never re-tokenized) so positions stay aligned
    # with ``tokens``.
    ascii_tokens = [unidecode(token) for token in tokens]

    norm = {
        'doc_key': data['articleId'],
        'sentences': [tokens],
        'ner': [],
        'relations': [],
        'clusters': [],
    }

    for entity in data['entityMentions']:
        start, end = _locate_mention(entity['text'], tokens, ascii_tokens)
        norm['ner'].append([start, end, entity['label'].title()])

    for relation in data['relationMentions']:
        # Keep only the last path component: '/a/b/place_lived' -> 'PLACE-LIVED'.
        label = relation['label'].split('/')[-1].replace('_', '-').upper()
        src_start, src_end = _locate_mention(relation['em1Text'], tokens, ascii_tokens)
        tgt_start, tgt_end = _locate_mention(relation['em2Text'], tokens, ascii_tokens)
        norm['relations'].append([src_start, src_end, tgt_start, tgt_end, label])

    return norm

In [30]:
# Smoke test: one raw NYT record (a JSON object on a single line) is passed
# through normalize_nyt_sample; the cell displays the returned dict.
samp = """{"sentText": "Was it just last month that Wal-Mart 's chief executive , H. Lee Scott Jr. , said his company would be a kinder , gentler corporate citizen and never again bulldoze a local government to let it open more stores ?", "articleId": "/m/vinci8/data1/riedel/projects/relation/kb/nyt1/docstore/nyt-2005-2006.backup/1674506.xml.pb", "relationMentions": [{"em1Text": "H. Lee Scott Jr.", "em2Text": "Wal-Mart", "label": "/business/person/company"}], "entityMentions": [{"start": 0, "label": "ORGANIZATION", "text": "Wal-Mart"}, {"start": 1, "label": "PERSON", "text": "H. Lee Scott Jr."}], "sentId": "1"}"""
normalize_nyt_sample(samp)

{'doc_key': '/m/vinci8/data1/riedel/projects/relation/kb/nyt1/docstore/nyt-2005-2006.backup/1674506.xml.pb',
 'sentences': [['Was',
   'it',
   'just',
   'last',
   'month',
   'that',
   'Wal-Mart',
   "'s",
   'chief',
   'executive',
   ',',
   'H.',
   'Lee',
   'Scott',
   'Jr',
   ',',
   'said',
   'his',
   'company',
   'would',
   'be',
   'a',
   'kinder',
   ',',
   'gentler',
   'corporate',
   'citizen',
   'and',
   'never',
   'again',
   'bulldoze',
   'a',
   'local',
   'government',
   'to',
   'let',
   'it',
   'open',
   'more',
   'stores',
   '?']],
 'ner': [[6, 6, 'Organization'], [11, 14, 'Person']],
 'relations': [[11, 14, 6, 6, 'COMPANY']],
 'clusters': []}

In [35]:
# Folder the NYT split files are read from and written to. It is built from the
# current working directory, so the notebook has to be started from the folder
# that contains other_data/.
nyt_data_dir = os.getcwd() + '/other_data/nyt_er_dataset/'

def write_normal_data(in_dir, out_dir, overwrite=False):
    """Normalize a JSON-lines file sample by sample and write JSON lines.

    Each non-blank line of the input is passed to ``normalize_nyt_sample`` and
    the result is written as one JSON object per line. Processing stops at the
    first line that cannot be normalized; that line and the reason are printed.

    Args:
        in_dir: path of the input file (a file path, despite the name).
        out_dir: path of the output file (a file path, despite the name).
        overwrite: when False (the default) results are appended to
            ``out_dir``, so running the same conversion twice duplicates every
            sample; pass True to truncate the file first.
    """
    out_mode = 'w' if overwrite else 'a'
    # Be explicit about UTF-8 (the encoding the JSON spec uses for interchange)
    # instead of relying on the platform default, which is cp1252 on Windows.
    with open(in_dir, encoding='utf-8') as raw, \
            open(out_dir, out_mode, encoding='utf-8') as normalized:
        for line_no, line in enumerate(raw, start=1):
            if not line.strip():
                # A blank line carries no sample; it should not abort the run.
                continue
            try:
                maped_sample = normalize_nyt_sample(line)
            except Exception as err:
                print(line)
                print('Stopped at line {}: {!r}'.format(line_no, err))
                break
            normalized.write(json.dumps(maped_sample) + "\n")

In [None]:
# Training split: read train.json and write its normalized samples to
# norm_train.json in the same folder.
nyt_train_data_path = nyt_data_dir + 'train.json'
nyt_train_norm_data_path = nyt_data_dir + 'norm_train.json'
            
write_normal_data(nyt_train_data_path, nyt_train_norm_data_path)

In [36]:
# Validation split: read valid.json and write its normalized samples to
# norm_valid.json in the same folder.
nyt_valid_data_path = nyt_data_dir + 'valid.json'
nyt_valid_norm_data_path = nyt_data_dir + 'norm_valid.json'

write_normal_data(nyt_valid_data_path, nyt_valid_norm_data_path)

In [37]:
# Test split: read test.json and write its normalized samples to
# norm_test.json in the same folder.
nyt_test_data_path = nyt_data_dir + 'test.json'
nyt_test_norm_data_path = nyt_data_dir + 'norm_test.json'

write_normal_data(nyt_test_data_path, nyt_test_norm_data_path)