In [1]:
import json
from tqdm import tqdm

In [2]:
def filter_label(original_label, mapping_dict):
    if mapping_dict[original_label]:
        return [original_label]
    else:
        return []
def filter_all_labels(original_labels, mapping_dict):
    new_labels = []
    for t in original_labels:
        new_labels.extend(filter_label(t, mapping_dict))
    return new_labels

In [8]:
def filter_data(data, mapping):
    N = 3
    i = 0
    new_data = []
    for l in tqdm(data):
        new_types = [ty for t in l['y_str'] for ty in mapping[t] if t in mapping]
        new_line = {k:v for k, v in l.items() if k != 'y_str'}
        new_line['y_str'] = new_types
        new_line['original_types'] = l['y_str']
        new_line['original_types_only_mapped'] = filter_all_labels(l['y_str'], mapping)
        if len(new_types) > 0:
            new_data.append(new_line)
            if i < N:
                print('{}, {}'.format(new_types, l['y_str']))
                i +=1
    return new_data

# Filter BBN

In [5]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/BBN/BBN/train_partitioned.json', 'r') as inp:
    train_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/BBN/BBN/dev_partitioned.json', 'r') as inp:
    dev_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/BBN/BBN/test_tree.json', 'r') as inp:
    test_lines = [json.loads(t) for t in inp.readlines()]

In [6]:
from collections import defaultdict

bbn_mappings = {'FIGER': defaultdict(list), 'choi': defaultdict(list), 'OntoNotes': defaultdict(list)}

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/BBN_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')
        if splitted[1] != '-':
            bbn_mappings['OntoNotes'][splitted[0]].append(splitted[1])
        if splitted[2] != '-':
            bbn_mappings['FIGER'][splitted[0]].append(splitted[2])
        if splitted[3] != '-':
            bbn_mappings['choi'][splitted[0]].append(splitted[3])

In [7]:
bbn_mappings

{'FIGER': defaultdict(list,
             {'/ANIMAL': ['/livingthing/animal'],
              '/DISEASE': ['/disease'],
              '/EVENT': ['/event'],
              '/EVENT/HURRICANE': ['/event/natural_disaster'],
              '/EVENT/WAR': ['/event/military_conflict'],
              '/FACILITY': ['/location'],
              '/FACILITY/AIRPORT': ['/building/airport'],
              '/FACILITY/BRIDGE': ['/location/bridge'],
              '/FACILITY/BUILDING': ['/building'],
              '/FACILITY/HIGHWAY_STREET': ['/transportation/road'],
              '/GAME': ['/game'],
              '/GPE/CITY': ['/location/city'],
              '/GPE/COUNTRY': ['/location/country'],
              '/GPE/STATE_PROVINCE': ['/location/province'],
              '/LANGUAGE': ['/language'],
              '/LAW': ['/law'],
              '/LOCATION': ['/location'],
              '/LOCATION/LAKE_SEA_OCEAN': ['/location/body_of_water'],
              '/LOCATION/RIVER': ['/location/body_of_water'],
      

## BBN into FIGER

In [8]:
train_bbn_into_figer = filter_data(data=train_lines, mapping=bbn_mappings['FIGER'])

['/livingthing', '/living_thing', '/product', '/food'], ['/PLANT', '/PRODUCT', '/SUBSTANCE', '/SUBSTANCE/FOOD']
['/organization/company', '/art', '/organization', '/written_work'], ['/ORGANIZATION/CORPORATION', '/WORK_OF_ART', '/ORGANIZATION', '/WORK_OF_ART/BOOK']
['/disease'], ['/DISEASE']


In [10]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_BBN_into_FIGER.json', 'w') as out:
    for l in tqdm(train_bbn_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 79340/79340 [00:06<00:00, 12038.08it/s]


In [11]:
dev_bbn_into_figer = filter_data(data=dev_lines, mapping=bbn_mappings['FIGER'])

['/person'], ['/PERSON']
['/location/country', '/location/city', '/location'], ['/GPE/COUNTRY', '/GPE/CITY', '/GPE', '/LOCATION']
['/organization/company', '/organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [12]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_BBN_into_FIGER.json', 'w') as out:
    for l in tqdm(dev_bbn_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1615/1615 [00:00<00:00, 13449.50it/s]


In [13]:
test_bbn_into_figer = filter_data(data=test_lines, mapping=bbn_mappings['FIGER'])

['/person'], ['/PERSON']
['/person'], ['/PERSON']
['/organization/company', '/organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [14]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_BBN_into_FIGER.json', 'w') as out:
    for l in tqdm(test_bbn_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 11811/11811 [00:00<00:00, 12409.79it/s]


In [15]:
print('BBN into FIGER')
print("original_dev: {}, original_test: {}, mapped_dev:{}, mapped_test:{}".format(len(dev_lines), 
                                                                                    len(test_lines),
                                                                                    len(dev_bbn_into_figer),
                                                                                    len(test_bbn_into_figer)))

BBN into FIGER
original_dev: 1721, original_test: 12349, mapped_dev:1615, mapped_test:11811


In [23]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_BBN_into_FIGER.json', 'w') as out:
    for l in new_test:
        json.dump(l, out)
        out.write('\n')

## BBN into OntoNotes

In [16]:
train_bbn_into_onto = filter_data(data=train_lines, mapping=bbn_mappings['OntoNotes'])

['/other/living_thing', '/other/product', '/other/food'], ['/PLANT', '/PRODUCT', '/SUBSTANCE', '/SUBSTANCE/FOOD']
['/organization/company', '/other/art', '/organization', '/other/art/writing'], ['/ORGANIZATION/CORPORATION', '/WORK_OF_ART', '/ORGANIZATION', '/WORK_OF_ART/BOOK']
['/other/health/malady'], ['/DISEASE']


In [17]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_BBN_into_Ontonotes.json', 'w') as out:
    for l in tqdm(train_bbn_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 80877/80877 [00:06<00:00, 13317.06it/s]


In [18]:
dev_bbn_into_onto = filter_data(data=dev_lines, mapping=bbn_mappings['OntoNotes'])

['/person'], ['/PERSON']
['/location/country', '/location/city', '/location'], ['/GPE/COUNTRY', '/GPE/CITY', '/GPE', '/LOCATION']
['/organization/company', '/organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [19]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_BBN_into_Ontonotes.json', 'w') as out:
    for l in tqdm(dev_bbn_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1645/1645 [00:00<00:00, 12278.95it/s]


In [21]:
test_bbn_into_onto = filter_data(data=test_lines, mapping=bbn_mappings['OntoNotes'])

['/person'], ['/PERSON']
['/person'], ['/PERSON']
['/organization/company', '/organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [22]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_BBN_into_Ontonotes.json', 'w') as out:
    for l in tqdm(test_bbn_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 11483/11483 [00:00<00:00, 12314.74it/s]


In [15]:
print('BBN into FIGER')
print("original_dev: {}, original_test: {}, mapped_dev:{}, mapped_test:{}".format(len(dev_lines), 
                                                                                    len(test_lines),
                                                                                    len(dev_bbn_into_figer),
                                                                                    len(test_bbn_into_figer)))

BBN into FIGER
original_dev: 1721, original_test: 12349, mapped_dev:1615, mapped_test:11811


## BBN into Choi

In [23]:
train_bbn_into_choi = filter_data(data=train_lines, mapping=bbn_mappings['choi'])

['plant', 'product', 'substance', 'food'], ['/PLANT', '/PRODUCT', '/SUBSTANCE', '/SUBSTANCE/FOOD']
['corporation', 'organization', 'book'], ['/ORGANIZATION/CORPORATION', '/WORK_OF_ART', '/ORGANIZATION', '/WORK_OF_ART/BOOK']
['disease'], ['/DISEASE']


In [24]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_BBN_into_choi.json', 'w') as out:
    for l in tqdm(train_bbn_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 83198/83198 [00:06<00:00, 13574.45it/s]


In [25]:
dev_bbn_into_choi = filter_data(data=dev_lines, mapping=bbn_mappings['choi'])

['person'], ['/PERSON']
['country', 'city', 'location'], ['/GPE/COUNTRY', '/GPE/CITY', '/GPE', '/LOCATION']
['corporation', 'organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [26]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_BBN_into_choi.json', 'w') as out:
    for l in tqdm(dev_bbn_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1699/1699 [00:00<00:00, 12536.63it/s]


In [28]:
test_bbn_into_choi = filter_data(data=test_lines, mapping=bbn_mappings['choi'])

['person'], ['/PERSON']
['person'], ['/PERSON']
['corporation', 'organization'], ['/ORGANIZATION/CORPORATION', '/ORGANIZATION']


In [29]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_BBN_into_choi.json', 'w') as out:
    for l in tqdm(test_bbn_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 12156/12156 [00:00<00:00, 12407.37it/s]


In [14]:
print('BBN into choi')
print("original_dev: {}, original_test: {}, mapped_dev:{}, mapped_test:{}".format(len(dev_lines), 
                                                                                    len(test_lines),
                                                                                    len(new_dev),
                                                                                    len(new_test)))

BBN into choi
original_dev: 1721, original_test: 12349, mapped_dev:1699, mapped_test:12156


# Filter OntoNotes

In [30]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_train_tree.json', 'r') as inp:
    train_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_dev_tree.json', 'r') as inp:
    dev_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_test_tree_lines.json', 'r') as inp:
    test_lines = [json.loads(t) for t in inp.readlines()]

In [31]:
from collections import defaultdict

ontonotes_mappings = {'FIGER': defaultdict(list), 'choi': defaultdict(list), 'BBN': defaultdict(list)}

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/OntoNotes_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')
        if splitted[1] != '-':
            ontonotes_mappings['BBN'][splitted[0]].append(splitted[1])
        if splitted[2] != '-':
            ontonotes_mappings['FIGER'][splitted[0]].append(splitted[2])
        if splitted[3] != '-':
            ontonotes_mappings['choi'][splitted[0]].append(splitted[3])

In [32]:
ontonotes_mappings

{'BBN': defaultdict(list,
             {'/location': ['/LOCATION'],
              '/location/city': ['/GPE/CITY'],
              '/location/country': ['/GPE/COUNTRY'],
              '/location/structure/airport': ['/FACILITY/AIRPORT'],
              '/location/structure/government': ['/ORGANIZATION/GOVERNMENT'],
              '/location/structure/hospital': ['/ORGANIZATION/HOSPITAL'],
              '/location/structure/hotel': ['/ORGANIZATION/HOTEL'],
              '/location/transit/bridge': ['/FACILITY/BRIDGE'],
              '/organization': ['/ORGANIZATION'],
              '/organization/company': ['/ORGANIZATION/CORPORATION'],
              '/organization/education': ['/ORGANIZATION/EDUCATIONAL'],
              '/organization/government': ['/ORGANIZATION/GOVERNMENT'],
              '/organization/political_party': ['/ORGANIZATION/POLITICAL'],
              '/other/art': ['/WORK_OF_ART'],
              '/other/art/film': ['/WORK_OF_ART/PLAY'],
              '/other/art/music': ['/W

## Ontonotes into BBN

In [34]:
train_onto_into_bbn = filter_data(data=train_lines, mapping=ontonotes_mappings['BBN'])

['/PERSON'], ['/person/title', '/person']
['/PERSON'], ['/person/athlete', '/person']
['/PERSON'], ['/person/athlete', '/person']


In [35]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_Onto_into_BBN.json', 'w') as out:
    for l in tqdm(train_onto_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 229362/229362 [00:20<00:00, 11350.70it/s]


In [36]:
dev_onto_into_bbn = filter_data(data=dev_lines, mapping=ontonotes_mappings['BBN'])

['/LOCATION', '/GPE/COUNTRY'], ['/location', '/location/country']
['/LOCATION', '/ORGANIZATION', '/ORGANIZATION/GOVERNMENT'], ['/location', '/location/structure', '/organization', '/organization/government']
['/LOCATION', '/ORGANIZATION'], ['/location', '/location/structure', '/organization']


In [37]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_Onto_into_BBN.json', 'w') as out:
    for l in tqdm(dev_onto_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 994/994 [00:00<00:00, 5265.32it/s]


In [38]:
test_onto_into_bbn = filter_data(data=test_lines, mapping=ontonotes_mappings['BBN'])

['/ORGANIZATION', '/ORGANIZATION/CORPORATION'], ['/organization', '/organization/company']
['/ORGANIZATION', '/ORGANIZATION/CORPORATION'], ['/organization', '/organization/company']
['/ORGANIZATION', '/ORGANIZATION/CORPORATION'], ['/organization', '/organization/company']


In [39]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_Onto_into_BBN.json', 'w') as out:
    for l in tqdm(test_onto_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 4261/4261 [00:00<00:00, 6626.74it/s]


## Ontonotes into FIGER

In [41]:
train_onto_into_figer = filter_data(data=train_lines, mapping=ontonotes_mappings['FIGER'])

['/body_part'], ['/other', '/other/body_part']
['/title', '/person'], ['/person/title', '/person']
['/person/athlete', '/person'], ['/person/athlete', '/person']


In [42]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_Onto_into_figer.json', 'w') as out:
    for l in tqdm(train_onto_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 243909/243909 [00:21<00:00, 11327.95it/s]


In [43]:
dev_onto_into_figer = filter_data(data=dev_lines, mapping=ontonotes_mappings['FIGER'])

['/location', '/location/country'], ['/location', '/location/country']
['/location', '/organization', '/government'], ['/location', '/location/structure', '/organization', '/organization/government']
['/location', '/organization'], ['/location', '/location/structure', '/organization']


In [44]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_Onto_into_figer.json', 'w') as out:
    for l in tqdm(dev_onto_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1052/1052 [00:00<00:00, 4783.65it/s]


In [45]:
test_onto_into_figer = filter_data(data=test_lines, mapping=ontonotes_mappings['FIGER'])

['/organization', '/organization/company'], ['/organization', '/organization/company']
['/organization', '/organization/company'], ['/organization', '/organization/company']
['/organization', '/organization/company'], ['/organization', '/organization/company']


In [46]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_Onto_into_figer.json', 'w') as out:
    for l in tqdm(test_onto_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 4475/4475 [00:00<00:00, 6779.21it/s]


## Onto into Choi

In [47]:
train_onto_into_choi = filter_data(data=train_lines, mapping=ontonotes_mappings['choi'])

['body_part'], ['/other', '/other/body_part']
['title', 'person'], ['/person/title', '/person']
['athlete', 'person'], ['/person/athlete', '/person']


In [48]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_Onto_into_choi.json', 'w') as out:
    for l in tqdm(train_onto_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 249244/249244 [00:21<00:00, 11599.67it/s]


In [49]:
dev_onto_into_choi = filter_data(data=dev_lines, mapping=ontonotes_mappings['choi'])

['location', 'country'], ['/location', '/location/country']
['location', 'structure', 'organization', 'government'], ['/location', '/location/structure', '/organization', '/organization/government']
['location', 'structure', 'organization'], ['/location', '/location/structure', '/organization']


In [50]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_Onto_into_choi.json', 'w') as out:
    for l in tqdm(dev_onto_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1054/1054 [00:00<00:00, 5495.68it/s]


In [51]:
test_onto_into_choi = filter_data(data=test_lines, mapping=ontonotes_mappings['choi'])

['organization', 'company'], ['/organization', '/organization/company']
['organization', 'company'], ['/organization', '/organization/company']
['organization', 'company'], ['/organization', '/organization/company']


In [52]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_Onto_into_choi.json', 'w') as out:
    for l in tqdm(test_onto_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 4480/4480 [00:00<00:00, 6711.63it/s]


# Filter FIGER

In [4]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/FIGER/train_partitioned.json', 'r') as inp:
    train_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/FIGER/dev_partitioned.json', 'r') as inp:
    dev_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/FIGER/test_tree.json', 'r') as inp:
    test_lines = [json.loads(t) for t in inp.readlines()]

In [5]:
from collections import defaultdict

figer_mappings = {'Ontonotes': defaultdict(list), 'choi': defaultdict(list), 'BBN': defaultdict(list)}

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/FIGER_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')
        if splitted[1] != '-':
            figer_mappings['BBN'][splitted[0]].append(splitted[1])
        if splitted[2] != '-':
            figer_mappings['Ontonotes'][splitted[0]].append(splitted[2])
        if splitted[3] != '-':
            figer_mappings['choi'][splitted[0]].append(splitted[3])

In [55]:
figer_mappings

{'BBN': defaultdict(list,
             {'/art/film': ['/WORK_OF_ART'],
              '/broadcast': ['/WORK_OF_ART'],
              '/broadcast_program': ['/WORK_OF_ART'],
              '/building': ['/FACILITY/BUILDING'],
              '/building/airport': ['/FACILITY/AIRPORT'],
              '/building/dam': [''],
              '/building/hospital': ['/ORGANIZATION/HOSPITAL'],
              '/disease': ['/DISEASE'],
              '/education': ['/ORGANIZATION/EDUCATIONAL'],
              '/education/department': ['/ORGANIZATION/EDUCATIONAL'],
              '/education/educational_degree': [''],
              '/event': ['/EVENT'],
              '/event/military_conflict': ['/EVENT/WAR'],
              '/food': ['/SUBSTANCE/FOOD'],
              '/game': ['/GAME'],
              '/government': ['/ORGANIZATION/GOVERNMENT'],
              '/language': ['/LANGUAGE'],
              '/law': ['/LAW'],
              '/livingthing/animal': ['/ANIMAL'],
              '/location': ['/LOCATION'],


## Figer into BBN

In [56]:
train_figer_into_bbn = filter_data(data=train_lines, mapping=figer_mappings['BBN'])

['/PERSON'], ['/person/musician', '/person/artist', '/person']
['/PERSON'], ['/person/musician', '/person/artist', '/person']
['/PERSON'], ['/person/musician', '/person/artist', '/person']


In [57]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_figer_into_BBN.json', 'w') as out:
    for l in tqdm(train_figer_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 2495272/2495272 [04:21<00:00, 9533.76it/s] 


In [58]:
dev_figer_into_bbn = filter_data(data=dev_lines, mapping=figer_mappings['BBN'])

['/PERSON'], ['/person/athlete', '/person']
['/PERSON'], ['/person/artist', '/person/author', '/person/musician', '/person']
['/LANGUAGE', '/GPE/COUNTRY', '/LOCATION', '/PERSON'], ['/person/artist', '/location/cemetery', '/language', '/title', '/location/country', '/location', '/government_agency', '/person']


In [59]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_figer_into_BBN.json', 'w') as out:
    for l in tqdm(dev_figer_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 5005/5005 [00:00<00:00, 6281.02it/s]


In [60]:
test_figer_into_bbn = filter_data(data=test_lines, mapping=figer_mappings['BBN'])

['/ORGANIZATION', '/ORGANIZATION/EDUCATIONAL'], ['/organization', '/organization/educational_institution']
['/ORGANIZATION/EDUCATIONAL', '/ORGANIZATION', '/ORGANIZATION/EDUCATIONAL'], ['/education/department', '/organization', '/education']
['/ORGANIZATION'], ['/organization']


In [61]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_figer_into_BBN.json', 'w') as out:
    for l in tqdm(test_figer_into_bbn):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 522/522 [00:00<00:00, 7260.05it/s]


## Figer into onto

In [7]:
train_figer_into_onto = filter_data(data=train_lines, mapping=figer_mappings['Ontonotes'])

['/person/artist/music', '/person/artist', '/person'], ['/person/musician', '/person/artist', '/person']
['/person/artist/music', '/person/artist', '/person'], ['/person/musician', '/person/artist', '/person']
['/person/artist/music', '/person/artist', '/person'], ['/person/musician', '/person/artist', '/person']


In [9]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_figer_into_onto.json', 'w') as out:
    for l in tqdm(train_figer_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 2606770/2606770 [04:03<00:00, 10697.66it/s]


In [63]:
dev_figer_into_onto = filter_data(data=dev_lines, mapping=figer_mappings['Ontonotes'])

['/person/athlete', '/person'], ['/person/athlete', '/person']
['/person/artist', '/person/artist/author', '/person/artist/music', '/person'], ['/person/artist', '/person/author', '/person/musician', '/person']
['/person/artist', '/other/language', '/person/title', '/location/country', '/location', '/person'], ['/person/artist', '/location/cemetery', '/language', '/title', '/location/country', '/location', '/government_agency', '/person']


In [64]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_figer_into_Ontonotes.json', 'w') as out:
    for l in tqdm(dev_figer_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 5214/5214 [00:00<00:00, 8487.29it/s]


In [65]:
test_figer_into_onto = filter_data(data=test_lines, mapping=figer_mappings['Ontonotes'])

['/organization', '/organization/education'], ['/organization', '/organization/educational_institution']
['/organization', '/organization/education'], ['/education/department', '/organization', '/education']
['/organization'], ['/organization']


In [66]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_figer_into_Ontonotes.json', 'w') as out:
    for l in tqdm(test_figer_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 528/528 [00:00<00:00, 8351.41it/s]


## Figer into Choi

In [10]:
train_figer_into_choi = filter_data(data=train_lines, mapping=figer_mappings['choi'])

  1%|          | 24380/2684906 [00:00<00:24, 109728.61it/s]

['musician', 'artist', 'person'], ['/person/musician', '/person/artist', '/person']
['musician', 'artist', 'person'], ['/person/musician', '/person/artist', '/person']
['musician', 'artist', 'person'], ['/person/musician', '/person/artist', '/person']


100%|██████████| 2684906/2684906 [00:31<00:00, 85302.99it/s] 


In [11]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_figer_into_choi.json', 'w') as out:
    for l in tqdm(train_figer_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 2677947/2677947 [04:07<00:00, 10836.33it/s]


In [67]:
dev_figer_into_choi = filter_data(data=dev_lines, mapping=figer_mappings['choi'])

['athlete', 'person'], ['/person/athlete', '/person']
['artist', 'author', 'musician', 'person'], ['/person/artist', '/person/author', '/person/musician', '/person']
['artist', 'cemetery', 'language', 'title', 'country', 'location', 'person'], ['/person/artist', '/location/cemetery', '/language', '/title', '/location/country', '/location', '/government_agency', '/person']


In [68]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_figer_into_choi.json', 'w') as out:
    for l in tqdm(dev_figer_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 5364/5364 [00:00<00:00, 7382.24it/s]


In [69]:
test_figer_into_choi = filter_data(data=test_lines, mapping=figer_mappings['choi'])

['organization'], ['/organization', '/organization/educational_institution']
['organization', 'education'], ['/education/department', '/organization', '/education']
['organization'], ['/organization']


In [70]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_figer_into_choi.json', 'w') as out:
    for l in tqdm(test_figer_into_choi):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 556/556 [00:00<00:00, 9724.67it/s]


# Filter Choi

In [12]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/choi/train.json', 'r') as inp:
    train_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/choi/dev_tree.json', 'r') as inp:
    dev_lines = [json.loads(t) for t in inp.readlines()]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/choi/test_tree_lines.json', 'r') as inp:
    test_lines = [json.loads(t) for t in inp.readlines()]

In [13]:
from collections import defaultdict

choi_mappings = {'Ontonotes': defaultdict(list), 'figer': defaultdict(list), 'BBN': defaultdict(list)}

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/BBN_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')
        if splitted[3] != '-':
            choi_mappings['BBN'][splitted[3]].append(splitted[0])

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/FIGER_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')            
        if splitted[3] != '-':
            choi_mappings['figer'][splitted[3]].append(splitted[0])

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/OntoNotes_mappings.csv', 'r') as inp:
    lines = [l.replace('\n', '') for l in inp.readlines()]
    for l in lines[1:]:
        splitted = l.split(',')
        if splitted[3] != '-':
            choi_mappings['Ontonotes'][splitted[3]].append(splitted[0])

In [22]:
choi_mappings

{'Ontonotes': defaultdict(list,
             {'location': ['/location'],
              'celestial_body': ['/location/celestial'],
              'city': ['/location/city'],
              'country': ['/location/country'],
              'geography': ['/location/geography', '/location/geograpy'],
              'park': ['/location/park'],
              'structure': ['/location/structure'],
              'transit': ['/location/transit', '/organization/transit'],
              'body_of_water': ['/location/geography/body_of_water'],
              'island': ['/location/geography/island',
               '/location/geograpy/island'],
              'mountain': ['/location/geography/mountain'],
              'airport': ['/location/structure/airport'],
              'government': ['/location/structure/government',
               '/organization/government'],
              'hospital': ['/location/structure/hospital'],
              'hotel': ['/location/structure/hotel'],
              'restaurant': ['

## Choi into BBN

In [14]:
train_choi_into_BBN = filter_data(data=train_lines, mapping=choi_mappings['BBN'])

  1%|          | 26991/3549962 [00:00<00:25, 137722.93it/s]

['/PERSON'], ['actor', 'artist', 'person']
['/ORGANIZATION'], ['organization', 'league']
['/GPE/COUNTRY', '/ORGANIZATION/GOVERNMENT', '/LOCATION', '/ORGANIZATION/GOVERNMENT'], ['country', 'government', 'location', 'park', 'government', 'agency']


100%|██████████| 3549962/3549962 [00:19<00:00, 179231.95it/s]


In [16]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_choi_into_BBN.json', 'w') as out:
    for l in tqdm(train_choi_into_BBN):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 475182/475182 [00:58<00:00, 8119.70it/s]


In [73]:
dev_choi_into_BBN = filter_data(data=dev_lines, mapping=choi_mappings['BBN'])

['/GPE/COUNTRY', '/LOCATION'], ['country', 'location']
['/PERSON'], ['artist', 'author', 'musician', 'person']
['/ORGANIZATION'], ['organization', 'company']


In [74]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_choi_into_BBN.json', 'w') as out:
    for l in tqdm(dev_choi_into_BBN):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 19831/19831 [00:02<00:00, 8137.87it/s]


In [75]:
test_choi_into_BBN = filter_data(data=test_lines, mapping=choi_mappings['BBN'])

['/EVENT'], ['gathering', 'confluence', 'ceremony', 'conference', 'meeting', 'event']
['/PERSON'], ['serviceman', 'politician', 'statesman', 'policeman', 'official', 'spokesman', 'spokesperson', 'person']
['/ORGANIZATION', '/ORGANIZATION/GOVERNMENT'], ['committee', 'legislature', 'organization', 'administration', 'assembly', 'place', 'government']


In [76]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_choi_into_BBN.json', 'w') as out:
    for l in tqdm(test_choi_into_BBN):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1475/1475 [00:00<00:00, 9318.39it/s]


## Choi into Onto

In [17]:
train_choi_into_Onto = filter_data(data=train_lines, mapping=choi_mappings['Ontonotes'])

  1%|          | 19960/3549962 [00:00<00:36, 97969.99it/s]

['/person/artist/actor', '/person/artist', '/person'], ['actor', 'artist', 'person']
['/organization', '/organization/sports_league'], ['organization', 'league']
['/location/country', '/location/structure/government', '/organization/government', '/location', '/location/park', '/location/structure/government', '/organization/government'], ['country', 'government', 'location', 'park', 'government', 'agency']


100%|██████████| 3549962/3549962 [00:27<00:00, 127552.95it/s]


In [18]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_choi_into_Onto.json', 'w') as out:
    for l in tqdm(train_choi_into_Onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 561162/561162 [01:08<00:00, 8248.49it/s] 


In [77]:
dev_choi_into_onto = filter_data(data=dev_lines, mapping=choi_mappings['Ontonotes'])

['/location/country', '/location'], ['country', 'location']
['/person/artist', '/person/artist/author', '/person'], ['artist', 'author', 'musician', 'person']
['/organization', '/organization/company'], ['organization', 'company']


In [78]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_choi_into_onto.json', 'w') as out:
    for l in tqdm(dev_choi_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 24221/24221 [00:03<00:00, 7983.17it/s]


In [79]:
test_choi_into_onto = filter_data(data=test_lines, mapping=choi_mappings['Ontonotes'])

['/other/event'], ['gathering', 'confluence', 'ceremony', 'conference', 'meeting', 'event']
['/person/political_figure', '/person'], ['serviceman', 'politician', 'statesman', 'policeman', 'official', 'spokesman', 'spokesperson', 'person']
['/organization', '/location/structure/government', '/organization/government'], ['committee', 'legislature', 'organization', 'administration', 'assembly', 'place', 'government']


In [80]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_choi_into_onto.json', 'w') as out:
    for l in tqdm(test_choi_into_onto):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1532/1532 [00:00<00:00, 8740.26it/s]


## Choi into FIGER

In [19]:
train_choi_into_figer = filter_data(data=train_lines, mapping=choi_mappings['figer'])

  1%|          | 22791/3549962 [00:00<00:30, 115232.18it/s]

['/person/actor', '/person/artist', '/person'], ['actor', 'artist', 'person']
['/organization', '/organization/sports_league'], ['organization', 'league']
['/location/country', '/government', '/government/government', '/location', '/park', '/government', '/government/government'], ['country', 'government', 'location', 'park', 'government', 'agency']


100%|██████████| 3549962/3549962 [00:57<00:00, 61482.90it/s] 


In [20]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/train_choi_into_figer.json', 'w') as out:
    for l in tqdm(train_choi_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 667679/667679 [01:21<00:00, 8199.27it/s] 


In [81]:
dev_choi_into_figer = filter_data(data=dev_lines, mapping=choi_mappings['figer'])

['/location/country', '/location'], ['country', 'location']
['/person/artist', '/person/author', '/person/musician', '/person'], ['artist', 'author', 'musician', 'person']
['/organization', '/organization/company'], ['organization', 'company']


In [82]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/dev_choi_into_figer.json', 'w') as out:
    for l in tqdm(dev_choi_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 28386/28386 [00:03<00:00, 7445.05it/s]


In [83]:
test_choi_into_figer = filter_data(data=test_lines, mapping=choi_mappings['figer'])

['/time'], ['date', 'weekday', 'time', 'day']
['/event'], ['gathering', 'confluence', 'ceremony', 'conference', 'meeting', 'event']
['/person/politician', '/person'], ['serviceman', 'politician', 'statesman', 'policeman', 'official', 'spokesman', 'spokesperson', 'person']


In [84]:
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/mapped_datasets/test_choi_into_figer.json', 'w') as out:
    for l in tqdm(test_choi_into_figer):
        json.dump(l, out)
        out.write('\n')

100%|██████████| 1618/1618 [00:00<00:00, 8991.49it/s]
