In [22]:
import json
from tqdm import tqdm
from collections import defaultdict, Counter, OrderedDict

class OrderedCounter(Counter, OrderedDict):
    """Counter that also inherits from OrderedDict; it adds no behaviour of its own."""

In [2]:
def filter_dataset(dataset, mapping):
    """Split a dataset according to whether each example's types are covered by `mapping`.

    Every type in an example's ``'y_str'`` list is looked up in `mapping`.
    A type with a non-empty list of targets is replaced by those targets;
    any other type counts as unmapped and is kept as-is.

    Parameters
    ----------
    dataset : iterable of dict
        Examples; each must contain a ``'y_str'`` list of type strings.
    mapping : dict-like, str -> list of str
        Source type to target types. A missing key and an empty list both
        mean "no mapping". Lookups use ``.get`` so that a ``defaultdict`` is
        not grown as a side effect and a plain ``dict`` is accepted too.

    Returns
    -------
    (mapped_dataset, unmapped_dataset) : tuple of list of dict
        ``mapped_dataset`` holds a shallow copy of every example with at least
        one mapped type: ``'y_str'`` contains only the mapped targets and
        ``'original_types'`` contains the mapped targets followed by the
        unmapped source types.
        ``unmapped_dataset`` holds a shallow copy of every example with at
        least one unmapped type: ``'y_str'`` contains the mapped targets
        followed by the unmapped source types.
        An example with both kinds of types appears in both lists.
    """
    mapped_dataset = []
    unmapped_dataset = []

    for d in tqdm(dataset):
        new_types = []
        non_mapped_types = []

        for t in d['y_str']:
            # .get() avoids inserting empty entries into a defaultdict mapping.
            targets = mapping.get(t)
            if targets:
                new_types.extend(targets)
            else:
                non_mapped_types.append(t)

        # Every field except the type list is carried over unchanged.
        other_fields = {k: v for k, v in d.items() if k != 'y_str'}

        if new_types:
            mapped_d = dict(other_fields)
            mapped_d['y_str'] = list(new_types)
            mapped_d['original_types'] = new_types + non_mapped_types
            mapped_dataset.append(mapped_d)

        if non_mapped_types:
            unmapped_d = dict(other_fields)
            unmapped_d['y_str'] = new_types + non_mapped_types
            unmapped_dataset.append(unmapped_d)
    return mapped_dataset, unmapped_dataset
            

In [3]:
def types_stats(dataset):
    """Compute the distinct types of a dataset and the mean number of types per example.

    Parameters
    ----------
    dataset : iterable of dict
        Examples; each must contain a ``'y_str'`` collection of type strings.

    Returns
    -------
    (unique_types, avg_types) : (set of str, float)
        The set of all types seen and the average length of ``'y_str'``.
        For an empty dataset the average is ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    unique_types = set()
    total_types = 0
    n_examples = 0
    for d in tqdm(dataset):
        # In-place update avoids rebuilding the whole set on every example.
        unique_types.update(d['y_str'])
        total_types += len(d['y_str'])
        n_examples += 1
    avg_types = total_types / n_examples if n_examples else 0.0
    return unique_types, avg_types

In [4]:
def partially_translate_dataset(dataset, trans_dict):
    """Return a copy of `dataset` whose ``'y_str'`` types are replaced through `trans_dict`.

    Each type in an example's ``'y_str'`` is replaced by the list
    ``trans_dict[type]`` (all of its entries, in order). The remaining fields
    are shallow-copied, and ``'y_str'`` is always the last key of the result.
    """
    translated = []
    for example in tqdm(dataset):
        replaced = [target for source in example['y_str'] for target in trans_dict[source]]
        record = {key: value for key, value in example.items() if key != 'y_str'}
        record['y_str'] = replaced
        translated.append(record)
    return translated

# OntoNotes

In [5]:
# Read the OntoNotes train / dev / test files; every line is parsed as a JSON document.
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_train_tree.json') as inp:
    train_lines = [json.loads(line) for line in inp]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_dev_tree.json') as inp:
    dev_lines = [json.loads(line) for line in inp]
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/ontonotes/g_test_tree_lines.json') as inp:
    test_lines = [json.loads(line) for line in inp]

In [6]:
from collections import defaultdict

# One "source type -> list of target types" table per target ontology.
ontonotes_mappings = {name: defaultdict(list) for name in ('FIGER', 'choi', 'BBN')}

# CSV column index -> ontology whose mapping that column holds; '-' marks "no mapping".
column_targets = ((1, 'BBN'), (2, 'FIGER'), (3, 'choi'))

with open('/datahdd/vmanuel/entity_typing_all_datasets/data/entity_typing_original_datasets/OntoNotes_mappings.csv', 'r') as inp:
    rows = [raw.replace('\n', '').split(',') for raw in inp.readlines()]
    # The first row is skipped (treated as a header).
    for fields in rows[1:]:
        for column, ontology in column_targets:
            if fields[column] != '-':
                ontonotes_mappings[ontology][fields[0]].append(fields[column])

In [7]:
# Ontology file: one type per line, collected into a set.
with open('/datahdd/vmanuel/entity_typing_all_datasets/data/ontology/onto_ontology.txt') as inp:
    types = {line.replace('\n', '') for line in inp}

# Onto Filtered with BBN

In [8]:
# Use the 'BBN' table of ontonotes_mappings for the cells of this section.
mapping = ontonotes_mappings['BBN']

In [9]:
# For every ontology type: its mapped targets when the mapping has any,
# otherwise the type itself (so unmapped types survive translation unchanged).
partially_translation_dict = defaultdict(list)

for t in types:
    targets = mapping[t]
    partially_translation_dict[t].extend(targets if targets else [t])

In [10]:
# Rewrite the training types through partially_translation_dict.
partially_translated_dataset = partially_translate_dataset(train_lines, partially_translation_dict)

100%|██████████| 251039/251039 [00:01<00:00, 148868.01it/s]


In [11]:
# Split the training examples into those with mapped types and those with unmapped types.
mapped_train, unmapped_train = filter_dataset(train_lines, mapping)

100%|██████████| 251039/251039 [00:03<00:00, 75771.21it/s] 


In [12]:
# Distinct types and average number of types per example for each variant of the training set.
original_types, original_txe = types_stats(train_lines)
translated_types, translated_txe = types_stats(partially_translated_dataset)
mapped_types, mapped_txe = types_stats(mapped_train)
unmapped_types, unmapped_txe = types_stats(unmapped_train)

100%|██████████| 251039/251039 [00:00<00:00, 381703.57it/s]
100%|██████████| 251039/251039 [00:00<00:00, 496034.48it/s]
100%|██████████| 229362/229362 [00:00<00:00, 652599.54it/s]
100%|██████████| 180298/180298 [00:00<00:00, 341603.44it/s]


In [13]:
# Ontology types that have at least one target in the current mapping.
list_of_mapped_types = [t for t in types if mapping[t]]

In [14]:
# Report how many ontology types have a BBN mapping. The ratio is computed over
# len(types), the same denominator that the message prints.
print('mapped types from Onto to BBN: {} on {}, {:.2f}'.format(len(list_of_mapped_types),
                                                               len(types),
                                                               len(list_of_mapped_types) / len(types)))

mapped types from Onto to BBN: 31 on 89, 0.35


In [15]:
# Summary table: one column per dataset variant, one row per statistic.
row_fmt = '{:^12}|{:^25}|{:^25}|{:^25}|{:^25}'
datasets = (train_lines, partially_translated_dataset, mapped_train, unmapped_train)
type_sets = (original_types, translated_types, mapped_types, unmapped_types)

print(row_fmt.format('onto -> bbn', 'original dataset', 'original traduced dataset', 'mapped dataset', 'unmapped dataset'))
print('{:-^12}-{:-^25}-{:-^25}-{:-^25}-{:-^25}'.format('', '', '', '', ''))
print(row_fmt.format('mentions', *[len(ds) for ds in datasets]))
print(row_fmt.format('types', *[len(ts) for ts in type_sets]))
print('{:^12}|{:^25.2f}|{:^25.2f}|{:^25.2f}|{:^25.2f}'.format('avg_types', original_txe, translated_txe, mapped_txe, unmapped_txe))

onto -> bbn |    original dataset     |original traduced dataset|     mapped dataset      |    unmapped dataset     
--------------------------------------------------------------------------------------------------------------------
  mentions  |         251039          |         251039          |         229362          |         180298          
   types    |           89            |           87            |           29            |           87            
 avg_types  |          2.79           |          2.79           |          1.55           |          3.19           


## Compute statistics on the unmapped dataset

In [16]:
# Types that occur in the translated dataset but never in the unmapped subset.
missing_types = translated_types.difference(unmapped_types)

In [18]:
# Display how many of the translated types are absent from the unmapped subset.
"missing types in unmapped dataset: {} on {}".format(len(missing_types), len(translated_types))

'missing types in unmapped dataset: 0 on 87'

In [33]:
# Per-type occurrence counts, most frequent first, for the unmapped subset ...
all_types = [label for example in unmapped_train for label in example['y_str']]
unmapped_c = dict(Counter(all_types).most_common())

# ... and for the whole translated training set.
all_types = [label for example in partially_translated_dataset for label in example['y_str']]
translated_c = dict(Counter(all_types).most_common())

In [37]:
# For every type in the unmapped subset, show its count there next to its count
# in the whole translated dataset. Iterates unmapped_c: the previously used name
# `c` is not defined anywhere in the notebook and fails on a fresh kernel.
print('{:>40}: {:^15} | {:^15}'.format('Class', 'unmapped occ', 'total occ'))
print('{:->40}---{:-^15}---{:-^15}'.format('', '', ''))
for k, v in unmapped_c.items():
    print('{:>40}: {:>15} | {:<15}'.format(k, v, translated_c[k]))

                                   Class:  unmapped occ   |    total occ   
----------------------------------------------------------------------------
                                  /other:           92208 | 92208          
                                 /PERSON:           83228 | 89531          
                           /person/title:           37970 | 37970          
                          /person/artist:           35496 | 35496          
                   /person/artist/author:           28998 | 28998          
                           /ORGANIZATION:           23539 | 44070          
                /person/political_figure:           21455 | 21455          
                            /WORK_OF_ART:           17635 | 17635          
                    /person/artist/actor:           16404 | 16404          
                               /LOCATION:           12407 | 60583          
               /ORGANIZATION/CORPORATION:           11792 | 21680          
           

# Onto Filtered with FIGER

In [48]:
# Use the 'FIGER' table of ontonotes_mappings for the cells of this section.
mapping = ontonotes_mappings['FIGER']

In [49]:
# For every ontology type: its mapped targets when the mapping has any,
# otherwise the type itself (so unmapped types survive translation unchanged).
partially_translation_dict = defaultdict(list)

for t in types:
    targets = mapping[t]
    partially_translation_dict[t].extend(targets if targets else [t])

In [50]:
# Rewrite the training types through partially_translation_dict.
partially_translated_dataset = partially_translate_dataset(train_lines, partially_translation_dict)

100%|██████████| 251039/251039 [00:02<00:00, 101245.77it/s]


In [51]:
# filter_dataset is defined in this notebook with two parameters (dataset, mapping);
# passing a third positional argument raises TypeError on a fresh kernel.
mapped_train, unmapped_train = filter_dataset(train_lines, mapping)

100%|██████████| 251039/251039 [00:02<00:00, 99606.12it/s] 


In [52]:
# Distinct types and average number of types per example for each variant of the training set.
original_types, original_txe = types_stats(train_lines)
translated_types, translated_txe = types_stats(partially_translated_dataset)
mapped_types, mapped_txe = types_stats(mapped_train)
unmapped_types, unmapped_txe = types_stats(unmapped_train)

100%|██████████| 251039/251039 [00:00<00:00, 400177.06it/s]
100%|██████████| 251039/251039 [00:00<00:00, 482347.79it/s]
100%|██████████| 243909/243909 [00:00<00:00, 488785.95it/s]
100%|██████████| 107317/107317 [00:00<00:00, 298855.17it/s]


In [53]:
# Ontology types that have at least one target in the current mapping.
list_of_mapped_types = [t for t in types if mapping[t]]

In [54]:
# Report how many ontology types have a FIGER mapping (this section uses
# ontonotes_mappings['FIGER'], so the label must not say BBN). The ratio is
# computed over len(types), the same denominator that the message prints.
print('mapped types from Onto to FIGER: {} on {}, {:.2f}'.format(len(list_of_mapped_types),
                                                                 len(types),
                                                                 len(list_of_mapped_types) / len(types)))

mapped types from Onto to BBN: 75 on 89, 0.84


In [55]:
# Summary table for the FIGER mapping: one column per dataset variant, one row
# per statistic. The corner label names FIGER (it previously said bbn) and is
# written without spaces so it still fits the 12-character column.
print('{:^12}|{:^25}|{:^25}|{:^25}|{:^25}'.format('onto->figer', 'original dataset', 'original traduced dataset', 'mapped dataset', 'unmapped dataset'))
print('{:-^12}-{:-^25}-{:-^25}-{:-^25}-{:-^25}'.format('', '', '', '', ''))
print('{:^12}|{:^25}|{:^25}|{:^25}|{:^25}'.format('mentions', len(train_lines), len(partially_translated_dataset), len(mapped_train), len(unmapped_train)))
print('{:^12}|{:^25}|{:^25}|{:^25}|{:^25}'.format('types', len(original_types), len(translated_types), len(mapped_types), len(unmapped_types)))
print('{:^12}|{:^25.2f}|{:^25.2f}|{:^25.2f}|{:^25.2f}'.format('avg_types', original_txe, translated_txe, mapped_txe, unmapped_txe))

onto -> bbn |    original dataset     |original traduced dataset|     mapped dataset      |    unmapped dataset     
--------------------------------------------------------------------------------------------------------------------
  mentions  |         251039          |         251039          |         243909          |         107317          
   types    |           89            |           88            |           74            |           80            
 avg_types  |          2.79           |          2.89           |          2.40           |          3.36           


In [69]:
# Inspect the first example of the mapped subset.
mapped_train[0]

{'annot_id': '2',
 'left_context_token': ['Usually'],
 'mention_span': 'directors',
 'mention_span_tree': [{'dep': 'ROOT',
   'is_alpha': True,
   'is_stop': False,
   'lemma': 'director',
   'pos': 'NOUN',
   'shape': 'xxxx',
   'tag': 'NNS',
   'text': 'directors'}],
 'original_types': ['/PERSON', '/person/title'],
 'right_context_token': [',',
  'otherwise',
  ',',
  'they',
  'have',
  'beards',
  'and',
  'very',
  'long',
  'hair',
  ',',
  'or',
  'otherwise',
  'they',
  'shave',
  'their',
  'heads',
  '.'],
 'y': [4, 1],
 'y_str': ['/PERSON'],
 'y_type': [0, 0],
 'y_type_str': ['KB', 'KB']}