In [1]:
# Folder holding the corpora and the sub-folder of the corpus analysed below;
# the two are concatenated (with a file name) to build every input path.
base_dataset_path = '../../datasets/ren_et_al/'
dataset_name_path = 'figer/'

# Not read by the cells visible here -- presumably the destination of the
# partitioned datasets built later in the notebook (TODO confirm).
dir_for_partitioned_datasets = '../../datasets/datasets_for_incremental_training/'

# File names, relative to base_dataset_path + dataset_name_path.
types_list_path = 'all_types.txt'
train_path, dev_path, test_path = 'train.json', 'dev.json', 'test.json'

In [2]:
# Read the type inventory: one type path per line, newline characters removed.
types_file = base_dataset_path + dataset_name_path + types_list_path
with open(types_file, 'r') as inp:
    types = [raw_line.replace('\n', '') for raw_line in inp]
types

['/art',
 '/art/film',
 '/astral_body',
 '/award',
 '/biology',
 '/body_part',
 '/broadcast',
 '/broadcast/tv_channel',
 '/broadcast_network',
 '/broadcast_program',
 '/building',
 '/building/airport',
 '/building/dam',
 '/building/hospital',
 '/building/hotel',
 '/building/library',
 '/building/power_station',
 '/building/restaurant',
 '/building/sports_facility',
 '/building/theater',
 '/chemistry',
 '/computer',
 '/computer/algorithm',
 '/computer/programming_language',
 '/disease',
 '/education',
 '/education/department',
 '/education/educational_degree',
 '/event',
 '/event/attack',
 '/event/election',
 '/event/military_conflict',
 '/event/natural_disaster',
 '/event/protest',
 '/event/sports_event',
 '/event/terrorist_attack',
 '/finance',
 '/finance/currency',
 '/finance/stock_exchange',
 '/food',
 '/game',
 '/geography',
 '/geography/glacier',
 '/geography/island',
 '/geography/mountain',
 '/god',
 '/government',
 '/government/government',
 '/government/political_party',
 '/gov

In [19]:
from networkx import DiGraph

# Build the type hierarchy as a directed graph: one node per type and an edge
# from a type to every type whose path starts with "<type>/".
# A prefix test is used on purpose: a plain substring test would also link
# '/art' to a path such as '/misc/art/x', which is not nested below '/art'.
graph = DiGraph()
graph.add_nodes_from(types)
graph.add_edges_from(
    (ancestor, descendant)
    for ancestor in types
    for descendant in types
    if descendant.startswith(ancestor + '/')
)

In [27]:
# Spot check: nodes reached by an outgoing edge from '/broadcast'.
list(graph.successors('/broadcast'))

['/broadcast/tv_channel']

In [28]:
# Spot check: nodes with an edge pointing to '/art'.
list(graph.predecessors('/art'))

[]

In [29]:
# Classify every node in a single pass:
#   father   -> has at least one outgoing edge,
#   son      -> has at least one incoming edge,
#   isolated -> has neither.
n_fathers = n_sons = n_isolated = 0
for node in graph.nodes:
    has_sons = bool(list(graph.successors(node)))
    has_fathers = bool(list(graph.predecessors(node)))
    if has_sons:
        n_fathers += 1
    if has_fathers:
        n_sons += 1
    if not (has_sons or has_fathers):
        n_isolated += 1

print('Number of types: {}'.format(len(graph.nodes)))
print('Number of fathers: {}'.format(n_fathers))
print('Number of sons: {}'.format(n_sons))
print('Number of isolated: {}'.format(n_isolated))

Number of types: 127
Number of fathers: 22
Number of sons: 79
Number of isolated: 26


# Count the examples by type

In [110]:
import json
# Load the dev split: the file is parsed as one JSON document per line.
dev_file = base_dataset_path + dataset_name_path + dev_path
with open(dev_file, 'r') as inp:
    lines = [json.loads(raw_line) for raw_line in inp]

In [111]:
from collections import defaultdict
from tqdm import tqdm

def _count_type_with_subtypes(type_name, examples):
    """Count the examples labelled with `type_name`, broken down by subtype.

    Args:
        type_name: type path to look for (e.g. '/art').
        examples: iterable of dicts whose 'y_str' entry holds the type paths
            assigned to the example.

    Returns:
        A dict-like counter with:
          * `type_name`  -> number of examples labelled with it;
          * each subtype -> number of those examples also labelled with it;
          * '-'          -> number of those examples without any subtype
                            (the key is absent when that number is zero).
    """
    # A subtype is a label *starting with* "<type_name>/". A substring test
    # would wrongly treat '/misc/art/x' as a subtype of '/art'.
    subtype_prefix = type_name + '/'
    counter = defaultdict(int)
    for example in examples:
        labels = example['y_str']
        if type_name not in labels:
            continue
        counter[type_name] += 1
        subtypes = [label for label in labels if label.startswith(subtype_prefix)]
        for subtype in subtypes:
            counter[subtype] += 1
        if not subtypes:
            counter['-'] += 1
    return counter


def _counter_rows(counter, type_name, total_label, n_examples):
    """Turn a counter into ordered (label, absolute, relative) report rows.

    Row order: the '-' row, every subtype in first-seen order, then the row of
    `type_name` itself, displayed as `total_label`. The relative value is the
    count divided by `n_examples`, or 0.0 when there are no examples at all.
    """
    def relative(count):
        # Guard against an empty dataset instead of raising ZeroDivisionError.
        return count / n_examples if n_examples else 0.0

    rows = [('-', counter.get('-', 0))]
    rows.extend((label, count) for label, count in counter.items()
                if label not in ('-', type_name))
    rows.append((total_label, counter.get(type_name, 0)))
    return [(label, count, relative(count)) for label, count in rows]


def print_counter_table(type_name, examples=None):
    """Print a pipe-delimited table with the counts of `type_name` and its subtypes.

    A tqdm progress bar is shown while the examples are scanned.

    Args:
        type_name: type path to report on (e.g. '/art').
        examples: optional iterable of example dicts (each with a 'y_str'
            entry); defaults to the notebook-level `lines`.
    """
    examples = lines if examples is None else list(examples)
    counter = _count_type_with_subtypes(type_name, tqdm(examples))

    print('|{:^20}|{:^20}|{:^20}|'.format('Type', 'Absolute', 'Relative'))
    print('|{:-^20}|{:-^20}|{:-^20}|'.format('', '', ''))
    for label, absolute, relative in _counter_rows(counter, type_name, type_name, len(examples)):
        print('|{:^20}|{:^20}|{:^20.4f}|'.format(label, absolute, relative))


def print_counter_tsv(type_name, examples=None):
    """Print the counts of `type_name` and its subtypes as tab-separated rows.

    Same figures as `print_counter_table`, without header or progress bar; the
    last row is labelled 'TOTAL' instead of with the type name.

    Args:
        type_name: type path to report on (e.g. '/art').
        examples: optional iterable of example dicts (each with a 'y_str'
            entry); defaults to the notebook-level `lines`.
    """
    examples = lines if examples is None else list(examples)
    counter = _count_type_with_subtypes(type_name, examples)

    for label, absolute, relative in _counter_rows(counter, type_name, 'TOTAL', len(examples)):
        print('{:^20}\t{:^20}\t{:^20.4f}'.format(label, absolute, relative))
    
        

In [112]:
# Number of examples loaded from the dev split.
len(lines)

1094

In [113]:
# Inspect one example to see the record layout (context tokens, mention, 'y_str' labels).
lines[0]

{'left_context_token': ['"',
  'Jihadism',
  '"',
  'in',
  'this',
  'sense',
  'covers',
  'both',
  'Mujahideen',
  'guerilla',
  'warfare',
  'and',
  'Islamic',
  'terrorism',
  'with',
  'an',
  'international',
  'scope',
  'as',
  'it',
  'arose',
  'from',
  'the',
  '1980s',
  ',',
  'since',
  'the',
  '1990s',
  'substantially',
  'represented',
  'by',
  'the'],
 'mention_span': 'al-Qaeda',
 'right_context_token': ['network', '.'],
 'y_str': ['/government',
  '/government/political_party',
  '/organization',
  '/organization/terrorist_organization']}

In [114]:
# Type whose example counts are reported by the next cells.
type_name = '/art'

In [115]:
# Human-readable table of the counts for the selected type.
print_counter_table(type_name)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1094/1094 [00:00<00:00, 626169.29it/s]

|        Type        |      Absolute      |      Relative      |
|--------------------|--------------------|--------------------|
|         -          |         1          |       0.0009       |
|     /art/film      |         8          |       0.0073       |
|        /art        |         9          |       0.0082       |





In [116]:
# Same counts as tab-separated rows.
print_counter_tsv(type_name)

         -          	         1          	       0.0009       
     /art/film      	         8          	       0.0073       
       TOTAL        	         9          	       0.0082       


In [117]:
# Collect every type that has at least one outgoing edge, then print the
# tab-separated counts for each of them.
fathers = []
for node in graph.nodes:
    if list(graph.successors(node)):
        fathers.append(node)

for f in fathers:
    print_counter_tsv(f)

         -          	         1          	       0.0009       
     /art/film      	         8          	       0.0073       
       TOTAL        	         9          	       0.0082       
         -          	         0          	       0.0000       
/broadcast/tv_channel	         1          	       0.0009       
       TOTAL        	         1          	       0.0009       
         -          	         30         	       0.0274       
 /building/theater  	         2          	       0.0018       
/building/sports_facility	         6          	       0.0055       
/building/restaurant	         1          	       0.0009       
 /building/hospital 	         1          	       0.0009       
 /building/airport  	         1          	       0.0009       
  /building/hotel   	         1          	       0.0009       
       TOTAL        	         42         	       0.0384       
         -          	         0          	       0.0000       
       TOTAL        	         0          	       

# select the type

In [3]:
# select by father: create datasets with/without each son of the given father