In [23]:
import json

def load_dataset(dataset_path):
    """Load a JSON-lines dataset into a list of parsed records.

    Args:
        dataset_path: path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not depend on the platform's
    # default locale encoding.
    with open(dataset_path, 'r', encoding='utf-8') as inp:
        # Iterate the file lazily instead of materialising readlines(), and
        # skip blank lines (e.g. a trailing empty line) that json.loads rejects.
        return [json.loads(line) for line in inp if line.strip()]

def print_lines(lines):
    """Report on standard output how many examples ``lines`` contains."""
    n_examples = len(lines)
    print('{} lines in the dataset'.format(n_examples))

def get_type_number(lines):
    """Print how many distinct type labels occur across all examples.

    Every element of ``lines`` is indexed with ``'y_str'`` and the items of
    that value are pooled into one set before being counted.
    """
    distinct_types = {label for example in lines for label in example['y_str']}
    print('{} types in this dataset'.format(len(distinct_types)))

def get_types(lines):
    """Collect the distinct labels found under ``'y_str'`` in every example.

    Returns:
        A set containing each label once; empty when ``lines`` is empty.
    """
    collected = set()
    for example in lines:
        collected.update(example['y_str'])
    return collected

def get_exclusive_types(train_data, other):
    """Print how many types of ``other`` never occur in ``train_data``.

    Both arguments are passed through ``get_types``; the printed number is the
    size of the set difference (types of ``other`` minus types of
    ``train_data``).
    """
    seen_in_train = get_types(train_data)
    unseen_types = get_types(other) - seen_in_train
    print('{} exclusive types in this dataset'.format(len(unseen_types)))

from collections import Counter
def get_most_present_type(lines):
    """Print the most frequent type and its frequency relative to the dataset size.

    Occurrences are counted over the ``'y_str'`` entry of every example; the
    reported ratio is the winning type's occurrence count divided by
    ``len(lines)``.

    Args:
        lines: sequence of examples, each indexable with ``'y_str'``.

    Returns:
        None. The result is printed on standard output.
    """
    type_counts = Counter(label for example in lines for label in example['y_str'])

    # An empty dataset (or one carrying no labels at all) used to crash in
    # max() with "ValueError: max() arg is an empty sequence"; report it instead.
    if not type_counts:
        print('No types found in this dataset')
        return

    most_freq_type = max(type_counts, key=type_counts.get)
    rel_freq = type_counts[most_freq_type] / len(lines)

    print('{} is the most frequent type, appears in the {:.2f} of sentences'.format(most_freq_type, rel_freq))

import numpy as np
def get_average_type(lines):
    """Print the mean length of the ``'y_str'`` entry over all examples."""
    labels_per_example = [len(example['y_str']) for example in lines]
    mean_labels = np.mean(labels_per_example)
    print('There are {:.2f} types x example on average'.format(mean_labels))

In [49]:
# Directory holding the train/dev/test JSON files. It is the single place to
# edit when the data lives elsewhere; the three paths below derive from it.
DATA_DIR = '/datahdd/vmanuel/entity_typing_all_datasets/data/balanced_ontonotes'

train_dataset_path = DATA_DIR + '/train.json'
dev_dataset_path = DATA_DIR + '/dev.json'
test_dataset_path = DATA_DIR + '/test.json'

In [50]:
# Parse the three splits, in train / dev / test order.
train, dev, test = [
    load_dataset(path)
    for path in (train_dataset_path, dev_dataset_path, test_dataset_path)
]

## Print the number of lines in each dataset

In [51]:
# Report the size of each split, in train / dev / test order.
for dataset in (train, dev, test):
    print_lines(dataset)

249592 lines in the dataset
712 lines in the dataset
735 lines in the dataset


## Get the number of distinct types

In [52]:
# Count the distinct types of each split, in train / dev / test order.
for dataset in (train, dev, test):
    get_type_number(dataset)

87 types in this dataset
89 types in this dataset
88 types in this dataset


## Get Exclusive Types (types that never appear in the training set)

In [53]:
# Compare each held-out split (dev first, then test) against the training set.
for held_out in (dev, test):
    get_exclusive_types(train, held_out)

2 exclusive types in this dataset
1 exclusive types in this dataset


## Get Most Frequent Type

In [54]:
# Report the dominant type of each split, in train / dev / test order.
for dataset in (train, dev, test):
    get_most_present_type(dataset)


/other is the most frequent type, appears in the 0.37 of sentences
/other is the most frequent type, appears in the 0.49 of sentences
/other is the most frequent type, appears in the 0.47 of sentences


## Get Average Number of Types per Example

In [55]:
# Report the mean number of types per example, in train / dev / test order.
for dataset in (train, dev, test):
    get_average_type(dataset)

There are 2.79 types x example on average
There are 3.29 types x example on average
There are 3.16 types x example on average
