In [None]:
import json
from build.config import STORAGE_FOLDER
import os.path as osp 

# Load wikifactdiff.jsonl: one JSON document per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding, and whitespace-only lines are skipped because
# json.loads() would raise on them.
with open(osp.join(STORAGE_FOLDER, 'wikifactdiff.jsonl'), encoding='utf-8') as f:
    wfd = [json.loads(line) for line in f if line.strip()]

In [None]:
from collections import Counter
def print_stats(dataset):
    """Print summary statistics for an iterable of update records.

    Each record is a dict from which the following keys are read:
    ``subject`` and ``relation`` (dicts holding an ``id``),
    ``subject_is_ph_new`` (a flag that is summed, so bool/int) and
    ``objects`` (a list of dicts holding a ``decision`` and a ``label``,
    plus an optional ``id``).

    Reported counts:
      * Triples: total number of objects over all records.
      * Subjects / Relations: distinct subject / relation ids.
      * Objects: distinct objects, keyed by ``id``, or by ``label`` when the
        object has no ``id`` (missing or ``None``).
      * Entity objects: distinct non-empty object ids.
      * Literal objects: Objects minus Entity objects.
      * Updates: number of records.
      * Replacements: records with exactly two objects, one 'learn' and one
        'forget'.
      * EntityInsertion: sum of the ``subject_is_ph_new`` flags.
      * Oblivion: records with exactly one object whose decision is 'forget'.
      * AddNewInfo: ``subject_is_ph_new`` is falsy, at least one 'learn', and
        no 'forget' or 'keep'.
      * AddInfo: at least one 'learn', no 'forget', at least one 'keep'.
      * Others: Updates minus the five category counts above.

    The report is written to stdout; nothing is returned.
    """
    subject_ids, relation_ids = set(), set()
    object_keys, entity_ids = set(), set()
    n_triples = 0
    n_updates = 0
    n_replacement = n_entity_insertion = n_oblivion = 0
    n_addnewinfo = n_addinfo = 0

    # Single pass over the records: every statistic is accumulated here, so the
    # dataset is traversed once and the decision Counter is built once per record.
    for update in dataset:
        objects = update['objects']
        n_updates += 1
        n_triples += len(objects)
        subject_ids.add(update['subject']['id'])
        relation_ids.add(update['relation']['id'])

        for obj in objects:
            obj_id = obj.get('id')
            # Objects without an id are identified by their label instead.
            object_keys.add(obj['label'] if obj_id is None else obj_id)
            if obj_id:
                entity_ids.add(obj_id)

        decisions = Counter(obj['decision'] for obj in objects)
        n_learn = decisions['learn']
        n_forget = decisions['forget']
        n_keep = decisions['keep']
        # len(objects) equals the sum of all decision counts; it is used instead
        # of Counter.total(), which only exists on Python >= 3.10.
        n_decisions = len(objects)
        is_new_subject = update['subject_is_ph_new']

        # Booleans are added as 0/1. The categories are counted independently
        # of each other (they are not forced to be mutually exclusive).
        n_entity_insertion += is_new_subject
        n_replacement += (n_decisions == 2 and n_learn == n_forget == 1)
        n_oblivion += (n_decisions == 1 and n_forget == 1)
        n_addnewinfo += (not is_new_subject and n_learn > 0
                         and n_forget == 0 and n_keep == 0)
        n_addinfo += (n_learn > 0 and n_forget == 0 and n_keep > 0)

    n_subjects = len(subject_ids)
    n_relations = len(relation_ids)
    n_objects = len(object_keys)
    n_entity_objects = len(entity_ids)
    n_literal_objects = n_objects - n_entity_objects
    n_other = n_updates - n_replacement - n_entity_insertion - n_oblivion - n_addinfo - n_addnewinfo

    s = """
    Triples = %s
    Subjects = %s
    Relations = %s
    Objects = %s
    Entity objects = %s
    Literal objects = %s
    ===============================================
    Updates = %s 
    Replacements = %s 
    EntityInsertion = %s 
    Oblivion = %s 
    AddNewInfo = %s 
    AddInfo = %s 
    Others = %s
    """ % (n_triples, n_subjects, n_relations, n_objects, n_entity_objects, n_literal_objects, n_updates, n_replacement, n_entity_insertion, n_oblivion,
        n_addnewinfo, n_addinfo, n_other)
    print(s)

In [None]:
# Summary statistics for the full dataset loaded from wikifactdiff.jsonl.
print_stats(wfd)

In [None]:
# Sample replacements: print up to MAX_SAMPLES records flagged 'is_replace' as
# (subject, relation, -forgotten object, +learnt object).
MAX_SAMPLES = 200
n_shown = 0
for update in wfd:
    if not update['is_replace']:
        continue
    # sorted() builds a new list, so the record's own 'objects' order in `wfd`
    # is left untouched for later cells. Ordering by the decision string puts
    # 'forget' before 'learn'.
    # NOTE(review): the unpacking assumes records flagged 'is_replace' hold
    # exactly two objects (one 'forget', one 'learn'); it raises otherwise.
    forget, learn = sorted(update['objects'], key=lambda obj: obj['decision'])
    print("(%s, %s, -%s, +%s)" % (update['subject']['label'], update['relation']['label'], forget['label'], learn['label']))
    n_shown += 1
    if n_shown >= MAX_SAMPLES:
        break

In [None]:
# Subset of the dataset restricted to records whose 'is_replace' flag is truthy.
wfd_repl = list(filter(lambda update: update['is_replace'], wfd))

In [None]:
# Summary statistics restricted to the records whose 'is_replace' flag is truthy.
print_stats(wfd_repl)