In [1]:
import os

# Remember the directory the kernel started in. The chdir below is relative,
# so without this anchor re-running the cell would climb two more levels each
# time and the import that follows would stop resolving.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..', '..', 'evaluate'))

from evaluate.dsets.wikifactdiff import WikiFactDiffDataset

# Load the dataset with balance=False (presumably the full, un-resampled
# data -- confirm against WikiFactDiffDataset).
wfd = WikiFactDiffDataset(balance=False)

  from .autonotebook import tqdm as notebook_tqdm


Loaded dataset with 327688 elements


In [4]:
from collections import Counter

def is_addnewinfo(x):
    """Tell whether update ``x`` contains new objects and nothing else.

    The update qualifies when its subject is not flagged by
    'subject_is_ph_new' and its objects include at least one 'new' decision
    but no 'obsolete' and no 'stable' decision. ``print_stats`` reports this
    category as "AddRelation".

    Args:
        x: update dict with a 'subject_is_ph_new' flag and an 'objects'
            iterable whose items each carry a 'decision' key.

    Returns:
        bool: True when the update matches the description above.
    """
    # Updates about a new subject are counted separately, never here.
    if x['subject_is_ph_new']:
        return False
    tally = Counter(obj['decision'] for obj in x['objects'])
    if tally['new'] <= 0:
        return False
    return tally['obsolete'] == 0 and tally['stable'] == 0

def is_replace(x):
    """Tell whether update ``x`` swaps exactly one obsolete object for a new one.

    Args:
        x: update dict with an 'objects' iterable whose items each carry a
            'decision' key.

    Returns:
        bool: True when there are exactly two objects, one decided 'new' and
        the other decided 'obsolete'.
    """
    tally = Counter(obj['decision'] for obj in x['objects'])
    # The sum of all counts is the number of objects in the update.
    if sum(tally.values()) != 2:
        return False
    return tally['new'] == 1 and tally['obsolete'] == 1

def is_oblivion(x):
    """Tell whether update ``x`` consists of a single obsolete object.

    ``print_stats`` reports this category as "Archive".

    Args:
        x: update dict with an 'objects' iterable whose items each carry a
            'decision' key.

    Returns:
        bool: True when the update has exactly one object and that object is
        decided 'obsolete'.
    """
    tally = Counter(obj['decision'] for obj in x['objects'])
    n_objects = sum(tally.values())
    return n_objects == 1 and tally['obsolete'] == 1

def print_stats(dataset):
    """Print summary statistics for a collection of updates.

    The report lists the number of triples, distinct subjects, relations and
    objects (split into entity objects, i.e. those with a truthy 'id', and
    literal objects), then a per-category breakdown of the updates, and
    finally the 20 most frequent relation ids.

    Args:
        dataset: sized collection of update dicts. It is traversed several
            times, so it must be re-iterable (e.g. a list). Each update is
            read through its 'subject', 'relation', 'objects' and
            'subject_is_ph_new' keys.

    Returns:
        None. Everything is written to standard output.
    """

    def object_key(obj, force_id=False):
        # Identify an object by its 'id'; fall back to its label when there
        # is no id, unless the caller insists on ids (then None comes back).
        ident = obj.get('id', None)
        if ident is None and not force_id:
            return obj['label']
        return ident

    def is_addinfo(update):
        # New objects added next to stable ones, with nothing made obsolete.
        tally = Counter(obj['decision'] for obj in update['objects'])
        return tally['new'] > 0 and tally['obsolete'] == 0 and tally['stable'] > 0

    n_triples = sum(len(update['objects']) for update in dataset)
    n_subjects = len({update['subject']['id'] for update in dataset})
    n_relations = len({update['relation']['id'] for update in dataset})
    relation_counter = Counter(update['relation']['id'] for update in dataset)

    distinct_objects = {
        object_key(obj) for update in dataset for obj in update['objects']
    }
    n_objects = len(distinct_objects)

    # Entity objects are the ones that carry a truthy id.
    entity_ids = set()
    for update in dataset:
        for obj in update['objects']:
            ident = object_key(obj, force_id=True)
            if ident:
                entity_ids.add(ident)
    n_entity_objects = len(entity_ids)
    n_literal_objects = n_objects - n_entity_objects

    n_updates = len(dataset)

    n_replacement = sum(map(is_replace, dataset))
    n_entity_insertion = sum(update['subject_is_ph_new'] for update in dataset)
    n_oblivion = sum(map(is_oblivion, dataset))
    n_addnewinfo = sum(map(is_addnewinfo, dataset))
    n_addinfo = sum(map(is_addinfo, dataset))

    # Whatever fits none of the named categories ends up in "Others".
    n_categorised = (
        n_replacement + n_entity_insertion + n_oblivion + n_addinfo + n_addnewinfo
    )
    n_other = n_updates - n_categorised

    template = """
Triples = %s
Subjects = %s
Relations = %s
Objects = %s
Entity objects = %s
Literal objects = %s
===============================================
Updates = %s 
ReplaceObject = %s 
Archive = %s
AddObject = %s
AddRelation = %s 
AddEntity = %s 
Others = %s
    """
    values = (
        n_triples,
        n_subjects,
        n_relations,
        n_objects,
        n_entity_objects,
        n_literal_objects,
        n_updates,
        n_replacement,
        n_oblivion,
        n_addinfo,
        n_addnewinfo,
        n_entity_insertion,
        n_other,
    )
    print(template % values)
    print()
    print('Top relations')
    print(relation_counter.most_common(20))

In [3]:
# Summary statistics for the dataset loaded with balance=False.
print_stats(wfd.data)


    Triples = 454365
    Subjects = 139811
    Relations = 675
    Objects = 111632
    Entity objects = 76015
    Literal objects = 35617
    Updates = 327688 
    ReplaceObject = 32875 
    Archive = 2798
    AddObject = 1533
    AddRelation = 155105 
    AddEntity = 132857 
    Others = 2520
    

Top relations
[('P1082', 57725), ('P1540', 34715), ('P1539', 34714), ('P570', 20205), ('P31', 15483), ('P39', 6496), ('P17', 6295), ('P577', 5808), ('P585', 5693), ('P1831', 5614), ('P361', 4718), ('P1476', 4566), ('P136', 4505), ('P155', 4266), ('P54', 4181), ('P495', 4064), ('P156', 4034), ('P641', 3443), ('P580', 3090), ('P1344', 3008)]


In [10]:
# Second view of the dataset, built with balance=True and functional_only=True
# (presumably a balanced subset restricted to functional relations -- confirm
# against WikiFactDiffDataset).
wfd_repl = WikiFactDiffDataset(balance=True, functional_only=True)

Keep only replace updates.
Undersample updates on the "population" relation by a factor of 14
Loaded dataset with 10373 elements


In [11]:
# Same summary statistics, this time for the balanced, functional-only dataset.
print_stats(wfd_repl.data)


Triples = 20746
Subjects = 9791
Relations = 157
Objects = 12403
Entity objects = 5578
Literal objects = 6825
Updates = 10373 
ReplaceObject = 10373 
Archive = 0
AddObject = 0
AddRelation = 0 
AddEntity = 0 
Others = 0
    

Top relations
[('P1082', 1689), ('P1087', 1419), ('P54', 1405), ('P3872', 1046), ('P131', 694), ('P39', 599), ('P286', 473), ('P6', 389), ('P1448', 233), ('P488', 217), ('P118', 184), ('P1128', 157), ('P8477', 130), ('P1308', 128), ('P8476', 121), ('P2139', 104), ('P102', 94), ('P31', 88), ('P6087', 81), ('P35', 62)]


In [12]:
# Sample replacements: print the first MAX_SAMPLES updates flagged 'is_replace'
# as (subject, relation, -forgotten object, +learned object).
MAX_SAMPLES = 200

count = 0
for x in wfd.data:
    if not x['is_replace']:
        continue
    # Order a *copy* of the objects so the shared dataset is left untouched.
    # Sorting by decision puts 'new' before 'obsolete' (alphabetical order),
    # hence the (learn, forget) unpacking.
    learn, forget = sorted(x['objects'], key=lambda y: y['decision'])
    print("(%s, %s, -%s, +%s)" % (x['subject']['label'], x['relation']['label'], forget['label'], learn['label']))
    count += 1
    # Stop once exactly MAX_SAMPLES lines have been printed.
    if count >= MAX_SAMPLES:
        break

(United States of America, head of government, -Donald Trump, +Joe Biden)
(Cristiano Ronaldo, league, -Premier League, +Saudi Professional League)
(India, head of state, -Ram Nath Kovind, +Droupadi Murmu)
(United Kingdom, head of government, -Boris Johnson, +Rishi Sunak)
(United Kingdom, head of state, -Elizabeth II, +Charles III)
(Lionel Messi, head coach, -Mauricio Pochettino, +Christophe Galtier)
(Manchester United F.C., head coach, -Ole Gunnar Solskjær, +Erik ten Hag)
(Meta Platforms, official name, -Facebook, Inc., +Meta Platforms, Inc.)
(Japan, age of majority, -20, +18)
(Japan, head of government, -Yoshihide Suga, +Fumio Kishida)
(New York City, head of government, -Bill de Blasio, +Eric Adams)
(Amazon, chief executive officer, -Jeff Bezos, +Andy Jassy)
(Canada, head of state, -Elizabeth II, +Charles III)
(Australia, head of government, -Scott Morrison, +Anthony Albanese)
(Dua Lipa, unmarried partner, -Anwar Hadid, +Jack Harlow)
(Maryland, head of government, -Larry Hogan, +Wes 