See annotation.readme for more information.

In [1]:
import pickle
import json
import re
import copy
import numpy as np
import spacy

In [2]:
with open('../data/1003_annotations.pickle', 'rb') as f:
    annotation = pickle.load(f)

In [3]:
for i, d in enumerate(annotation):
    annotation[i]['entry']['preprocess'] = re.sub('\r\n', '\n', d['entry']['preprocess'])
    if not annotation[i]['entry']['preprocess'].endswith('\n'):
        annotation[i]['entry']['preprocess'] += '\n'

# Fix errors

In [4]:
# Remove 'S2' from subjects of event[0] in dialog[20] turn[3]
print('Before:')
print(annotation[20]['with_token_anno'][3]['triples'][0]['subject'])
annotation[20]['with_token_anno'][3]['triples'][0]['subject'].pop(0)
print('After:')
print(annotation[20]['with_token_anno'][3]['triples'][0]['subject'])

Before:
[{'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '38'}, {'tokens': 'her', 'token_list': ['her'], 'group_num': '', 'tokens_id': '13'}, {'tokens': 'beauty under the tree', 'token_list': ['beauty', 'under', 'the', 'tree'], 'group_num': '', 'tokens_id': '22-25'}]
After:
[{'tokens': 'her', 'token_list': ['her'], 'group_num': '', 'tokens_id': '13'}, {'tokens': 'beauty under the tree', 'token_list': ['beauty', 'under', 'the', 'tree'], 'group_num': '', 'tokens_id': '22-25'}]


In [5]:
# Add 'S1' to the event in 3rd turn 
# Turn 2: Ingestion: ('S2',)|('12',) having with #1 (4,9#1) ('us', 'S1')|('10', '0') 
# Turn 3: Ingestion: ('S2',)|('12',) having with#1 (4,9#1) ('us',)|('10',) 
# Ignore other examples like this and deal with this problem in the clean_data.py: convert_dic
ex = annotation[22]['with_token_anno'][3]['triples'][1]['object']
print('Before:')
print(ex)
ex.append({'tokens': 'S1', 'token_list': ['S1'], 'group_num': '', 'tokens_id': '0'})
print('After:')
print(ex)

Before:
[{'token_list': ['us'], 'group_num': '', 'tokens': 'us', 'tokens_id': '10'}]
After:
[{'token_list': ['us'], 'group_num': '', 'tokens': 'us', 'tokens_id': '10'}, {'tokens': 'S1', 'token_list': ['S1'], 'group_num': '', 'tokens_id': '0'}]


In [6]:
# Remove ['travel from'] ['Canada'] which is duplicated with ['travel' 'from Canada']
ex = annotation[28]['with_token_anno'][2]['triples']
print('Before:')
print(ex)
ex.pop(1)
print('After:')
print(ex)

# Change framename from 'Motion' to 'Travel'
ex = annotation[28]['with_token_anno'][3]['triples'][1]
print('Before:')
print(ex)
ex['frame_name'] = ex['frame_name'].replace('Motion', 'Travel')
print('After:')
print(ex)

Before:
[{'eventType': 'explicit', 'polarity': 'pos', 'modality': 'actual', 'time': 'NOW', 'predicate': {'tokens': 'meet', 'token_list': ['meet'], 'group_num': '', 'tokens_id': '23'}, 'subject': [{'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '17'}], 'object': [{'tokens': 'you', 'token_list': ['you'], 'group_num': '', 'tokens_id': '24'}, {'tokens': 'S1', 'token_list': ['S1'], 'group_num': '', 'tokens_id': '0'}, {'tokens': 'Mr . Cooper', 'token_list': ['Mr', '.', 'Cooper'], 'group_num': '', 'tokens_id': '26-28'}], 'frame_name': 'Make_acquaintance', 'frame_candidates': ['Assemble', 'Meet_specifications', 'Make_acquaintance', 'Response', 'Meet_with_response', 'Locative_relation', 'Come_together']}, {'who': 'S2', 'eventType': 'implicit', 'subject': [{'token_list': ['You'], 'group_num': '', 'tokens': 'You', 'tokens_id': '41'}, {'token_list': ['S2'], 'group_num': '', 'tokens': 'S2', 'tokens_id': '17'}, {'token_list': ['Ms', '.', 'Wang'], 'group_num': '', 'tokens': 'Ms .

In [7]:
# Make "our company" --> "by our company" to match other entities
ex = annotation[34]['with_token_anno'][0]['triples'][2]['object'][0]
print('Before:')
print(ex)
ex['token_list'] = ['by'] + ex['token_list']
ex['tokens'] = 'by ' + ex['tokens']
ex['tokens_id'] = ex['tokens_id'].replace('3','2')
print('After:')
print(ex)

Before:
{'token_list': ['our', 'company'], 'group_num': '', 'tokens': 'our company', 'tokens_id': '13-14'}
After:
{'token_list': ['by', 'our', 'company'], 'group_num': '', 'tokens': 'by our company', 'tokens_id': '12-14'}


In [8]:
# Sync two events
ex = annotation[36]['with_token_anno'][1]['triples'][0]['object']
print(ex)
ex = annotation[36]['with_token_anno'][0]['triples'][0]['object']
print('Before:')
print(ex)
ex = annotation[36]['with_token_anno'][1]['triples'][0]['object']
print('After:')
print(ex)

[{'tokens': 'the position of an usher in your restaurant', 'token_list': ['the', 'position', 'of', 'an', 'usher', 'in', 'your', 'restaurant'], 'group_num': '', 'tokens_id': '9-16'}, {'tokens': 'S2(your)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '18(15)'}]
Before:
[{'tokens': 'the position of an usher in your restaurant', 'token_list': ['the', 'position', 'of', 'an', 'usher', 'in', 'your', 'restaurant'], 'group_num': '', 'tokens_id': '9-16'}]
After:
[{'tokens': 'the position of an usher in your restaurant', 'token_list': ['the', 'position', 'of', 'an', 'usher', 'in', 'your', 'restaurant'], 'group_num': '', 'tokens_id': '9-16'}, {'tokens': 'S2(your)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '18(15)'}]


In [9]:
# Sync two events
ex = annotation[40]['with_token_anno'][1]['triples'][0]['subject']
print(ex)
ex = annotation[40]['with_token_anno'][2]['triples'][0]['subject']
print('Before:')
print(ex)
ex = annotation[40]['with_token_anno'][1]['triples'][0]['subject']
print('After:')
print(ex)

[{'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '9'}, {'tokens': 'Peter', 'token_list': ['Peter'], 'group_num': '', 'tokens_id': '2'}]
Before:
[{'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '9'}]
After:
[{'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '9'}, {'tokens': 'Peter', 'token_list': ['Peter'], 'group_num': '', 'tokens_id': '2'}]


In [10]:
# Fix tokens_id:'57-59, 83' --> '57-59,83'
ex = annotation[84]['with_token_anno'][4]['triples'][0]['object'][0]
print('Before:')
print(ex)
ex['tokens_id'] = '57-59,83'
print('After:')
print(ex)

Before:
{'tokens': 'flight for the 22', 'token_list': ['flight', 'for', 'the', '22'], 'group_num': '', 'tokens_id': '57-59, 83'}
After:
{'tokens': 'flight for the 22', 'token_list': ['flight', 'for', 'the', '22'], 'group_num': '', 'tokens_id': '57-59,83'}


In [11]:
# remove "Hello , " to match other entities
ex = annotation[93]['with_token_anno']
print('Before:')
print(ex[1]['triples'][0]['object'][1])
for turn in ex:
    for triple in turn['triples']:
        for obj in triple['object']:
            if 'Hello , Mr . Smith' in obj['tokens']:
                obj['token_list'] = ['Mr', '.', 'Smith']
                obj['tokens'] = 'Mr . Smith'
                obj['tokens_id'] = '4-6'
print('After:')
print(ex[1]['triples'][0]['object'][1])

Before:
{'tokens': 'Hello , Mr . Smith', 'token_list': ['Hello', ',', 'Mr', '.', 'Smith'], 'group_num': '', 'tokens_id': '2-6'}
After:
{'tokens': 'Mr . Smith', 'token_list': ['Mr', '.', 'Smith'], 'group_num': '', 'tokens_id': '4-6'}


In [12]:
# Replace ['spend the weekend with us', 'S2(S1)'], tokens_id: ('6-10', '12(0)')
# as ['spend the weekend with us', 'S1(us)'], tokens_id: ('6-10', '0(10)')
ex = annotation[127]['with_token_anno']
new_obj = {
    'tokens': 'S2(us)',
    'token_list': ['S2'],
    'group_num': '',
    'tokens_id': '12(10)'
}
print('Before')
for i in range(1,4):
    print(ex[i]['triples'][0]['object'])
    ex[i]['triples'][0]['object'][1]['tokens'] = 'S1(us)'
    ex[i]['triples'][0]['object'][1]['token_list'] = 'S1'
    ex[i]['triples'][0]['object'][1]['tokens_id'] = '0(10)'
    
print('After')
for i in range(1,4):
    print(ex[i]['triples'][0]['object'])

Before
[{'tokens': 'spend the weekend with us', 'token_list': ['spend', 'the', 'weekend', 'with', 'us'], 'group_num': '', 'tokens_id': '6-10'}, {'tokens': 'S2(S1)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '12(0)'}]
[{'tokens': 'spend the weekend with us', 'token_list': ['spend', 'the', 'weekend', 'with', 'us'], 'group_num': '', 'tokens_id': '6-10'}, {'tokens': 'S2(S1)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '12(0)'}]
[{'tokens': 'spend the weekend with us', 'token_list': ['spend', 'the', 'weekend', 'with', 'us'], 'group_num': '', 'tokens_id': '6-10'}, {'tokens': 'S2(S1)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '12(0)'}]
After
[{'tokens': 'spend the weekend with us', 'token_list': ['spend', 'the', 'weekend', 'with', 'us'], 'group_num': '', 'tokens_id': '6-10'}, {'tokens': 'S1(us)', 'token_list': 'S1', 'group_num': '', 'tokens_id': '0(10)'}]
[{'tokens': 'spend the weekend with us', 'token_list': ['spend', 'the', 'weekend', 'with', 'us'], 'group_num'

In [13]:
# Remove space in tokens_id
# 14_3_2 {'tokens': 'try on', 'token_list': ['try', 'on'], 'group_num': '', 'tokens_id': '40, 42'}
# 94_3_3 {'tokens': 'convey to', 'token_list': ['convey', 'to'], 'group_num': '#3', 'tokens_id': '89, 92'}
# 94_4_3 {'tokens': 'convey to', 'token_list': ['convey', 'to'], 'group_num': '#3', 'tokens_id': '89, 92'}

# 213
# Predicate: {'token_list': ['been', 'with', 'the', 'company'], 'group_num': '', 'tokens': 'been with the company', 'tokens_id': '51-54'}
# make "been" the predicate and "with the company" the object

# 214
# Predicate: {'token_list': ['make', 'a', 'decision'], 'group_num': '', 'tokens': 'make a decision', 'tokens_id': '27-29'}
# make "make" the predicate and "a decision" the object

# 234
# Predicate: {'token_list': ['give', 'us'], 'group_num': '#1', 'tokens': 'give us#1', 'tokens_id': '38-39'}
# make "give" the predicate and "us" the object

# 284
# Predicate: {'token_list': ['dropped', 'it'], 'group_num': '#1', 'tokens': 'dropped it#1', 'tokens_id': '91-92'}
# "dropped" -> predicate; "it" -> object

# 285
# Predicate: {'token_list': ['helps', 'you'], 'group_num': '#1', 'tokens': 'helps you#1', 'tokens_id': '220-221'}
# "helps" -> predicate, "you" -> object

# 299
# Predicate: {'tokens': 'take a tour', 'token_list': ['take', 'a', 'tour'], 'group_num': '', 'tokens_id': '16-18'}
# "take" -> predicate, "a tour"-> object

# 366
# Predicate: {'tokens': 'be a member of', 'token_list': ['be', 'a', 'member', 'of'], 'group_num': '', 'tokens_id': '32-35'}

# 383
# Predicate: {'token_list': ['told', 'me', 'that'], 'group_num': '#1', 'tokens': 'told me that#1', 'tokens_id': '4-6'}
# Predicate: {'token_list': ['plays', 'the', 'field'], 'group_num': '', 'tokens': 'plays the field', 'tokens_id': '74-76'}

# 567
# Predicate: {'tokens': 'do exercise', 'token_list': ['do', 'exercise'], 'group_num': '', 'tokens_id': '86,88'}
# "do" -> predicate, "more exercise" -> object

# 573 Predicate: {'tokens': 'surf the Internet', 'token_list': ['surf', 'the', 'Internet'], 'group_num': '', 'tokens_id': '5-7'}


In [14]:
# Remove 'you(S1),44(46)' which is duplicated and in the wrong order which causes error
# Giving: ['me,38', 'S2,33'] leave #1 (40#1) ['the samples,41-42', 'you(S1),44(46)'] 
ex = annotation[214]['with_token_anno'][2]['triples'][2]
print('Before:')
print(ex['object'])
ex['object'].pop(1)
print('After:')
print(ex['object'])

ex = annotation[214]['with_token_anno'][3]['triples'][2]
print('Before:')
print(ex['object'])
ex['object'].pop(1)
print('After:')
print(ex['object'])

Before:
[{'tokens': 'the samples', 'token_list': ['the', 'samples'], 'group_num': '', 'tokens_id': '41-42'}, {'tokens': 'you(S1)', 'token_list': ['you'], 'group_num': '', 'tokens_id': '44(46)'}]
After:
[{'tokens': 'the samples', 'token_list': ['the', 'samples'], 'group_num': '', 'tokens_id': '41-42'}]
Before:
[{'tokens': 'the samples', 'token_list': ['the', 'samples'], 'group_num': '', 'tokens_id': '41-42'}, {'tokens': 'you(S1)', 'token_list': ['you'], 'group_num': '', 'tokens_id': '44(46)'}]
After:
[{'tokens': 'the samples', 'token_list': ['the', 'samples'], 'group_num': '', 'tokens_id': '41-42'}]


In [15]:
# Add missing token and fix the order of tokens_id
# [{'token_list': ['sure', 'you', 'can', 'do', 'it'], 'group_num': '', 'tokens': 'sure you can do it', 'tokens_id': '48-52'}, 
#  {'token_list': [''], 'group_num': '', 'tokens': '(S2)', 'tokens_id': '(26)49'}]
ex = annotation[219]['with_token_anno'][4]
print('Before:')
print(ex['triples'][1]['object'])
ex['triples'][1]['object'][1]['token_list'] = ['S2']
ex['triples'][1]['object'][1]['tokens'] = 'S2(you)'
ex['triples'][1]['object'][1]['tokens_id'] = '26(49)'
print('After:')
print(ex['triples'][1]['object'])

Before:
[{'token_list': ['sure', 'you', 'can', 'do', 'it'], 'group_num': '', 'tokens': 'sure you can do it', 'tokens_id': '48-52'}, {'token_list': [''], 'group_num': '', 'tokens': '(S2)', 'tokens_id': '(26)49'}]
After:
[{'token_list': ['sure', 'you', 'can', 'do', 'it'], 'group_num': '', 'tokens': 'sure you can do it', 'tokens_id': '48-52'}, {'token_list': ['S2'], 'group_num': '', 'tokens': 'S2(you)', 'tokens_id': '26(49)'}]


In [16]:
# Change framename from "Usefulness" to "Work"
for i in range(3,6):
    ex = annotation[374]['with_token_anno'][i]['triples'][0]
    print('Before')
    print(ex)
    ex['frame_name'] = 'Work'
    print('After')
    print(ex)
    print()

Before
{'who': 'Both', 'eventType': 'explicit', 'subject': [{'token_list': ['We'], 'group_num': '', 'tokens': 'We', 'tokens_id': '33'}, {'token_list': ['S2'], 'group_num': '', 'tokens': 'S2', 'tokens_id': '15'}, {'token_list': ['S1'], 'group_num': '', 'tokens': 'S1', 'tokens_id': '0'}], 'object': [{'token_list': ['like', 'bees', 'in', 'the', 'whole', 'working', 'days'], 'group_num': '', 'tokens': 'like bees in the whole working days', 'tokens_id': '35-41'}], 'predicate': {'token_list': ['worked'], 'group_num': '', 'tokens': 'worked', 'tokens_id': '34'}, 'time': 'BEFORE', 'polarity': 'pos', 'modality': 'actual', 'frame_candidates': ['Usefulness', 'Work', 'Being_operational', 'Being_employed', 'Working_a_post'], 'frame_name': 'Usefulness'}
After
{'who': 'Both', 'eventType': 'explicit', 'subject': [{'token_list': ['We'], 'group_num': '', 'tokens': 'We', 'tokens_id': '33'}, {'token_list': ['S2'], 'group_num': '', 'tokens': 'S2', 'tokens_id': '15'}, {'token_list': ['S1'], 'group_num': '', '

In [17]:
# your(S1) --> S1(your)
ex = annotation[408]['with_token_anno']
print('Before:')
print(ex[2]['triples'][-1]['object'])
print(ex[3]['triples'][-1]['object'])

ex[2]['triples'][-1]['object'][1]['tokens'] = 'S1(your)'
ex[2]['triples'][-1]['object'][1]['token_list'] = 'S1'
ex[2]['triples'][-1]['object'][1]['tokens_id'] = '58(55)'
ex[3]['triples'][-1]['object'][1]['tokens'] = 'S1(your)'
ex[3]['triples'][-1]['object'][1]['token_list'] = 'S1'
ex[3]['triples'][-1]['object'][1]['tokens_id'] = '58(55)'

print('After:')
print(ex[2]['triples'][-1]['object'])
print(ex[3]['triples'][-1]['object'])

Before:
[{'tokens': 'your friends', 'token_list': ['your', 'friends'], 'group_num': '', 'tokens_id': '55-56'}, {'tokens': 'your(S1)', 'token_list': ['your'], 'group_num': '', 'tokens_id': '(58)55'}]
[{'tokens': 'your friends', 'token_list': ['your', 'friends'], 'group_num': '', 'tokens_id': '55-56'}, {'tokens': 'your(S1)', 'token_list': ['your'], 'group_num': '', 'tokens_id': '(58)55'}]
After:
[{'tokens': 'your friends', 'token_list': ['your', 'friends'], 'group_num': '', 'tokens_id': '55-56'}, {'tokens': 'S1(your)', 'token_list': 'S1', 'group_num': '', 'tokens_id': '58(55)'}]
[{'tokens': 'your friends', 'token_list': ['your', 'friends'], 'group_num': '', 'tokens_id': '55-56'}, {'tokens': 'S1(your)', 'token_list': 'S1', 'group_num': '', 'tokens_id': '58(55)'}]


In [18]:
# remove "have" from the object
# {'token_list': ['have', 'dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'have dinner Saturday night', 'tokens_id': '6,11-13'}
for i in range(4):
    ex = annotation[413]['with_token_anno'][i]['triples'][3]['object'][0]
    print('Before:')
    print(ex)
    ex['token_list'] = ['dinner', 'Saturday', 'night']
    ex['tokens'] = 'dinner Saturday night'
    ex['tokens_id'] = '11-13'
    print('After:')
    print(ex)

for i in range(1, 4):
    ex = annotation[413]['with_token_anno'][i]['triples'][5]['object'][0]
    print('Before:')
    print(ex)
    ex['token_list'] = ['dinner', 'Saturday', 'night']
    ex['tokens'] = 'dinner Saturday night'
    ex['tokens_id'] = '11-13'
    print('After:')
    print(ex)


Before:
{'token_list': ['have', 'dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'have dinner Saturday night', 'tokens_id': '6,11-13'}
After:
{'token_list': ['dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'dinner Saturday night', 'tokens_id': '11-13'}
Before:
{'token_list': ['have', 'dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'have dinner Saturday night', 'tokens_id': '6,11-13'}
After:
{'token_list': ['dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'dinner Saturday night', 'tokens_id': '11-13'}
Before:
{'token_list': ['have', 'dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'have dinner Saturday night', 'tokens_id': '6,11-13'}
After:
{'token_list': ['dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'dinner Saturday night', 'tokens_id': '11-13'}
Before:
{'token_list': ['have', 'dinner', 'Saturday', 'night'], 'group_num': '', 'tokens': 'have dinner Saturday night', 'tokens_id': '6,11-13'}
After:
{'token_list': ['dinner', 

In [19]:
# remove S2(my) from ['my wife ’ s birthday party,12-17', 'Frank(my),23(12)', 'S2(my),19(12)']
ex = annotation[424]['with_token_anno']
print('Before:')
print(ex[1]['triples'][2]['object'])
print(ex[2]['triples'][2]['object'])
print(ex[3]['triples'][2]['object'])
for turn in ex[1:]:
    turn['triples'][2]['object'].pop(2)
print('After:')
print(ex[1]['triples'][2]['object'])
print(ex[2]['triples'][2]['object'])
print(ex[3]['triples'][2]['object'])

Before:
[{'tokens': 'my wife ’ s birthday party', 'token_list': ['my', 'wife', '’', 's', 'birthday', 'party'], 'group_num': '', 'tokens_id': '12-17'}, {'tokens': 'Frank(my)', 'token_list': ['Frank'], 'group_num': '', 'tokens_id': '23(12)'}, {'tokens': 'S2(my)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '19(12)'}]
[{'tokens': 'my wife ’ s birthday party', 'token_list': ['my', 'wife', '’', 's', 'birthday', 'party'], 'group_num': '', 'tokens_id': '12-17'}, {'tokens': 'Frank(my)', 'token_list': ['Frank'], 'group_num': '', 'tokens_id': '23(12)'}, {'tokens': 'S2(my)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '19(12)'}]
[{'tokens': 'my wife ’ s birthday party', 'token_list': ['my', 'wife', '’', 's', 'birthday', 'party'], 'group_num': '', 'tokens_id': '12-17'}, {'tokens': 'Frank(my)', 'token_list': ['Frank'], 'group_num': '', 'tokens_id': '23(12)'}, {'tokens': 'S2(my)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '19(12)'}]
After:
[{'tokens': 'my wife ’ s birthday 

In [20]:
# Remove 'S2' from Request: ('S1', 'you')|('0', '19') request to#1 () ('check the engine', 'It', 'S2')|('4-6', '10', '14')
ex = annotation[520]['with_token_anno'][1]['triples'][1]['object']
print(len(ex))
ex.pop(-1)
print(annotation[520]['with_token_anno'][1]['triples'][1]['object'])

3
[{'token_list': ['check', 'the', 'engine'], 'group_num': '', 'tokens': 'check the engine', 'tokens_id': '4-6'}, {'token_list': ['It'], 'group_num': '', 'tokens': 'It', 'tokens_id': '10'}]


In [21]:
ex = annotation[531]['with_token_anno'][2]['triples'][0]['object'][0]
print('Before:')
print(ex)
ex['tokens_id'] = ex['tokens_id'].replace(',23', '')
ex['tokens'] = ex['tokens'].replace(' and', '')
ex['token_list'].pop(1)
print('After:')
print(annotation[531]['with_token_anno'][2]['triples'][0]['object'][0])

Before:
{'tokens': '5 and as close as possible to the stage', 'token_list': ['5', 'and', 'as', 'close', 'as', 'possible', 'to', 'the', 'stage'], 'group_num': '', 'tokens_id': '21,23,27-33'}
After:
{'tokens': '5 as close as possible to the stage', 'token_list': ['5', 'as', 'close', 'as', 'possible', 'to', 'the', 'stage'], 'group_num': '', 'tokens_id': '21,27-33'}


In [22]:
print('Before:')
print(annotation[590]['with_token_anno'][3]['triples'][3]['object'][1])
tokens = "We built a snowman , some snow dogs and one big snow fort . We went sliding on the nice hill . It was a real work out . At noon , we had the most special winter picnic outside"
annotation[590]['with_token_anno'][3]['triples'][3]['object'][1]['tokens'] = tokens
annotation[590]['with_token_anno'][3]['triples'][3]['object'][1]['token_list'] = tokens.split(' ')
annotation[590]['with_token_anno'][3]['triples'][3]['object'][1]['tokens_id'] = '99-138'

annotation[590]['with_token_anno'][4]['triples'][3]['object'][1]['tokens'] = tokens
annotation[590]['with_token_anno'][4]['triples'][3]['object'][1]['token_list'] = tokens.split(' ')
annotation[590]['with_token_anno'][4]['triples'][3]['object'][1]['tokens_id'] = '99-138'

print('After:')
print(annotation[590]['with_token_anno'][3]['triples'][3]['object'][1])

Before:
{'tokens': 'We had a lot of fun there . There was so much joy and walk and breathe the winter air . We arrived at the park at 9 o ’ clock in the morning and didn ’ t leave till 3 o ’ clock in the afternoon We built a snowman , some snow dogs and one big snow fort . We went sliding on the nice hill . It was a real work out . At noon , we had the most special winter picnic outside', 'token_list': ['We', 'had', 'a', 'lot', 'of', 'fun', 'there', '.', 'There', 'was', 'so', 'much', 'joy', 'and', 'walk', 'and', 'breathe', 'the', 'winter', 'air', '.', 'We', 'arrived', 'at', 'the', 'park', 'at', '9', 'o', '’', 'clock', 'in', 'the', 'morning', 'and', 'didn', '’', 't', 'leave', 'till', '3', 'o', '’', 'clock', 'in', 'the', 'afternoon', 'We', 'built', 'a', 'snowman', ',', 'some', 'snow', 'dogs', 'and', 'one', 'big', 'snow', 'fort', '.', 'We', 'went', 'sliding', 'on', 'the', 'nice', 'hill', '.', 'It', 'was', 'a', 'real', 'work', 'out', '.', 'At', 'noon', ',', 'we', 'had', 'the', 'most', 'spe

In [23]:
# Add 'S2' to ['us', 'S1']
ex = annotation[769]['with_token_anno'][0]['triples'][0]['object']
print('Before:')
print(ex)
ex.append({'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '15'})
print('After:')
print(ex)

Before:
[{'token_list': ['us'], 'group_num': '', 'tokens': 'us', 'tokens_id': '13'}, {'token_list': ['S1'], 'group_num': '', 'tokens': 'S1', 'tokens_id': '0'}]
After:
[{'token_list': ['us'], 'group_num': '', 'tokens': 'us', 'tokens_id': '13'}, {'token_list': ['S1'], 'group_num': '', 'tokens': 'S1', 'tokens_id': '0'}, {'tokens': 'S2', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '15'}]


In [24]:
# 1. Remove ['my card,44-45', 'S2(my),47(44)'] from ['mine,54', 'my card,44-45', 'S2(my),47(44)']
#     from triples[0]
# 2. Change framename: "Assemble" to "Make_acquaintance" from triples[4] to match previous results
# 3. Remove ['my card,44-45', 'S2(my),47(44)'] from ['mine,54', 'my card,44-45', 'S2(my),47(44)']
#     from triples[8]
ex = annotation[771]['with_token_anno'][3]['triples'][0]['object']
print('Before:')
print(ex)
ex.pop(2)
ex.pop(1)
print('After:')
print(ex)

ex = annotation[771]['with_token_anno'][3]['triples'][4]
print('\nBefore:')
print(ex)
ex['frame_name'] = 'Make_acquaintance'
print('After:')
print(ex)

ex = annotation[771]['with_token_anno'][3]['triples'][8]['object']
print('\nBefore:')
print(ex)
ex.pop(2)
ex.pop(1)
print('After:')
print(ex)

Before:
[{'tokens': 'mine', 'token_list': ['mine'], 'group_num': '', 'tokens_id': '54'}, {'tokens': 'my card', 'token_list': ['my', 'card'], 'group_num': '', 'tokens_id': '44-45'}, {'tokens': 'S2(my)', 'token_list': ['S2'], 'group_num': '', 'tokens_id': '47(44)'}]
After:
[{'tokens': 'mine', 'token_list': ['mine'], 'group_num': '', 'tokens_id': '54'}]

Before:
{'who': 'Both', 'eventType': 'implicit', 'subject': [{'token_list': ['S2'], 'group_num': '', 'tokens': 'S2', 'tokens_id': '20'}], 'object': [{'token_list': ['you'], 'group_num': '', 'tokens': 'you', 'tokens_id': '33'}, {'token_list': ['S1'], 'group_num': '', 'tokens': 'S1', 'tokens_id': '0'}], 'predicate': {'tokens': 'meet', 'tokens_id': '', 'token_list': ['meet'], 'group_num': ''}, 'time': 'NOW', 'polarity': 'pos', 'modality': 'actual', 'frame_candidates': ['Assemble', 'Meet_specifications', 'Make_acquaintance', 'Response', 'Meet_with_response', 'Locative_relation', 'Come_together'], 'frame_name': 'Assemble'}
After:
{'who': 'Both

In [25]:
# Split the single entity as coreferences
ex = annotation[832]['with_token_anno'][3]['triples'][2]['object']
print('Before:')
print(ex)
ex[0]['tokens'] = 'enough'
ex[0]['token_list'] = ['enough']
ex[0]['tokens_id'] = '67'
ex.append({
    'tokens': 'cash',
    'token_list': ['cash'],
    'group_num': '',
    'tokens_id': '56',
})
print('After:')
print(ex)

Before:
[{'tokens': 'enough cash', 'token_list': ['enough', 'cash'], 'group_num': '', 'tokens_id': '67,56'}]
After:
[{'tokens': 'enough', 'token_list': ['enough'], 'group_num': '', 'tokens_id': '67'}, {'tokens': 'cash', 'token_list': ['cash'], 'group_num': '', 'tokens_id': '56'}]


In [26]:
# Split the single entity as coreferences
# Remove 'course' from 'philosophy course'
for i in range(1, 4):
    ex = annotation[914]['with_token_anno'][i]['triples'][0]['object'][1]
    print('Before:')
    print(ex)
    ex['tokens'] = 'philosophy'
    ex['token_list'] = ['philosophy']
    ex['tokens_id'] = '8'
    print('After:')
    print(ex)

Before:
{'tokens': 'philosophy course', 'token_list': ['philosophy', 'course'], 'group_num': '', 'tokens_id': '8,25'}
After:
{'tokens': 'philosophy', 'token_list': ['philosophy'], 'group_num': '', 'tokens_id': '8'}
Before:
{'tokens': 'philosophy course', 'token_list': ['philosophy', 'course'], 'group_num': '', 'tokens_id': '8,25'}
After:
{'tokens': 'philosophy', 'token_list': ['philosophy'], 'group_num': '', 'tokens_id': '8'}
Before:
{'tokens': 'philosophy course', 'token_list': ['philosophy', 'course'], 'group_num': '', 'tokens_id': '8,25'}
After:
{'tokens': 'philosophy', 'token_list': ['philosophy'], 'group_num': '', 'tokens_id': '8'}


In [27]:
# replace "drink" with "eat" in "drink soup"
print('Before:')
exs = [annotation[920]['with_token_anno'][i]['triples'][1]['predicate'] for i in range(2,len(annotation[920]['with_token_anno']))]
print(exs)
for ex in exs:
    ex['tokens'] = 'eat'
    ex['token_list'] = ['eat']
print('After:')
exs = [annotation[920]['with_token_anno'][i]['triples'][1]['predicate'] for i in range(2,len(annotation[920]['with_token_anno']))]
print(exs)

Before:
[{'tokens': 'drink', 'token_list': ['drink'], 'group_num': '', 'tokens_id': ''}, {'tokens': 'drink', 'tokens_id': '', 'token_list': ['drink'], 'group_num': ''}]
After:
[{'tokens': 'eat', 'token_list': ['eat'], 'group_num': '', 'tokens_id': ''}, {'tokens': 'eat', 'tokens_id': '', 'token_list': ['eat'], 'group_num': ''}]


# Split objects/subjects

In [28]:
"""
# Fix Annotation Error:
# Split objects (and subjects) to solve Case 3 (3)I.
# There are 25 cases in objects and 2 cases in subjects.
# Fortunately, there is no case where this happens in both subjects and objects.
# objects = [{X}, {Y}, {A(B)}]
    -- Case 1: dicts in the same object list --> X and Y are COREF
    -- Case 2: the dict in the same object list that contains "A(B)" means
                B is be a part of tokens in Y and B is the COREF of A.
                Note (1): there might be some error cases when A/B are exchanged. (A is a part of Y)
                Note (2): {A(B)} might have nothing to do with X.
                Note (3): there might be {A(B)} and {C(D)}, where B and D might in X or/and Y.
    -- Case 3: There are three forms of X's "tokens_id": 
                (1) "num"; (2) "num1-num2"; (3) "num3,num4,num5"
               --> (3) means 
                   I. there are multiple objects for the same (subject, predicate), 
                       e.g. ("S1", "S2"), or ("5", "as close as possible").
                   II. all tokens represent a meaning as all, 
                       e.g. ("flight for the", "22"), ("the", "pie").
               Problem: "tokens_id" is separated by ",", 
                   but the separation could not be identified in "tokens" and "token_list".
               Problem: We want to keep case II, but split case I. 
                    --> manually check..., save the II idx to "exceptions"

                The case of multiple objects is usually presented by different events 
                (sometimes with the same group id, but not always).
                E.g. Evt1 = {'object': ['tokens_id': num3]}; Evt2 = {'object': ['tokens_id': num4]}
"""
fix_exs = []
exceptions = [84, 388, 413, 500, 782, 832, 914]
for idx, d in enumerate(annotation):
    if idx in exceptions:
        continue
    for tid, turn in enumerate(d['with_token_anno']):
        for trid, triple in enumerate(turn['triples']):
            for eid, entity in enumerate(triple['subject']):
                if ',' in entity['tokens_id']:
                    fix_exs.append((idx, tid, trid, 'subject', eid))
            for eid, entity in enumerate(triple['object']):
                if ',' in entity['tokens_id']:
                    fix_exs.append((idx, tid, trid, 'object', eid))
print(len(fix_exs))
print(fix_exs)

for (idx, tid, trid, target, eid) in fix_exs:
    triple = annotation[idx]['with_token_anno'][tid]['triples'][trid]
    dialog = annotation[idx]['entry']['preprocess'].split()

    tokens_id_lst = triple[target][eid]['tokens_id'].split(',')
    for tokens_id in tokens_id_lst:
        tokens_id = tokens_id.strip()
        new_triple = copy.deepcopy(triple)
        new_triple[target][eid]['tokens_id'] = tokens_id
        if '-' in tokens_id:
            soi, eoi = tokens_id.split('-') # if tokens_id is "num1-num2"
            token_list = dialog[int(soi):int(eoi)+1]
            new_triple[target][eid]['token_list'] = token_list
            new_triple[target][eid]['tokens'] = ' '.join(token_list)
        elif not re.findall('\D', tokens_id): # if tokens_id is a single number
            new_triple[target][eid]['token_list'] = dialog[int(tokens_id)]
            new_triple[target][eid]['tokens'] = dialog[int(tokens_id)]
        else:
            print('Error')
            print(idx, tid, trid, eid, target)
            print(tokens_id_lst)
        annotation[idx]['with_token_anno'][tid]['triples'].append(new_triple)
    annotation[idx]['with_token_anno'][tid]['triples'].pop(trid)

27
[(21, 2, 0, 'subject', 0), (21, 3, 0, 'subject', 0), (505, 0, 0, 'object', 0), (505, 1, 0, 'object', 0), (505, 2, 0, 'object', 0), (505, 2, 1, 'object', 0), (505, 2, 3, 'object', 0), (505, 3, 0, 'object', 0), (505, 3, 1, 'object', 0), (505, 3, 3, 'object', 0), (531, 2, 0, 'object', 0), (531, 3, 1, 'object', 0), (531, 4, 2, 'object', 0), (586, 2, 1, 'object', 0), (586, 3, 1, 'object', 0), (586, 4, 1, 'object', 0), (596, 0, 0, 'object', 0), (596, 1, 0, 'object', 0), (596, 2, 0, 'object', 0), (596, 3, 0, 'object', 0), (596, 4, 0, 'object', 0), (709, 3, 2, 'object', 0), (710, 0, 0, 'object', 0), (710, 1, 0, 'object', 0), (710, 1, 1, 'object', 0), (710, 2, 0, 'object', 0), (710, 2, 1, 'object', 0)]


# Adjust fields

## Get DailyDialog Split

In [29]:
with open('../data/dailydialog_id2split.json') as f:
    dailydialog_id2split = json.load(f) # {string: string}, e.g. {'0': train}
id_map = json.load(open('../data/id_map.json')) # {string: int}, e.g. {'0': 0}

In [30]:
def get_split_index(split_dic, daily_idx, counter_dic):
    split = split_dic[str(daily_idx)]
    counter_dic[split] += 1
    return split, counter_dic

In [31]:
all_data = []
cnt = {'train': 0, 'valid': 0, 'test': 0}
for d in annotation:
    new_annotation = {}
    new_annotation['source_id'] = d['entry']['source_id']
    
    daily_idx = id_map[d['entry']['source_id']]
    split, cnt = get_split_index(dailydialog_id2split, daily_idx, cnt)
    new_annotation['DailyDialog_id'] = daily_idx
    new_annotation['split'] = split
    new_annotation['dialog_id'] = cnt[split]
    
    new_annotation['dialogue'] = d['entry']['preprocess']
    
    dialog_events = []
    remove_events = []
    for turn in d['with_token_anno']:
        if turn['checkEvent'] != 'hasEvent':
            for event_idx, event in enumerate(turn['triples']):
                remove_events.append(event)
            dialog_events.append([])
            continue
            
        new_evts_per_turn = []        
        for event in turn['triples']:
            if event in remove_events:
                continue
                
            event_dict = {
                'participants': {},
                'event_status': {},
                'event_info': {},
            }
            
            event_dict['participants']['predicate'] = event['predicate']
            event_dict['participants']['subjects'] = event['subject']
            event_dict['participants']['objects'] = event['object']

            event_dict['event_status']['polarity'] = 1 if event['polarity'] == 'pos' else 0
            event_dict['event_status']['modality'] = 1 if event['modality'] == 'actual' else 0
            event_dict['event_status']['time'] = event['time']
            event_dict['event_status']['who'] = event.get('who', None)
            
            event_dict['event_info']['explicit'] = 1 if event['eventType'] == 'explicit' else 0
            event_dict['event_info']['frame_name'] = event['frame_name']
            
            new_evts_per_turn.append(event_dict)
        dialog_events.append(new_evts_per_turn)
    assert len(dialog_events) == len(d['with_token_anno']), f"len(dialog_events) {len(dialog_events)} should be {len(d['with_token_anno'])}"
    new_annotation['events'] = dialog_events
    all_data.append(new_annotation)

# Coreferences

## Notes:

Fields of original subject/object:
```
{
        "tokens": Str,
        "token_list": List, # list of tokens
        "group_num": Str,
        "tokens_id": Str
}
```

Formats of "tokens_id":
 - "14-15"
 - "17"
 - "60(32)" # replace 32 with 60
 - "6-7(31)" # replace 31 with 6-7
 - "21(4-5)" # replace 4-5 with 21
 - "52,54" # not a continuous span, e.g. 782-2-0,832-3-2
 - "57-59, 83" # not a continuous span, e.g. 84-4-0
 
 

## Remove "S1" coreference to "S2"
Both "S1" and "S2" should not be in the same subjects/objects because the entities in the subjects/objects list means they are coreferences. Therefore I remove the "S1", "S2" from such case.

In [32]:
for anno_id, d in enumerate(all_data):
    for turn_id, turn in enumerate(d['events']):
        for event_id, event in enumerate(turn):
            tokens = [p['tokens'] for p in event['participants']['subjects']]
            if 'S1' in tokens and 'S2' in tokens:
                event['participants']['subjects'] = [p for p in event['participants']['subjects'] if p['tokens'] not in ['S1', 'S2']]
            
            tokens = [p['tokens'] for p in event['participants']['objects']]
            if 'S1' in tokens and 'S2' in tokens:
                event['participants']['objects'] = [p for p in event['participants']['objects'] if p['tokens'] not in ['S1', 'S2']]

## Remove group annotations and make sure there is no extra empty token in each participant in order to get the correct coreference results

In [33]:
def check_format(participant, ex_id):
    tokens = participant['tokens'].strip()
    token_list = [token for token in participant['token_list'] if token]
    tokens_id = participant['tokens_id'].strip()
    if len(tokens) != len(participant['tokens']) or len(token_list) != len(participant['token_list']) \
     or len(tokens_id) != len(participant['tokens_id']):
#         print(ex_id)
#         print('before:', participant)
        participant.update({
            "tokens": tokens,
            "token_list": token_list,
            "tokens_id": tokens_id,
        })
#         print('after :', participant)
#         print()
    return participant

In [34]:
for anno_idx, d in enumerate(all_data):
    for turn_idx, turn in enumerate(d['events']):
        for event_idx, event in enumerate(turn):
            # Remove group annotations
            event['participants']['predicate']['tokens'] = re.sub('#[0-9]','', event['participants']['predicate']['tokens'])
            event['participants']['predicate']['tokens_id'] = re.sub('#[0-9]','', event['participants']['predicate']['tokens_id'])
            # check the format
            event['participants']['predicate'] = check_format(event['participants']['predicate'], f"{anno_idx}_{turn_idx}_{event_idx}")
            
            for pid, p in enumerate(event['participants']['subjects']):
                p['tokens'] = re.sub('#[0-9]','', p['tokens'])
                p['tokens_id'] = re.sub('#[0-9]','', p['tokens_id'])
            # check the format
            event['participants']['subjects'] = [check_format(sbj, f"{anno_idx}_{turn_idx}_{event_idx}") for sbj in event['participants']['subjects']]

            for p in event['participants']['objects']:
                p['tokens'] = re.sub('#[0-9]','', p['tokens'])
                p['tokens_id'] = re.sub('#[0-9]','', p['tokens_id'])
            # check the format
            event['participants']['objects'] = [check_format(obj, f"{anno_idx}_{turn_idx}_{event_idx}") for obj in event['participants']['objects']]


## Get coreference clusters

In [35]:
def add_coref_cluster(entities, coreferences, visit):
    if not any(entities):
        return coreferences, visit
    # Force 'S1' to be in the 1st cluster and 'S2' the 2nd
    # check if any subject in subjects is already in the cluster
    coref_id = None
    for entity in entities:
        if entity['tokens'] == 'S1':
            coref_id = 0
            if entity['tokens_id'] not in visit:
                visit[entity['tokens_id']] = coref_id
                coreferences[coref_id].append(entity['tokens_id'])
            break
        elif entity['tokens'] == 'S2':
            coref_id = 1
            if entity['tokens_id'] not in visit:
                visit[entity['tokens_id']] = coref_id
                coreferences[coref_id].append(entity['tokens_id'])
            break
        if entity['tokens_id'] in visit:
            coref_id = visit[entity['tokens_id']]
            break
            
    # add the subject_id to the cluster
    for entity in entities:
        tokens_id = entity['tokens_id']
        if '(' in tokens_id:
            add_to_coref_clusters = False
            mention_id, replace_id = tokens_id.rstrip(')').split('(')
            mention_token, replace_token = entity['tokens'].rstrip(')').split('(')
            if mention_id in visit and replace_id not in visit:
                coreferences[visit[mention_id]].append(replace_id)
                visit[replace_id] = visit[mention_id]
            elif replace_id in visit and mention_id not in visit:
                coreferences[visit[replace_id]].append(mention_id)
                visit[mention_id] = visit[replace_id]
            elif mention_token == 'S1' or replace_token == 'S1':
                coref_id = 0
                if mention_id not in visit:
                    visit[mention_id] = coref_id
                    coreferences[coref_id].append(mention_id)
                if replace_id not in visit:
                    visit[replace_id] = coref_id
                    coreferences[coref_id].append(replace_id)
            elif mention_token == 'S2' or replace_token == 'S2':
                coref_id = 1
                if mention_id not in visit:
                    visit[mention_id] = coref_id
                    coreferences[coref_id].append(mention_id)
                if replace_id not in visit:
                    visit[replace_id] = coref_id
                    coreferences[coref_id].append(replace_id)
            else:
                coref_id = len(coreferences)
                visit[mention_id] = coref_id
                visit[replace_id] = coref_id
                coreferences.append([mention_id, replace_id])
        elif coref_id is None and len(tokens_id) > 0:
            coref_id = len(coreferences)
            visit[tokens_id] = coref_id
            coreferences.append([tokens_id])
        elif coref_id is not None and tokens_id not in visit:
            coreferences[coref_id].append(tokens_id)
            visit[tokens_id] = coref_id
    return coreferences, visit

In [36]:
def add_id2token(participant, id2token):
    for p in participant:
        if '(' in p['tokens_id']:
            id1, id2 = p['tokens_id'].rstrip(')').split('(')
            tok1, tok2 = p['tokens'].rstrip(')').split('(')
            if id1 not in id2token:
                id2token[id1] = tok1
            if id2 not in id2token:
                id2token[id2] = tok2
        elif p['tokens_id'] not in id2token:
            id2token[p['tokens_id']] = p['tokens']
    return id2token

In [37]:
for anno_id, d in enumerate(all_data):
    d['coreferences'] = [[], []] # a list of list of clusters
    d['tokensid2corefid'] = {} # tokens_id: coref_id (Str:Int)
    d['tokensid2tokens'] = {}
    for turn_id, turn in enumerate(d['events']):
        for event_id, event in enumerate(turn):
            d['coreferences'], d['tokensid2corefid'] = add_coref_cluster(event['participants']['subjects'], d['coreferences'], d['tokensid2corefid'])
            d['coreferences'], d['tokensid2corefid'] = add_coref_cluster(event['participants']['objects'], d['coreferences'], d['tokensid2corefid'])
            
            d['tokensid2tokens'] = add_id2token(event['participants']['subjects'], d['tokensid2tokens'])
            d['tokensid2tokens'] = add_id2token(event['participants']['objects'], d['tokensid2tokens'])
            
    dialogue_tokens = d['dialogue'].replace('\n', ' ').split(' ')
    d['coreferences_tokens'] = []
    for cid, cluster in enumerate(d['coreferences']):
        d['coreferences_tokens'].append([])
        for tokens_id in cluster:
            d['coreferences_tokens'][cid].append(d['tokensid2tokens'][tokens_id])

## Replace entity list (subjects/objects) to a single entity (subject/object)
The goal is to identify the same event regardless of status changes.

Procedure:
1. Select one representative entity from the entity list.
    Criterion:
    - The token_id is the nearest to the predicate's tokens_id
    - In the case of implicit event where there is no predicate tokens_id, we select the token_id closest to the start token id of that turn.
    - Contains "S1" or "S2"

2. Replace the entity list (subjects/objects) to the dictionary of the selected entity, with additional "entity_id" field.
```
{
    "entity_id": Int, # which is also the index of cluster in the "coreferences" field
    "tokens": Str,
    "token_list": List,
    "group_num": Str,
    "tokens_id": Str
}
```
{
        "tokens": Str,
        "token_list": List, # list of tokens
        "group_num": Str,
        "tokens_id": Str
}

In [38]:
omit_lst = ['at', 'in', 'from', 'on', 'has', 'is', 'was', 'up', 'with', 'for', 'about', 'out', 'to', 'of', 'as', 'by', 'go']

def get_span(participant):
    if not participant['tokens_id'] or '(' in participant['tokens_id']:
        return None, None
    
    start_idx, end_idx = None, None
    
    if ',' in tokens_id:
        try:
            id1, id2 = tokens_id.split(',')
            if '-' in id1:
                sub_id1, sub_id2 = get_span({'tokens_id': id1, 'token_list': participant['token_list'][:-1]})
                return min(sub_id1, int(id2)), max(sub_id2, int(id2))
            elif '-' in id2:
                sub_id1, sub_id2 = get_span({'tokens_id': id2, 'token_list': participant['token_list'][1:]})
                return min(sub_id1, int(id1)), max(sub_id2, int(id1))
            token1, token2 = participant['token_list']
            if token1 in omit_lst:
                return get_span({'tokens_id': id2, 'token_list': token2})
            elif token2 in omit_lst:
                return get_span({'tokens_id': id1, 'token_list': token1})
            else:
                return int(id1), int(id2)
        except:
            import ipdb;ipdb.set_trace()
    else:
        arg_ids = tokens_id.split('-')
        try:
            if len(arg_ids) == 1:
                start_idx, end_idx = int(arg_ids[0]), int(arg_ids[0])
            elif len(arg_ids) == 2:
                start_idx, end_idx = int(arg_ids[0]), int(arg_ids[1])
            return start_idx, end_idx
        except:
            import ipdb;ipdb.set_trace()
    if not start_idx:
        import ipdb;ipdb.set_trace()
    return start_idx, end_idx

def select_representative_entity(entity_type, entity_lst, verb, turn_start, turn_end):
    if len(entity_lst) == 1:
        return entity_lst[0]
    
    verb_start, verb_end = get_span(verb)
    
    min_distance = 1000
    selected_entity = None
    
    for entity in entity_lst:
        tokens_id = entity['tokens_id']
        if '(' in tokens_id:
            continue
        elif ',' in tokens_id:
            id1, id2 = tokens_id.split(',')
            try:
                max_id = max(int(id1), int(id2))
            except:
                try:
                    max_id = max(int(id1.strip().split('-')[-1]), int(id2.strip().split('-')[-1]))
                except:
                    import ipdb;ipdb.set_trace()
            
            if max_id > turn_end:
                continue
            
            if verb_start:
                # explicit event
                if entity_type == 'sbj':
                    distance = abs(verb_start - max_id)
                elif entity_type == 'obj':
                    distance = abs(max_id - verb_end)
            else:
                # implicit event
                if entity_type == 'sbj':
                    distance = abs(max_id - turn_start)
                elif entity_type == 'obj':
                    distance = abs(turn_end - max_id)
        else:
            entity_start, entity_end = get_span(entity)
            if verb_start:
                if entity_type == 'sbj':
                    distance = abs(entity_end - verb_start)
                elif entity_type == 'obj':
                    distance = abs(entity_start - verb_end)
            else:
                # implicit event
                if entity_type == 'sbj':
                    distance = abs(entity_start - turn_start)
                elif entity_type == 'obj':
                    distance = abs(turn_end - entity_end)
                
        if distance < min_distance:
            min_distance = distance
            selected_entity = entity
    return selected_entity

In [39]:
for anno_id, d in enumerate(all_data):
    print('\r', anno_id, end='')
    sents = d['dialogue'].split('\n')
    sent_lens = [len(sent.split(' ')) for sent in sents]
    start_token_ids = np.cumsum([0] + sent_lens[:-1])
    
    for turn_id, turn in enumerate(d['events']):
        start_turn_token_id = start_token_ids[turn_id]
        end_turn_token_id = start_token_ids[turn_id+1] - 1
            
        for event_id, event in enumerate(turn):
            # Subject
            selected_subject = select_representative_entity('sbj', event['participants']['subjects'], event['participants']['predicate'], start_turn_token_id, end_turn_token_id)
            if not selected_subject:
                print(anno_id, turn_id, )
                print(event['participants']['subjects'], event['participants']['predicate'], start_turn_token_id, end_turn_token_id)
            event['participants']['subject'] = {
                "entity_id": d['tokensid2corefid'][selected_subject['tokens_id']],
            }
            event['participants']['subject'].update(selected_subject)
            
            # Object
            event['participants']['object'] = {}
            if event['participants']['objects'][0]['tokens_id']:
                selected_object = select_representative_entity('obj', event['participants']['objects'], event['participants']['predicate'], start_turn_token_id, end_turn_token_id)
                event['participants']['object']['entity_id'] = d['tokensid2corefid'][selected_object['tokens_id']]
                event['participants']['object'].update(selected_object)

 1002

## Obtain predicate class from its lemma

In [40]:
import spacy
spacy_model = spacy.load("en_core_web_sm")

In [41]:
def remove_preposition(predicate):
    remove_postfix = ['to', 'for', 'with', 'up', 'at', 'from', 'about', 'out', 'by', 'in', 'of', 'as', 'by']
    remove_postfix = ' ' + '| '.join(remove_postfix)
    return re.sub(f'({remove_postfix})$', '', predicate.strip())

In [42]:
for anno_id, d in enumerate(all_data):
    for turn_id, turn in enumerate(d['events']):
        for event_id, event in enumerate(turn):            
            verb_tokens = remove_preposition(event['participants']['predicate']['tokens'])
            verb_class = ' '.join([token.lemma_ for token in spacy_model(verb_tokens)])
            event['event_info']['predicate_class'] = verb_class

# Save new annotation file

In [43]:
with open('../data/annotations.pickle', 'wb') as f:
#     pickle.dump(annotation, f)
    pickle.dump(all_data, f)

In [44]:
print(all_data[0].keys())

dict_keys(['source_id', 'DailyDialog_id', 'split', 'dialog_id', 'dialogue', 'events', 'coreferences', 'tokensid2corefid', 'tokensid2tokens', 'coreferences_tokens'])


In [45]:
with open('../data/annotation_train.jsonl', 'w') as train_f, open('../data/annotation_valid.jsonl', 'w') as valid_f,  open('../data/annotation_test.jsonl', 'w') as test_f:
    for anno_id, d in enumerate(all_data):
        for turn_id, turn in enumerate(d['events']):
            for event_id, event in enumerate(turn):
                del event['participants']['subjects']
                del event['participants']['objects']
        del d['source_id']
        del d['tokensid2corefid']
        del d['tokensid2tokens']
        if d['split'] == 'train':
            train_f.write(json.dumps(d) + '\n')
        elif d['split'] == 'valid':
            valid_f.write(json.dumps(d) + '\n')
        elif d['split'] == 'test':
            test_f.write(json.dumps(d) + '\n')