In [6]:
import importlib
import pandas as pd
import pickle
import json

from utils import *
import delphin.codecs.eds

# import networkx as nx
# import EnhancedEDS
# importlib.reload(find_eds_by_ids_df)
# importlib.reload(find_semlink_by_ids_df)
# importlib.reload(find_tree_by_ids_df)
from pprint import pprint

In [10]:
sentences = pd.read_csv('sentences.csv')
semlink_map = pd.read_csv('sl_mappings.csv')
trees = pd.read_csv('trees.csv')

In [15]:
sentences.head()

Unnamed: 0,id,sentence,constituency,eds,dependency
0,20001001,"Pierre Vinken, 61 years old, will join the boa...","(S\n (NP\n (N (N (NP (N (N ""pierre""))) (N (N ...",{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound...,{\n proper_q<0:28> BV named<7:14>(Vinken) \n...
1,20001002,"Mr. Vinken is chairman of Elsevier N.V., the D...","(S (NP (N (N (N ""mr."")) (N (N ""Vinken""))))\n (...",{e3:\n _1:proper_q<0:10>[BV x6]\n e10:compound...,{\n proper_q<0:10> BV named<4:10>(Vinken) \n...
2,20003001,A form of asbestos once used to make Kent ciga...,"(S\n (S\n (NP (DET ""a"")\n (N (N (N (N ""form...","{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...",{\n focus_d<0:190> ARG1 _report_v_to<181:190>...
3,20003002,"The asbestos fiber, crocidolite, is unusually ...","(S\n (S\n (S\n (S\n (NP (NP (DET ""the"") ...","{e3:\n e5:focus_d<0:176>{e SF prop, TENSE unte...",{\n focus_d<0:176> ARG1 _say_v_to<171:176> \...
4,20003003,"Lorillard Inc., the unit of New York-based Loe...","(S\n (NP (NP (N (N (N ""Lorillard"")) (N (N ""inc...","{e3:\n e7:appos<0:82>{e SF prop, TENSE untense...",{\n appos<0:82> ARG1 named<0:9>(Lorillard) \...


In [14]:
semlink_map.head()

Unnamed: 0,id,token_id,is_gold,vb_form,vn_class_index,fn_frame,pb_sense,not_sure_0,not_sure_1,augmentations
0,20001001,8,gold,join-v,22.1-2-1,Cause_to_amalgamate,join.01,,-----,"['0:2-ARG0=Agent;Agent', '7:0-ARGM-MOD', '8:0-..."
1,20002001,16,gold,name-v,29.3,IN,name.01,,-----,"['0:2*17:1-ARG1=Theme', '16:0-rel', '18:2-ARG2..."
2,20003001,9,gold,make-v,26.1-1,Manufacturing,make.01,,-----,"['7:1-ARG0=Agent;Manufacturer', '9:0-rel', '10..."
3,20003001,14,gold,cause-v,27,Causation,cause.01,1.0,-----,"['0:3-ARG0=Cause;Cause', '14:0-rel', '15:2-ARG..."
4,20003001,37,gold,report-v,37.7-1,Statement,report.01,,-----,"['36:1-ARG0=Agent;Speaker', '37:0-rel', '38:1-..."


In [13]:
trees = pd.read_csv('trees.csv')
trees.head()

Unnamed: 0,id,tree
0,20001001,(TOP (S (NP-SBJ (NP (NNP Pierre)\n ...
1,20001002,(TOP (S (NP-SBJ (NNP Mr.)\n (NN...
2,20001003,
3,20002001,(TOP (S (NP-SBJ-1 (NP (NNP Rudolph)\n ...
4,20002002,


In [158]:
def search_eds_nodes_by_id(nodes, idd):
    '''
    Look up a node by its identifier.

    input: iterable of objects exposing an ``id`` attribute, identifier to find
    return: the first node whose ``id`` equals ``idd``, or None if there is none
    '''
    matching = (candidate for candidate in nodes if candidate.id == idd)
    return next(matching, None)
def string_of_list_to_list(string_of_list):
    '''
    Turn the textual form of a list of strings (e.g. "['a', 'b']") back into a list.

    input: str
    return: [str]; an empty list for "[]" or a blank string (never [''])
    '''
    import ast  # local import so this cell does not depend on the import cell

    text = string_of_list.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        # A real Python literal: keeps elements that themselves contain ', '.
        return [str(item) for item in parsed]

    # Fallback for text that is not a valid literal: the legacy quote-stripping
    # split, minus the spurious '' element an empty list used to produce.
    stripped = text.strip('[]').replace('\'', '').replace('\"', '')
    return [item for item in stripped.split(', ') if item]

In [37]:
def enhance_eds(eds, semlinks):
    '''
    Annotate an EDS in place with FrameNet information taken from semlinks.

    Nodes and semlinks are walked in parallel. A semlink is consumed by the
    first remaining node whose predicate contains its verb form (with '-'
    turned into '_'); that node's predicate gets a '-fn.<frame>' suffix and
    its edge labels get a '-fn.<role>' suffix when an augmentation carries one.

    input: EDS, [dict]
    return: EDS, unmapped semlinks
    '''
    node_pos = 0
    sl_pos = 0

    while node_pos < len(eds.nodes) and sl_pos < len(semlinks):
        sl = semlinks[sl_pos]
        node = eds.nodes[node_pos]

        verb_key = sl['vb_form'].replace('-', '_')
        if verb_key not in node.predicate:
            # not the verb we are looking for; try the next node
            node_pos += 1
            continue

        # enhance node (only once, so re-running leaves the predicate alone)
        if '-fn.' not in node.predicate:
            node.predicate = node.predicate + '-fn.' + sl['fn_frame']

        # enhance edges
        # arg is 0 indexed in semlink but 1 indexed in EDS
        for label in list(node.edges):
            sl_arg = arg_number_decrease_by_one(label)
            # augmentation string to list
            raw = sl['augmentations'].strip('[]')
            aug_items = raw.replace('\'', '').replace('\"', '').split(', ')
            for aug in aug_items:
                if sl_arg in aug and '-fn.' not in label and ';' in aug:
                    # fn info is whatever follows the last ';'
                    renamed = label + '-fn.' + aug.split(';')[-1]
                    node.edges[renamed] = node.edges.pop(label)
                    # label is gone now, later augmentations cannot apply
                    break

        sl_pos += 1
        node_pos += 1

    return eds, semlinks[sl_pos:]

In [136]:
def process_augmentations(list_of_augmentations):
    '''return arg0, arg1... and frameNet roles if any

    input: [str], each entry shaped like
           '<token span>-<pb role>[=<vn role>[;<fn role>]]'
    return: (dict, bool)
        dict maps each pb role that has a vn role to its fn role, or to ''
        when the entry has no fn role; bool is True when a pb role that was
        already recorded shows up again together with an fn role.
    Entries without a '-' separator (e.g. '' from an empty list) are skipped.
    '''
    extracted_augs = {}
    has_redundant_pb_role = False

    for aug in list_of_augmentations:
        # remove token
        _token_span, separator, rest = aug.partition('-')
        if not separator:
            # malformed or empty entry: nothing to extract
            continue

        # best approximation of existence of pb role, vn role
        if 'ARG' not in rest or '=' not in rest:
            # augmentation doesn't concern pb role, vn role
            continue

        # extract pb_role
        pb_role, role_info = rest.split('=', 1)

        # best approximation of existence of fn role
        if ';' not in role_info:
            # there is no fn role; do not overwrite better information
            extracted_augs.setdefault(pb_role, '')
            continue

        fn_role = role_info.split(';')[1]
        if pb_role in extracted_augs:
            # redundant pb role, possibly due to erroneous annotation
            has_redundant_pb_role = True
            # only replace a placeholder, never an fn role found earlier
            if extracted_augs[pb_role] == '':
                extracted_augs[pb_role] = fn_role
        else:
            extracted_augs[pb_role] = fn_role

    return extracted_augs, has_redundant_pb_role

In [165]:
def find_node_ids_edge_targets(eds, semlinks):
    '''
    Locate, for each semlink, the EDS verb node it annotates and the edges
    that carry a PropBank role listed in the semlink's augmentations.

    one eds can have multiple semlinks
    one semlink can augment multiple edges

    input: EDS, [dict]
    return: (node_ids, fn_frames, edge_targets, fn_roles, counter_redundant_pb)
        The four lists are parallel, one entry per matched semlink:
        node id, fn frame, [edge target ids], [fn role per target].
        counter_redundant_pb is the number of matched semlinks whose
        augmentations repeat a pb role (counted once per semlink).

    A semlink is searched from the node after the previous match; when the end
    of the node list is reached without a match the semlink is dropped and the
    next one is searched from the first node again.
    NOTE(review): a verb node that sits before the previous match is therefore
    never found for the current semlink - confirm this is intended.
    '''
    node_index = 0
    semlink_index = 0
    counter_redundant_pb = 0

    node_ids = []
    fn_frames = []
    edge_targets = []
    fn_roles = []

    while semlink_index < len(semlinks):
        # cannot find corresponding verb in eds for this semlink
        if node_index == len(eds.nodes):
            # restart search from next semlink, from first node
            semlink_index += 1
            node_index = 0
            continue

        cur_sl = semlinks[semlink_index]
        cur_node = eds.nodes[node_index]
        node_index += 1

        # predicate must match semlink vb form
        if cur_sl['vb_form'].replace('-', '_') not in cur_node.predicate:
            continue

        # Parse the augmentations once per matched semlink; they do not depend
        # on the edge being inspected, and counting inside the edge loop would
        # inflate the redundancy counter by the node's number of edges.
        augmentations, has_redundant_pb_role = process_augmentations(
            string_of_list_to_list(cur_sl['augmentations'])
        )
        if has_redundant_pb_role:
            counter_redundant_pb += 1

        cur_verb_edge_targets = []
        cur_verb_edge_fn_roles = []
        for label, target in cur_node.edges.items():
            # arg is 0 indexed in semlink but 1 indexed in EDS
            sl_role = arg_number_decrease_by_one(label)
            if sl_role in augmentations:
                cur_verb_edge_targets.append(target)
                cur_verb_edge_fn_roles.append(augmentations[sl_role])

        node_ids.append(cur_node.id)
        fn_frames.append(cur_sl['fn_frame'])
        edge_targets.append(cur_verb_edge_targets)
        fn_roles.append(cur_verb_edge_fn_roles)
        semlink_index += 1

    return node_ids, fn_frames, edge_targets, fn_roles, counter_redundant_pb

In [199]:
def _eds_to_networkx_batch(edses):
    nxes = []
    for eds in edses:

        G = nx.DiGraph()
        for node in eds.nodes:
            G.add_node(node.id, label = node.predicate)
            for e, t in node.edges.items():
                G.add_edge(e, t)

        nxes.append(G)
    return nxes

In [76]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open('cleaned_data.pkl', 'rb') as file:
    cleaned_data = pickle.load(file)
with open('error_record.pkl', 'rb') as file:
    error_record = pickle.load(file)

In [170]:
example = cleaned_data['000003000']
example['eds'].nodes[0].properties

{'SF': 'prop',
 'TENSE': 'untensed',
 'MOOD': 'indicative',
 'PROG': '-',
 'PERF': '-'}

In [33]:
c = 0
for k, v in cleaned_data.items():
    c += 1
    print(k)
    print(v)
    if c == 5:
        break

20001001
{'sentence': 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.', 'eds': <EDS object (proper_q compound proper_q named named measure udef_q card _year_n_1 _old_a_1 _join_v_1 _the_q _board_n_of _as_p _a_q _nonexecutive/jj_u_unknown _director_n_of loc_nonsp mofy def_explicit_q of_p def_implicit_q dofm) at 140425965903280>, 'semlink': [{'id': 20001001, 'token_id': 8, 'vb_form': 'join-v', 'fn_frame': 'Cause_to_amalgamate', 'augmentations': "['0:2-ARG0=Agent;Agent', '7:0-ARGM-MOD', '8:0-rel', '9:1-ARG1=Patient;Part_1', '11:1-ARGM-PRD', '15:1-ARGM-TMP']"}], 'tree': Tree('TOP', [Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NML', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN',

In [166]:
find_node_ids_edge_targets(example['eds'], example['semlink'])

(['e23', 'e4', 'e3'],
 ['Manufacturing', 'Causation', 'Statement'],
 [['x24'], ['x8', 'x42'], ['x88', 'e4']],
 [['Product'], ['Cause', 'Effect'], ['Speaker', 'Message']],
 0)

In [87]:
list(cleaned_data.values())[4]['semlink'][0]

{'id': 20003004,
 'token_id': 4,
 'vb_form': 'report-v',
 'fn_frame': 'Statement',
 'augmentations': "['1:1*5:1-ARG1', '4:0-rel', '6:3-ARGM-TMP']"}

In [62]:
# Requires `all_df`, the outer merge of semlink_map (left) with sentences (right)
# built with indicator=True in another cell of this notebook.
# 'left_only'  -> ids that have a semlink row but no sentence row.
# 'right_only' -> ids that have a sentence row but no semlink row.
eds_missing = list(all_df[all_df['_merge'] == 'left_only'].drop_duplicates(subset=['id'])['id'])
semlink_missing = list(all_df[all_df['_merge'] == 'right_only'].drop_duplicates(subset=['id'])['id'])

# filtered_ids = [x for x in list(sentences['id']) if not x in semlink_missing]

# keep only the sentences that have at least one semlink record
filtered_sentences = sentences[~sentences.id.isin(semlink_missing)]

In [64]:
# `id` is read from the CSV as an integer column, so compare against an int;
# comparing with the string '20114002' never matches and always gives an empty frame.
filtered_sentences[filtered_sentences['id'] == 20114002]

Unnamed: 0,id,sentence,constituency,eds,dependency


0

In [81]:
k

20001001

In [79]:
[x['semlink'] for x in cleaned_data.values()][0]

[{'id': 20001001,
  'token_id': 8,
  'vb_form': 'join-v',
  'fn_frame': 'Cause_to_amalgamate',
  'augmentations': "['0:2-ARG0=Agent;Agent', '7:0-ARGM-MOD', '8:0-rel', '9:1-ARG1=Patient;Part_1', '11:1-ARGM-PRD', '15:1-ARGM-TMP']"}]

In [51]:
# Outer join on `id`; indicator=True adds a `_merge` column telling whether a row
# came from semlink_map only ('left_only'), sentences only ('right_only') or 'both'.
all_df = semlink_map.merge(sentences, on=['id'], how='outer', indicator=True)
# ids of sentences that have no semlink record at all
semlink_missing = list(all_df[all_df['_merge'] == 'right_only'].drop_duplicates(subset=['id'])['id'])

In [58]:
sentences[~sentences.id.isin(semlink_missing)]

Unnamed: 0,id,sentence,constituency,eds,dependency
0,20001001,"Pierre Vinken, 61 years old, will join the boa...","(S\n (NP\n (N (N (NP (N (N ""pierre""))) (N (N ...",{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound...,{\n proper_q<0:28> BV named<7:14>(Vinken) \n...
2,20003001,A form of asbestos once used to make Kent ciga...,"(S\n (S\n (NP (DET ""a"")\n (N (N (N (N ""form...","{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...",{\n focus_d<0:190> ARG1 _report_v_to<181:190>...
3,20003002,"The asbestos fiber, crocidolite, is unusually ...","(S\n (S\n (S\n (S\n (NP (NP (DET ""the"") ...","{e3:\n e5:focus_d<0:176>{e SF prop, TENSE unte...",{\n focus_d<0:176> ARG1 _say_v_to<171:176> \...
4,20003003,"Lorillard Inc., the unit of New York-based Loe...","(S\n (NP (NP (N (N (N ""Lorillard"")) (N (N ""inc...","{e3:\n e7:appos<0:82>{e SF prop, TENSE untense...",{\n appos<0:82> ARG1 named<0:9>(Lorillard) \...
5,20003004,Although preliminary findings were reported mo...,"(S\n (PP (P ""although"")\n (S (NP (N (AP ""prel...","{e4:\n e4:_although_x<0:8>{e SF prop, TENSE un...",{\n _although_x<0:8> ARG1 _appear_v_1<85:91> ...
...,...,...,...,...,...
38582,22172056,But if the board rejects a reduced bid and dec...,"(S\n (S (CONJ ""but"")\n (S\n (PP (P ""if"")\n ...","{e3:\n e3:_but_c<0:3>{e SF prop, TENSE untense...",{\n _but_c<0:3> R-HNDL _if_x_then<4:6> \n _...
38583,22172057,The pilots could play hardball by noting they ...,"(S (NP (DET ""the"") (N (N (N ""pilots""))))\n (VP...",{e3:\n _1:_the_q<0:3>[BV x6]\n x6:_pilot_n_of<...,{\n _the_q<0:3> BV _pilot_n_of<4:10> \n _co...
38584,22172058,"If they were to insist on a low bid of, say $2...","(S\n (PP (P ""if"")\n (S (NP (NP ""they""))\n (...","{e4:\n e4:_if_x_then<0:2>{e SF prop, TENSE unt...",{\n _if_x_then<0:2> ARG1 _because_x<129:136> ...
38585,22172059,"Also, because UAL Chairman Stephen Wolf and ot...","(S (ADV (ADV ""also,""))\n (S\n (PP (P ""because...","{e5:\n e5:_also_a_1<0:5>{e SF prop, TENSE unte...",{\n _also_a_1<0:5> ARG1 _because_x<6:13> \n ...


In [52]:
semlink_missing

[20001002,
 20003007,
 20004004,
 20004013,
 20004016,
 20005002,
 20009002,
 20009003,
 20009004,
 20010002,
 20010003,
 20010012,
 20010013,
 20010016,
 20010020,
 20011008,
 20012015,
 20012016,
 20013004,
 20013011,
 20013016,
 20015009,
 20015011,
 20015014,
 20015022,
 20016003,
 20018013,
 20018019,
 20018025,
 20018027,
 20018033,
 20018034,
 20019003,
 20019005,
 20019007,
 20019009,
 20020012,
 20020021,
 20021003,
 20022003,
 20022004,
 20022007,
 20022009,
 20022012,
 20024002,
 20025003,
 20027003,
 20027005,
 20027007,
 20027008,
 20027012,
 20029004,
 20029005,
 20029006,
 20034005,
 20034006,
 20034009,
 20034012,
 20034014,
 20034018,
 20034021,
 20034022,
 20034034,
 20034036,
 20035008,
 20035009,
 20036003,
 20036009,
 20036015,
 20036017,
 20036030,
 20036038,
 20036050,
 20036056,
 20037006,
 20037011,
 20037015,
 20037020,
 20037022,
 20037030,
 20037035,
 20037037,
 20037061,
 20037062,
 20037063,
 20037064,
 20037065,
 20037066,
 20037067,
 20037068,
 20037070,

In [3]:
gnn_data = pd.read_csv('gnn_data.csv')
gnn_data.head()

Unnamed: 0,id,sentence,eds,semlink,target_node,fn_frame,edge_targets,fn_roles
0,1000,"Pierre Vinken, 61 years old, will join the boa...",{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound...,"{'section_id': 0, 'doc_id': 1, 'sentence_id': ...",e3,Cause_to_amalgamate,"['x6', 'x23']","['Agent', 'Part_1']"
1,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e23,Manufacturing,['x24'],['Product']
2,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e4,Causation,"['x8', 'x42']","['Cause', 'Effect']"
3,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e3,Statement,"['x88', 'e4']","['Speaker', 'Message']"
4,3001,"The asbestos fiber, crocidolite, is unusually ...","{e3:\n e5:focus_d<0:176>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e61,Causation,"['x49', 'x62']","['Cause', 'Effect']"


In [5]:
gnn_data[:100]

Unnamed: 0,id,sentence,eds,semlink,target_node,fn_frame,edge_targets,fn_roles
0,1000,"Pierre Vinken, 61 years old, will join the boa...",{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound...,"{'section_id': 0, 'doc_id': 1, 'sentence_id': ...",e3,Cause_to_amalgamate,"['x6', 'x23']","['Agent', 'Part_1']"
1,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e23,Manufacturing,['x24'],['Product']
2,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e4,Causation,"['x8', 'x42']","['Cause', 'Effect']"
3,3000,A form of asbestos once used to make Kent ciga...,"{e3:\n e5:focus_d<0:190>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e3,Statement,"['x88', 'e4']","['Speaker', 'Message']"
4,3001,"The asbestos fiber, crocidolite, is unusually ...","{e3:\n e5:focus_d<0:176>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 3, 'sentence_id': ...",e61,Causation,"['x49', 'x62']","['Cause', 'Effect']"
...,...,...,...,...,...,...,...,...
95,8001,"Until Congress acts, the government hasn","{e3:\n e5:focus_d<0:118>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 8, 'sentence_id': ...",e34,NF,['x35'],['']
96,8001,"Until Congress acts, the government hasn","{e3:\n e5:focus_d<0:118>{e SF prop, TENSE unte...","{'section_id': 0, 'doc_id': 8, 'sentence_id': ...",e3,IN,"['x57', 'e7']","['', '']"
97,8002,The government,{e3:\n _1:_the_q<0:3>[BV x6]\n x6:_government_...,"{'section_id': 0, 'doc_id': 8, 'sentence_id': ...",e3,Change_position_on_a_scale,[],[]
98,8003,Legislation to lift the debt ceiling is ensnar...,{e3:\n _1:udef_q<0:36>[BV x6]\n x6:_legislatio...,"{'section_id': 0, 'doc_id': 8, 'sentence_id': ...",e11,NF,['x12'],['']


In [7]:
len(set(gnn_data['fn_frame'].values))

321

In [193]:
gnn_data.iloc[0]['eds']

'{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound<0:14>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x6, ARG2 x9]\n _2:proper_q<0:6>[BV x9]\n x9:named<0:6>("Pierre"){x PERS 3, NUM sg, IND +}[]\n x6:named<7:14>("Vinken"){x PERS 3, NUM sg, IND +}[]\n e17:measure<15:23>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e16, ARG2 x15]\n _3:udef_q<15:23>[BV x15]\n e22:card<15:17>("61"){e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x15]\n x15:_year_n_1<18:23>{x PERS 3, NUM pl, IND +}[]\n e16:_old_a_1<24:28>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x6]\n e3:_join_v_1<34:38>{e SF prop, TENSE fut, MOOD indicative, PROG -, PERF -}[ARG1 x6, ARG2 x23]\n _4:_the_q<39:42>[BV x23]\n x23:_board_n_of<43:48>{x PERS 3, NUM sg}[]\n e29:_as_p<49:51>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e3, ARG2 x30]\n _5:_a_q<52:53>[BV x30]\n e35:_nonexecutive/jj_u_unknown<54:66>{e SF prop, TENSE untensed, MOOD indicativ

In [183]:
example

{'sentence': 'A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago, researchers reported.',
 'eds': <EDS object (focus_d _a_q _form_n_of udef_q _asbestos_n_1 _once_a_1 _use_v_1 parg_d _make_v_1 udef_q compound proper_q named compound udef_q _cigarette_n_1 _filter_n_1 _cause_v_1 _a_q _high_a_1 _percentage_n_of udef_q compound udef_q _cancer_n_1 _death_n_1 _among_p _a_q _group_n_of udef_q _worker_n_1 _expose_v_to parg_d pron pronoun_q _more+than_a_1 udef_q card _year_n_1 _ago_p udef_q _researcher_n_of _report_v_to) at 140430852331536>,
 'semlink': [{'section_id': 0,
   'doc_id': 3,
   'sentence_id': 0,
   'token_id': 9,
   'is_gold': 'gold',
   'vb_form': 'make-v',
   'vn_class_index': '26.1-1',
   'fn_frame': 'Manufacturing',
   'pb_sense': 'make.01',
   'not_sure_0': nan,
   'not_sure_1': '-----',
   'augmentations': "['7:1-ARG0=Agent;Manufacturer', '9:0-rel', '10:1-ARG1=P

In [27]:
example_eds = delphin.codecs.eds.decode(gnn_data.iloc[0]['eds'])

In [30]:
example_eds.nodes[5].edges

{'ARG1': 'e16', 'ARG2': 'x15'}

In [269]:
test_eds_small = delphin.codecs.eds.loads('\n'.join(list(gnn_data['eds'].values[:500])))

In [4]:
gnn_data_small = gnn_data[:500]

In [5]:
gnn_data_small.to_csv('./gnn/data/raw/gnn_data_small.csv', index=False)

In [274]:
example_eds

<EDS object (proper_q compound proper_q named named measure udef_q card _year_n_1 _old_a_1 _join_v_1 _the_q _board_n_of _as_p _a_q _nonexecutive/jj_u_unknown _director_n_of loc_nonsp mofy def_explicit_q of_p def_implicit_q dofm) at 140429830218432>

# Verification

In [9]:
len(error_record['eds_failure'])

23

In [8]:
len(set(error_record['eds_missing']))

8581

In [108]:
len(cleaned_data)

38587

In [38]:
# c counts how many entries had both an EDS and semlinks and were enhanced
c = 0

for k, v in cleaned_data.items():
    if 'eds' in v and 'semlink' in v:
        c += 1
        # enhance_eds mutates v['eds'] in place and returns that same object, so
        # v['enhanced_eds'] and v['eds'] refer to one graph after this call.
        # left_over_semlinks is overwritten on every iteration and not stored.
        enhanced_eds, left_over_semlinks = enhance_eds(v['eds'], v['semlink'])
        v['enhanced_eds'] = enhanced_eds

In [39]:
# Map original-style ids to the text encoding of each enhanced EDS.
enhanced = {}
for k, v in cleaned_data.items():
    if 'enhanced_eds' in v:
        # Rebuild the original id from the local key: a leading '2', characters 1-2
        # of the key, then the last five digits plus one, zero padded to five
        # (e.g. '000006000' -> '20006001').
        original_key = '2' + k[1:3] + str(int(k[-5:]) + 1).zfill(5)
        enhanced[original_key] = delphin.codecs.eds.encode(v['enhanced_eds'])

In [69]:
with open('projected(1).json', 'r') as f:
    yours = json.load(f)

In [90]:
# Partition the ids of the two projections (`enhanced` = this notebook,
# `yours` = the loaded JSON) into: only here, only there, and shared.
i_but_not_you = []
you_but_not_i = []
we_both = []
# ids produced here that the other projection lacks
for k, v in enhanced.items():
    if k not in yours:
        i_but_not_you.append(k)

# ids from the other projection: missing here vs. present in both
for k, v in yours.items():
    if k not in enhanced:
        you_but_not_i.append(k)
    else:
        we_both.append(k)

In [91]:
len(i_but_not_you)

7451

In [35]:
cleaned_data

{'000001000': {'sentence': 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
  'eds': <EDS object (proper_q compound proper_q named named measure udef_q card _year_n_1 _old_a_1 _join_v_1 _the_q _board_n_of _as_p _a_q _nonexecutive/jj_u_unknown _director_n_of loc_nonsp mofy def_explicit_q of_p def_implicit_q dofm) at 140432530341952>,
  'semlink': [{'section_id': 0,
    'doc_id': 1,
    'sentence_id': 0,
    'token_id': 8,
    'is_gold': 'gold',
    'vb_form': 'join-v',
    'vn_class_index': '22.1-2-1',
    'fn_frame': 'Cause_to_amalgamate',
    'pb_sense': 'join.01',
    'not_sure_0': nan,
    'not_sure_1': '-----',
    'augmentations': "['0:2-ARG0=Agent;Agent', '7:0-ARGM-MOD', '8:0-rel', '9:1-ARG1=Patient;Part_1', '11:1-ARGM-PRD', '15:1-ARGM-TMP']"}],
  'tree': Tree('TOP', [Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NML', [Tree('CD', ['61']), Tree('NNS', ['years'

In [42]:
len(you_but_not_i)

0

In [27]:
len(we_both)

0

In [44]:
i_but_not_you[0]

'20006001'

In [49]:
'20006001'[-3:]

'001'

In [54]:
original_id_to_mine('20006001')

'000006000'

In [64]:
my_id_to_original('000006000')

'20006001'

In [74]:
need_attention = []
for k in i_but_not_you:
    if original_id_to_mine(k) not in error_record['tree_missing']:
        need_attention.append(k)

In [75]:
need_attention

[]

In [76]:
sample_mine = {}
sample_yours = {}
for k in we_both[:50]:
    sample_mine[k] = enhanced[k]
    sample_yours[k] = yours[k]

In [77]:
sample_mine.keys()

dict_keys(['20001001', '20003001', '20003002', '20003003', '20003004', '20003005', '20003008', '20003009', '20003010', '20003011', '20003012', '20003013', '20003014', '20003015', '20003016', '20003017', '20003018', '20003019', '20003020', '20003021', '20003022', '20003023', '20003024', '20003025', '20003026', '20003027', '20003028', '20003029', '20003030', '20004001', '20004002', '20004005', '20004006', '20004007', '20004008', '20004009', '20004010', '20004011', '20004012', '20004014', '20004015', '20004017', '20005001', '20005003', '20007001', '20007002', '20007003', '20007004', '20008001', '20008002'])

In [59]:
k = '20003024'
sample_yours[k]

'{e3:\n e6:appos<0:109>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x5, ARG2 x4]\n _1:udef_q<13:109>[BV x4]\n _2:proper_q<0:12>[BV x5]\n e15:compound<0:12>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x5, ARG2 x14]\n _3:proper_q<0:4>[BV x14]\n x14:named<0:4>("J.P."){x PERS 3, NUM sg, IND +}[]\n x5:named<5:12>("Bolduc"){x PERS 3, NUM sg, IND +}[]\n e22:compound<13:26>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x4, ARG2 x21]\n _4:udef_q<13:17>[BV x21]\n x21:_vice_n_1<13:17>{x PERS 3, NUM sg, IND +}[]\n x4:_chairman_n_of<18:26>{x PERS 3, NUM sg, IND +}[ARG1 x27]\n _5:proper_q<30:109>[BV x27]\n e33:_elect_v_1-fn.Change_of_leadership<30:40>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG3-fn.New_leader x5, ARG2-fn.Role x76]\n _6:proper_q<30:34>[BV x32]\n x32:named<30:34>("W.R."){x PERS 3, NUM sg, IND +}[]\n x27:named<35:40>("Grace"){x PERS 3, NUM sg, IND +}[]\n x39:_and+company_n_1<41:47>{x IND +}[]\n _7:udef_q<4

In [67]:
print(cleaned_data['000003000']['tree'])

(TOP
  (S
    (S-TPC-1
      (NP-SBJ
        (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos))))
        (VP
          (ADVP-TMP (RB once))
          (VBN used)
          (NP (-NONE- *))
          (S-CLR
            (NP-SBJ (-NONE- *PRO*))
            (VP
              (TO to)
              (VP
                (VB make)
                (NP (NNP Kent) (NN cigarette) (NNS filters)))))))
      (VP
        (VBZ has)
        (VP
          (VBN caused)
          (NP
            (NP (DT a) (JJ high) (NN percentage))
            (PP (IN of) (NP (NN cancer) (NNS deaths)))
            (PP-LOC
              (IN among)
              (NP
                (NP (DT a) (NN group))
                (PP
                  (IN of)
                  (NP
                    (NP (NNS workers))
                    (VP
                      (VBN exposed)
                      (NP (-NONE- *))
                      (PP-CLR (IN to) (NP (PRP it)))
                      (ADVP-TMP
                        (NP
   

In [99]:
diff = []
same = []
for k in we_both:
    if not enhanced[k] == yours[k]:
        diff.append(k)
    else:
        same.append(k)

In [96]:
diff

['20003001',
 '20003003',
 '20003004',
 '20003013',
 '20004006',
 '20004007',
 '20004009',
 '20004017',
 '20005001',
 '20005003',
 '20008003',
 '20010001',
 '20010008',
 '20010018',
 '20010019',
 '20011004',
 '20011007',
 '20012004',
 '20012005',
 '20012006',
 '20012007',
 '20012008',
 '20012010',
 '20016001',
 '20016002',
 '20016006',
 '20017002',
 '20020001',
 '20020004',
 '20020006',
 '20020007',
 '20020008',
 '20021008',
 '20022014',
 '20022017',
 '20023003',
 '20023004',
 '20024006',
 '20025001',
 '20025002',
 '20029008',
 '20029009',
 '20029013',
 '20033001',
 '20033004',
 '20034001',
 '20034015',
 '20034023',
 '20034027',
 '20035003',
 '20035010',
 '20036001',
 '20036004',
 '20036006',
 '20036007',
 '20036010',
 '20036011',
 '20036016',
 '20036020',
 '20036026',
 '20036028',
 '20036029',
 '20036031',
 '20036032',
 '20036036',
 '20036044',
 '20036047',
 '20036049',
 '20036052',
 '20036053',
 '20036058',
 '20037016',
 '20037026',
 '20037045',
 '20037059',
 '20038003',
 '20039005',

In [100]:
len(same)

18118

In [98]:
# Print both projections, the source EDS and the semlink records for one id.
ori_id = '20036047'
print('-----------mine----------')
print(enhanced[ori_id])
print('-----------yours----------')
print(yours[ori_id])
print('-----------original EDS----------')
# NOTE(review): (0, 36, 46) is hard-coded and presumably has to be kept in sync
# with ori_id by hand (it looks like section, doc, zero-based sentence) - confirm.
print(find_eds_by_ids_df(0,36,46,sentences))
print('-----------semlink------------')
pprint(cleaned_data[original_id_to_mine(ori_id)]['semlink'])

-----------mine----------
{e3:
 e3:_and_c<0:3>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[R-HNDL e41, R-INDEX e4]
 _1:udef_q<4:6>[BV x10]
 e13:card<4:5>("8"){e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x10]
 x10:_percent_n_of<5:6>{x PERS 3, NUM pl, IND +}[]
 e4:_say_v_to-fn.IN<7:11>{e SF prop, TENSE past, MOOD indicative, PROG -, PERF -}[ARG1-fn.13:1-ARG0=Agent x10, ARG2-fn.13:1-ARG1=Topic e30]
 _2:udef_q<12:25>[BV x19]
 e23:compound<12:25>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x19, ARG2 x22]
 _3:udef_q<12:18>[BV x22]
 x22:_export_n_of<12:18>{x}[]
 x19:_order_n_of<19:25>{x PERS 3, NUM pl}[]
 e30:_down_p<31:35>{e SF prop, TENSE past, MOOD indicative, PROG -, PERF -}[ARG1 x19]
 e31:loc_nonsp<36:47>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e30, ARG2 x32]
 _4:def_implicit_q<36:40>[BV x32]
 i37:_last_a_1<36:40>{i}[ARG1 x32]
 x32:_month_n_1<41:47>{x PERS 3, NUM sg, IND +}[]
 e41:subord<48:82>{e SF prop, 

In [85]:
semlink_map[semlink_map['fn_frame'] == 'NF']

Unnamed: 0,section_id,doc_id,sentence_id,token_id,is_gold,vb_form,vn_class_index,fn_frame,pb_sense,not_sure_0,not_sure_1,augmentations
37,0,3,17,6,gold,own-v,100,NF,own.01,,-----,"['0:1*3:1*4:1*7:1-ARG1=Theme', '6:0-rel', '8:1..."
44,0,3,21,7,gold,impose-v,63,NF,impose.01,,-----,"['0:1-ARGM-TMP', '3:1-ARG0=Agent', '7:0-rel', ..."
49,0,3,25,1,gold,dump-v,9.3-1-1,NF,dump.01,1.0,-----,"['0:1-ARG0=Agent', '1:0-rel', '2:2-ARG1=Theme'..."
61,0,4,1,12,gold,track-v,35.3,NF,track.01,2.0,-----,"['8:1*13:1-ARG1=Theme', '12:0-rel', '14:1-ARG0..."
71,0,4,5,14,gold,capture-v,10.5-1,NF,capture.01,3.0,-----,"['11:1-ARG0=Agent', '13:0-ARGM-MOD', '14:0-rel..."
...,...,...,...,...,...,...,...,...,...,...,...,...
74953,24,54,23,15,gold,oppose-v,22.2-3,NF,oppose.01,1.0,-----,"['8:2*13:1*14:1-ARG0=Patient', '15:0-rel', '16..."
74962,24,54,27,27,gold,free-v,80-1,NF,free.01,1.0,-----,"['26:0-ARGM-TMP', '27:0-rel', '28:0-ARG1=Source']"
74964,24,54,28,24,gold,instruct-v,37.9-1,NF,instruct.01,1.0,-----,"['0:1-ARGM-PNC', '9:1-ARG0=Agent', '12:1-ARGM-..."
74966,24,54,29,9,gold,bridge-v,47.8,NF,bridge.01,,-----,"['0:1*8:1-ARG0', '9:0-rel', '10:2-ARG1=Theme',..."


In [88]:
enhanced

{'20001001': '{e3:\n _1:proper_q<0:28>[BV x6]\n e10:compound<0:14>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x6, ARG2 x9]\n _2:proper_q<0:6>[BV x9]\n x9:named<0:6>("Pierre"){x PERS 3, NUM sg, IND +}[]\n x6:named<7:14>("Vinken"){x PERS 3, NUM sg, IND +}[]\n e17:measure<15:23>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e16, ARG2 x15]\n _3:udef_q<15:23>[BV x15]\n e22:card<15:17>("61"){e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x15]\n x15:_year_n_1<18:23>{x PERS 3, NUM pl, IND +}[]\n e16:_old_a_1<24:28>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x6]\n e3:_join_v_1-fn.Cause_to_amalgamate<34:38>{e SF prop, TENSE fut, MOOD indicative, PROG -, PERF -}[ARG1-fn.Agent x6, ARG2-fn.Part_1 x23]\n _4:_the_q<39:42>[BV x23]\n x23:_board_n_of<43:48>{x PERS 3, NUM sg}[]\n e29:_as_p<49:51>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e3, ARG2 x30]\n _5:_a_q<52:53>[BV x30]\n e35:_nonexecutive/jj_u_u

In [105]:
# One id per line. write() is used because writelines() on a plain string
# iterates it character by character; the `with` block closes the file.
with open('same_eds_ids.txt', 'w') as f:
    for s in same:
        f.write(s + '\n')

In [109]:
# Load the projections written by the old and the new enhancement code.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open('projected_out.json', 'r') as f:
    old_enhance = json.load(f)
with open('projected_out_new.json', 'r') as f:
    new_enhance = json.load(f)

In [110]:
# ids whose encoded EDS differs between the new and the old projection
diff = []
for k, v in new_enhance.items():
    # .get() so an id that exists only in new_enhance is reported as a
    # difference instead of aborting the cell with a KeyError
    if v != old_enhance.get(k):
        diff.append(k)

In [112]:
len(diff)

559

In [111]:
diff[:5]

['20003023', '20004005', '20006002', '20012008', '20013015']

In [123]:
idd = '20013015'

In [124]:
new_enhance[idd]

'{e3:\n e5:focus_d<0:44>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e3, ARG2 e4]\n e8:focus_d<0:34>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4, ARG2 e7]\n e7:loc_nonsp<0:4>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4, ARG2 x9]\n x9:time_n<0:4>{x PERS 3, NUM sg}[]\n _1:def_implicit_q<0:4>[BV x9]\n e14:_now_a_1<0:4>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x9]\n _2:_the_q<5:8>[BV x17]\n x17:named<9:14>("Field"){x PERS 3, NUM sg, IND +}[]\n e20:comp_less<18:22>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4]\n e4:_cluttered_a_1<23:34>{e SF prop, TENSE pres, MOOD indicative, PROG -, PERF -}[ARG1 x17]\n x23:pron<35:37>{x PERS 3, NUM sg, GEND m, PT std}[]\n _3:pronoun_q<35:37>[BV x23]\n e3:_add_v_to-fn.NF<38:44>{e SF prop, TENSE past, MOOD indicative, PROG -, PERF -}[ARG1 x23, ARG2 e4]\n}'

In [125]:
old_enhance[idd]

'{e3:\n e5:focus_d<0:44>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e3, ARG2 e4]\n e8:focus_d<0:34>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4, ARG2 e7]\n e7:loc_nonsp<0:4>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4, ARG2 x9]\n x9:time_n<0:4>{x PERS 3, NUM sg}[]\n _1:def_implicit_q<0:4>[BV x9]\n e14:_now_a_1<0:4>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 x9]\n _2:_the_q<5:8>[BV x17]\n x17:named<9:14>("Field"){x PERS 3, NUM sg, IND +}[]\n e20:comp_less<18:22>{e SF prop, TENSE untensed, MOOD indicative, PROG -, PERF -}[ARG1 e4]\n e4:_cluttered_a_1<23:34>{e SF prop, TENSE pres, MOOD indicative, PROG -, PERF -}[ARG1 x17]\n x23:pron<35:37>{x PERS 3, NUM sg, GEND m, PT std}[]\n _3:pronoun_q<35:37>[BV x23]\n e3:_add_v_to<38:44>{e SF prop, TENSE past, MOOD indicative, PROG -, PERF -}[ARG1 x23, ARG2 e4]\n}'

In [126]:
cleaned_data[int(idd)]['semlink']

[{'id': 20013015,
  'token_id': 6,
  'vb_form': 'clutter-v',
  'fn_frame': 'NF',
  'augmentations': "['1:1-ARGM-TMP', '2:1-ARG1=Destination', '5:0-ARGM-EXT', '6:0-rel']"},
 {'id': 20013015,
  'token_id': 10,
  'vb_form': 'add-v',
  'fn_frame': 'NF',
  'augmentations': "['9:1-ARG0', '10:0-rel', '1:2*11:0-ARG1']"}]