In [1]:
from delphin.codecs import eds
from delphin.eds import *
import torch
import json
import sys, os
from utils import *
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from prepare_data import *
import joblib
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder once; both names are bound to the same object.
sentence_encoder_model = model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
# Build lowercase-name -> integer-id lookup tables from the keys of
# fn_frames.json and fn_roles.json, then persist both tables as JSON.
# Ids follow the key order of each input file.
with open('fn_frames.json') as f:
    fn_frames = json.load(f)
fn_frame2id = {name.lower(): idx for idx, name in enumerate(fn_frames)}

with open('fn_roles.json') as f:
    fn_roles = json.load(f)
fn_role2id = {name.lower(): idx for idx, name in enumerate(fn_roles)}
# Extra id, one past the last role, for edges with no assigned role.
fn_role2id['****'] = len(fn_roles)

# The `with` block closes the file; no explicit close() is needed.
with open('fn_frame2id.json', 'w') as f:
    json.dump(fn_frame2id, f, indent=2)

# Fix: this file used to receive a second copy of fn_frame2id,
# so the role table was never saved.
with open('fn_role2id.json', 'w') as f:
    json.dump(fn_role2id, f, indent=2)

In [4]:
# Build two example lists from the filtered graphs and pickle them:
#   * one entry per node whose predicate contains '-fn.' (frame target),
#   * one entry per outgoing edge of such a node (role target, '****' when
#     the edge label carries no '-fn.' part).
node_classification_data = []
edge_classification_data = []

with open('filtered_data.json', 'r') as f:
    data = json.load(f)

with open('features_config.json', 'r') as f:
    feature_dict = json.load(f)

for k in tqdm(data.keys(), desc='data'):
    s, _, _ = get_from_file('deepbank_raw/' + k)
    current_eds = loads(data[k])[0]
    for node in current_eds.nodes:
        if '-fn.' not in node.predicate:
            continue

        # The frame name is whatever follows the last '-fn.' in the predicate.
        frame_name = node.predicate.split('-fn.')[-1]
        node_classification_data.append({
            'sentence': s,
            'eds': current_eds,
            'verb_id': node.id,
            'target_fn_frame': frame_name.lower(),
        })

        for key in node.edges:
            role_name = key.split('-fn.')[-1] if '-fn.' in key else '****'
            edge_classification_data.append({
                'sentence': s,
                'eds': current_eds,
                'start': node.id,
                'end': node.edges[key],
                'target_fn_role': role_name.lower(),
            })

with open('verb_data.pkl', 'wb') as f:
    joblib.dump(node_classification_data, f)

with open('edge_data.pkl', 'wb') as f:
    joblib.dump(edge_classification_data, f)


data: 100%|██████████| 18118/18118 [01:26<00:00, 209.68it/s]


In [None]:

# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably a sequence-length cap - confirm it is still needed.
max_length = 50

In [None]:
# Encode a sample string and wrap the resulting NumPy array as a torch tensor.
t = torch.from_numpy(model.encode('Iloveyou'))

In [None]:
# Display the tensor built from the sample encoding.
t

In [None]:
# Scratch check: append three extra values to the encoded vector along dim 0.
torch.cat([t, torch.tensor([1, 2, 3])], dim=0)

In [None]:
# Load one sample file; other cells only use `te` (they inspect te.nodes and
# draw it with eds2graph).
ts, tet, te = get_from_file('deepbank_raw/20003001')

In [None]:
# Inspect the properties of the first node of the sample graph.
te.nodes[0].properties

In [None]:
def eds2geograph(eds:EDS):
    """Convert an EDS graph into a feature list plus an index-based edge list.

    Returns a pair (nodes, edges):
      * nodes[i] is generate_feature(...) applied to the i-th entry of eds.nodes;
      * each edge is [source_index, target_index], where the indices are the
        positions of the source node and of the edge's target id in eds.nodes.
    """
    # Position of every node id within eds.nodes.
    index_of = {n.id: position for position, n in enumerate(eds.nodes)}

    nodes = [generate_feature(n) for n in eds.nodes]

    edges = [
        [index_of[n.id], index_of[n.edges[role]]]
        for n in eds.nodes
        for role in n.edges
    ]

    return nodes, edges

In [None]:
# Read the file ids, one per line. The `with` block closes the handle, which
# the bare open() inside the comprehension never did.
with open('same_eds_ids.txt', 'r') as f:
    names = [line.rstrip() for line in f]

# Statistics containers filled by the two passes over the files:
type_dict = {}      # node.type -> number of nodes with that type
property_dict = {}  # property name -> {property value -> count}
carg = []           # node.carg of every node, in visiting order
predicate = {}      # node.predicate -> number of nodes with that predicate

In [None]:
# First pass over the files: tally node types and predicates, record every
# node's carg, and register each property name with an empty value table.
# The value tables are filled by the second pass.
for n in tqdm(names, desc='files'):
    sent, text, eds = get_from_file('deepbank_raw/' + n)
    for node in eds.nodes:
        type_dict[node.type] = type_dict.get(node.type, 0) + 1
        # Every occurrence (re)binds a fresh, separate empty table.
        property_dict.update({key: {} for key in node.properties})
        carg.append(node.carg)
        predicate[node.predicate] = predicate.get(node.predicate, 0) + 1

In [None]:
# Second pass over the files: count how often each value occurs for every
# property name registered in property_dict.
for n in tqdm(names, desc='files'):
    sent, text, eds = get_from_file('deepbank_raw/' + n)
    for node in eds.nodes:
        for key, val in node.properties.items():
            value_counts = property_dict[key]
            value_counts[val] = value_counts.get(val, 0) + 1


In [None]:
# Show the collected property name -> value -> count tables.
print(property_dict)

In [None]:
# Substring patterns for predicate families. Other cells use only the keys
# (for filtering `predicate` and for features['predicate']).
# NOTE(review): the three non-zero values look like leftover tallies - confirm.
predicate_dict = {'_n_of': 847, '_v_': 3186, '_n_': 3973}
predicate_dict.update(dict.fromkeys(
    [
        'proper_q', 'compound', 'measure', 'x_deg', 'vb_', '_p', 'x_subord',
        'udef_q', 'loc_nonsp', '_a_', 'vbg_', 'vbp_', 'vbd_', 'vbn_',
        'named', 'pronoun_q', 'jj_', 'nn_', 'nns_',
    ],
    0,
))

In [None]:
# Drop every collected predicate whose name contains any of the family
# patterns in predicate_dict, then show how many predicates remain.
family_patterns = tuple(predicate_dict)
for name in list(predicate):
    if any(pattern in name for pattern in family_patterns):
        del predicate[name]
len(predicate)

In [None]:
from prepare_data import *

In [None]:
# Assemble the feature vocabulary: node types, property value tables, and
# the predicate family patterns followed by the remaining predicates.
features = {
    'type': list(type_dict),
    'property': property_dict,
    'predicate': list(predicate_dict) + list(predicate),
}

In [None]:
# Persist the feature vocabulary. The `with` block closes the file; the
# previous bare `f.close` (no parentheses) was a no-op and has been removed.
with open('features_config.json', 'w') as f:
    json.dump(features, f, indent=2)

In [None]:
import graphviz
def eds2graph(eds:EDS):
    """Render an EDS graph as a graphviz.Digraph.

    Nodes whose predicate contains '-fn.' are drawn in red. Edges whose label
    contains the '-fn.' marker, matched case-insensitively, are drawn in blue.
    All other nodes and edges use the graphviz defaults.

    Args:
        eds: object exposing `.nodes`; each node needs `.id`, `.predicate` and
            an `.edges` mapping of edge label -> target node id.

    Returns:
        The populated graphviz.Digraph.
    """
    g = graphviz.Digraph()
    for node in eds.nodes:
        if '-fn.' in node.predicate:
            g.node(node.id, label=node.predicate, color='red', fontcolor='red')
        else:
            g.node(node.id, label=node.predicate)
        for key in node.edges:
            # The data-preparation cell and the node check above both look for
            # lowercase '-fn.', while this check used to require '-FN.'.
            # Matching case-insensitively accepts both spellings.
            if '-fn.' in key.lower():
                g.edge(node.id, node.edges[key], label=key, color='blue', fontcolor='blue')
            else:
                g.edge(node.id, node.edges[key], label=key)

    return g

In [None]:
# Draw the sample graph loaded into `te`.
eds2graph(te)

In [None]:
# Three raw SemLink annotation lines used as a hand-made example.
sem1 = 'nw/wsj/00/wsj_0002.parse 0 16 gold name-v 29.3 IN name.01 null ----- 0:2*17:1-ARG1=Theme 16:0-rel 18:2-ARG2=Result'
sem2 = 'nw/wsj/00/wsj_0003.parse 0 9 gold make-v 26.1-1 Manufacturing make.01 null ----- 7:1-ARG0=Agent;Manufacturer 9:0-rel 10:1-ARG1=Product;Product'
sem3 = 'nw/wsj/00/wsj_0003.parse 0 14 gold cause-v 27 Causation cause.01 1 ----- 0:3-ARG0=Cause;Cause 14:0-rel 15:2-ARG1=Theme;Effect'

# Parsed annotations keyed by the id of the sample file.
semlinks_dict = {
    '20003001': [SemLinkAnnotation(raw_line) for raw_line in (sem1, sem2, sem3)],
}

In [None]:
from batch_projection import *

In [None]:
# Load the sample file through sentence_eds; other cells read its
# .sentence, .eds and .eds_text attributes.
deep = sentence_eds('deepbank_raw/20003001')

In [None]:
# Show the sample's EDS text split into lines.
deep.eds_text.split('\n')

In [None]:
# Call update_graph on the sample sentence and graph with the third example
# annotation (sem3) and the nodes returned by get_verb_nodes.
update_graph(deep.sentence, deep.eds, SemLinkAnnotation(sem3), get_verb_nodes(deep.eds))

In [None]:
# Read the raw SemLink lines, one per entry. The `with` block closes the
# handle, which the bare open() inside the comprehension never did.
with open('1.2.2c.okay.txt', 'r') as f:
    semlinks = [line.rstrip() for line in f]

In [None]:
from annotation import *
# Parse every raw line into a SemLinkAnnotation.
# NOTE: this rebinds `semlinks` from the list of strings to the list of parsed
# objects, so re-running this cell without re-reading the file first passes
# already-parsed objects to the constructor.
semlinks = [SemLinkAnnotation(x) for x in semlinks]

In [None]:
# fn_frame value -> number of annotations carrying it (filled by the next cell).
frame_set = {}

In [None]:
# Tally how many annotations carry each fn_frame value.
for x in semlinks:
    frame_set[x.fn_frame] = frame_set.get(x.fn_frame, 0) + 1

In [None]:
# Number of distinct fn_frame values seen.
len(frame_set)

In [None]:
# Show the fn_frame -> count table.
print(frame_set)

In [None]:
# Role name -> number of dependencies carrying it (filled by the next cell).
role_set = {}

In [None]:
# Tally role names: a dependency counts only when the part after its last '-'
# contains ';', and the role name is whatever follows the last ';'.
for x in semlinks:
    for dep in x.dependencies:
        if ';' not in dep.split('-')[-1]:
            continue
        fn_role = dep.split(';')[-1]
        role_set[fn_role] = role_set.get(fn_role, 0) + 1

In [None]:
# Number of distinct role names seen.
len(role_set)

In [None]:
# Show the role name -> count table.
print(role_set)

In [None]:
# Look up the count recorded for 'New_leader' (raises KeyError if never seen).
role_set['New_leader']

In [None]:
# Save the fn_frame -> count table as JSON.
frame_json = json.dumps(frame_set, indent=1)
with open('fn_frames.json', 'w') as f:
    f.write(frame_json)

In [None]:
# Read the file ids, one per line. The `with` block closes the handle, which
# the bare open() inside the comprehension never did.
with open('same_eds_ids.txt', 'r') as f:
    names = [line.rstrip() for line in f]

In [None]:
# Load the projected-graph JSON; the next cell indexes it by file id.
with open('deepbank_projected/projected.json','r') as f:
    data = json.load(f)

In [None]:
# Keep only the entries whose id is listed in `names` and save them as JSON.
same_data = {file_id: data[file_id] for file_id in names}
filtered_json = json.dumps(same_data, indent=1)
with open('filtered_data.json', 'w') as f:
    f.write(filtered_json)

In [None]:
# Display the node list of the sample graph.
te.nodes