forked from jac2130/semaphore-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
semaphore.py~
62 lines (44 loc) · 3.15 KB
/
semaphore.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re, sys, os
# Get the directories that the user sets in the config file (located in the release directory).
sys.path.append('/home/johannes/Documents/causal-belief-catcher/Semaphore')
from directories import *
def run_semaphore(semaphore=release, command='./fnParserDriver.sh', sample='../samples/sample.txt', output= '../samples/output.txt'):
    """Run the Semaphore driver command on a sample file.

    The process working directory is changed to ``semaphore`` first, so any
    relative ``command``, ``sample`` or ``output`` path is resolved by the
    shell from that directory. The working directory is not restored.

    semaphore -- directory to change into before running the command
    command   -- shell command to run
    sample    -- first argument appended to the command line
    output    -- second argument appended to the command line

    Returns None; the exit status reported by ``os.system`` is discarded.
    """
    # "./fnParserDriver.sh" is an unwieldy bash script that the author hopes to
    # replace entirely with Python code, in order to have fine control of the
    # behavior of Semaphore from Python. It is still low on the priority list.
    os.chdir(semaphore)
    # The pieces are joined with single spaces and handed to the shell as-is;
    # nothing is quoted or escaped here.
    shell_line = ' '.join([command, sample, output])
    os.system(shell_line)
def import_semaphore(xml=output):
    '''
    Takes the xml output that results from running the Semaphore program and returns a python object.

    xml -- path of the XML file to read. The default is the module-level name
           `output`, looked up once when this function is defined (presumably
           supplied by `from directories import *` -- confirm).

    Returns `label_list`, a list holding one inner list per sentence. Every
    entry appended to an inner list starts with a frame name and is followed
    either by [name, text] pairs (when a label entry is itself a list) or by a
    single label name and its text. The text is the slice of the sentence text
    running from '@start' through '@end' inclusive.
    '''
    import xmltodict
    # NOTE(review): the file object is never closed explicitly.
    f=open(xml, 'r').read()
    raw_dict=xmltodict.parse(f)
    # Cut away the outer layers of the parsed XML dictionary. The code below
    # indexes the result by position, i.e. it assumes a list of sentences.
    raw_list=raw_dict[u'corpus'][u'documents'][u'document'][u'paragraphs'][u'paragraph'][u'sentences'][u'sentence']
    # Plain text of each sentence; used further down to slice out label spans.
    raw_text=[str(raw_list[i][u'text']) for i in range(len(raw_list))]
    # From here on raw_list holds, per sentence, only its annotation set(s).
    raw_list=[raw_list[i][u'annotationSets'][u'annotationSet'] for i in range(len(raw_list))]
    def get_frame_names(list_or_dict, keys=['@frameName']):
        # Return str() of the value stored under each of `keys`, coping with
        # the uncertainty of whether a list or a dictionary was passed in.
        # `keys` is only read here, never mutated.
        try:
            # Case 1: a list whose items can be indexed by key.
            return [str(list_or_dict[j][key]) for j in range(len(list_or_dict)) for key in keys]
        except KeyError:
            try:
                # Case 2: a single dictionary.
                return [str(list_or_dict[key]) for key in keys]
            except:
                # Case 3: one more level of nesting; items whose 'labels'
                # value equals None are skipped.
                # NOTE(review): `i` is not a parameter or local of this helper;
                # it is read from the enclosing function, so the result depends
                # on whatever value `i` holds there at call time -- confirm.
                return [[str(list_or_dict[j][r][key]) for r in range(len(list_or_dict[j])) if raw_list[i][j][r][u'labels']!=None ] for j in range(len(list_or_dict)) for key in keys]
    # Frame name string(s) for every sentence.
    frames=[get_frame_names(raw_list[i]) for i in range(len(raw_list))]
    # NOTE(review): `raw_list[i]` sits in the first `for` clause, so it is
    # evaluated once, before the trailing `for i` clause binds `i`. This looks
    # like it relies on Python 2 leaking `i` from the previous comprehension
    # (it would then always use the last sentence) -- confirm the intent.
    # NOTE(review): eval() is applied to str() of parsed XML values here and
    # below; that is unsafe if the XML can come from an untrusted source.
    layers=[eval(dictionary)['layer'] for dictionary in get_frame_names(raw_list[i], keys=['layers']) for i in range(len(raw_list))]
    labels=[[eval(dictionary)['label'] for dictionary in get_frame_names(layers[i], keys=['labels'])] for i in range(len(raw_list))]
    # One (initially empty) result list per sentence.
    label_list=[[] for i in range(len(labels))]
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            for l in range(len(labels[i][j])):
                try:
                    if type(labels[i][j][l])==list:
                        # A list of labels: frame name followed by one
                        # [name, text] pair per label.
                        label_list[i].append([frames[i][j]] + [[labels[i][j][l][r][u'@name'], raw_text[i][eval(labels[i][j][l][r][u'@start']): eval(labels[i][j][l][r][u'@end'])+1]] for r in range(len(labels[i][j][l]))])
                    else:
                        # A single label: frame name, label name, text.
                        label_list[i].append([frames[i][j]] + [labels[i][j][l][u'@name'], raw_text[i][eval(labels[i][j][l][u'@start']): eval(labels[i][j][l][u'@end'])+1]])
                except:
                    # Fallback: treat labels[i][j] itself as the single label.
                    # NOTE(review): this bare except hides every kind of error,
                    # and because it runs inside the `for l` loop it can append
                    # the same entry more than once -- confirm that is wanted.
                    label_list[i].append([frames[i][j]] + [labels[i][j][u'@name'], raw_text[i][eval(labels[i][j][u'@start']): eval(labels[i][j][u'@end'])+1]])
    return label_list