forked from Klim314/Quetzalcoatl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
iob2tree.py
153 lines (126 loc) · 3.51 KB
/
iob2tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
import nltk
import pexpect
import os
"""
iob2tree:
Takes a file containing a pre-split paper (sent_tokenize.sentSplit <in abcheck>), chunks it using geniatagger and imports each sentence as an nltk Tree
input: File containing sentences separated by newlines
output: a list of tree objects, one tree for each sentence in the abstract
"""
def sProc(sentence):
    """Convert one GENIA-tagger chunked sentence into nltk-readable IOB text.

    Each non-blank line of ``sentence`` is split on tabs. For ordinary
    tokens, fields 0, 2 and 3 are kept (per the original author's note:
    the word, the POS tag and the chunk tag). For the punctuation tokens
    "." and ",", field 0 is kept together with the last two fields of
    the line.

    Args:
        sentence: Tagger output for a single sentence, one token per line,
            with tab-separated fields on each line.

    Returns:
        A string with one space-separated three-field row per token, rows
        joined by newlines. An input without any token lines yields "".

    Raises:
        IndexError: If a token line has fewer than four tab-separated
            fields (fewer than two for a punctuation token).
    """
    # Tokens whose tags are taken from the end of the line.
    punct = {".", ","}
    rows = []
    for line in sentence.split('\n'):
        # Input with CRLF line endings leaves a '\r' on every line; drop it
        # so it neither ends up inside a tag nor makes a blank line look
        # like a token.
        line = line.rstrip('\r')
        if not line.strip():
            continue
        fields = line.split('\t')
        if fields[0] in punct:
            rows.append([fields[0], fields[-2], fields[-1]])
        else:
            rows.append([fields[0], fields[2], fields[3]])
    return "\n".join(' '.join(row) for row in rows)
def abProc(senLst):
    """Apply sProc to every sentence of an abstract.

    Args:
        senLst: Iterable of chunked sentences, each in the form sProc
            accepts.

    Returns:
        A list holding the sProc result for each sentence, in input order.
    """
    processed = []
    for sentence in senLst:
        processed.append(sProc(sentence))
    return processed
def chunk(targetFile):
    """Chunk a single abstract with the GENIA tagger.

    The tagger only runs from inside its own directory, so the working
    directory is switched to it for the duration of the call and is always
    restored afterwards, even if spawning or reading the tagger fails.

    Args:
        targetFile: Path to a file holding one abstract, one sentence per
            line. A relative path is resolved against the working
            directory in effect when the function is called.

    Returns:
        The tagger's output lines decoded as UTF-8, with the first four
        lines of output dropped.
    """
    # Resolve the path before changing directory so that relative and
    # absolute paths both keep pointing at the same file.
    targetPath = os.path.abspath(targetFile)
    oriDir = os.getcwd()
    os.chdir("extparsers/geniatagger-3.0.1")
    try:
        # The path is handed over as a separate argument, so it is never
        # split as a command line: spaces, quotes or backslashes in file
        # names need no escaping.
        child = pexpect.spawn("./geniatagger", [targetPath])
        try:
            # Iterating the child yields its output line by line until EOF.
            rawLines = list(child)
        finally:
            child.close()
    finally:
        os.chdir(oriDir)
    # NOTE(review): the first four lines are skipped - presumably the
    # tagger's start-up messages; confirm against real tagger output.
    return [line.decode("utf-8") for line in rawLines[4:]]
def senSplit(chunkLst):
    """Group the lines of a chunked abstract into one string per sentence.

    Every line is stripped of surrounding whitespace. A line that is empty
    after stripping marks a sentence boundary; consecutive boundaries do
    not produce empty sentences.

    Args:
        chunkLst: Iterable of output lines for a whole chunked abstract.

    Returns:
        A list of strings, one per sentence. Each string is that
        sentence's stripped lines, every one terminated by a newline.
    """
    # Strip everything up front, before any grouping takes place.
    cleaned = [raw.strip() for raw in chunkLst]
    sentences = []
    current = []
    for line in cleaned:
        if line:
            current.append(line + '\n')
        elif current:
            sentences.append(''.join(current))
            current = []
    # Flush a final sentence that was not followed by a blank line.
    if current:
        sentences.append(''.join(current))
    return sentences
def execute(targetFile):
    """Run the whole pipeline: chunk a file and build one tree per sentence.

    The file is passed through chunk, senSplit and abProc in turn, and each
    resulting sentence string is parsed with nltk.chunk.conllstr2tree.

    Args:
        targetFile: Path to a file containing a single abstract, broken
            into one sentence per line.

    Returns:
        A list of nltk trees, one per sentence of the abstract.
    """
    # Intermediate results are no longer echoed to stdout; callers that
    # want to inspect them can call chunk/senSplit/abProc directly.
    sentences = abProc(senSplit(chunk(targetFile)))
    mktree = nltk.chunk.conllstr2tree
    return [mktree(sentence) for sentence in sentences]
if __name__ == "__main__":
    # Manual demo: chunk one pre-split abstract from the abcheck output
    # directory and display the tree of every sentence in turn.
    indir = "output/abcheck/"
    inp = indir + "lactobacillus acidophilus#escherichia coli/53.out"
    print(inp)
    for tree in execute(inp):
        tree.draw()
# blah = execute(inp)
# blah[0].draw()
# with open("../input/chunktest.in") as f:
# holder = ''
# res = []
# for i in f:
# if i == '\n':
# res.append(holder)
# holder = ''
# continue
# holder += i
# chunks = chunk("../../input/testsentences.in")
# pchunks = abProc(senSplit(chunks))
# print(pchunks)
# mktree = nltk.chunk.conllstr2tree
# mktree(pchunks[0]).draw()
# print(chunk("../../input/testsentences.in"))
# tester = abProc(res)
# chunktest = sProc(chunk("../../input/testsentences.in")[0])
# print(chunktest)
# nltk.chunk.conllstr2tree(chunktest).draw()*