-
Notifications
You must be signed in to change notification settings - Fork 0
/
math_extractor.py
233 lines (181 loc) · 8.46 KB
/
math_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import io
import re
import string
import sys
import xml
import xml.etree.ElementTree

from bs4 import BeautifulSoup

from tangent.math.mathsymbol import MathSymbol
from tangent.math.symboltree import SymbolTree
from tangent.math.latex_mml import LatexToMathML
from tangent.math.exceptions import UnknownTagException
__author__ = 'Nidhin, FWTompa'
## TODO: produce cleaned_file_content for text indexing in a separate pass (called separately in Version 0.2)
## TODO: simplify math extraction by creating a simple list of math expressions and then grouping them by SLT, rather than by LaTeX
class MathExtractor:
    """
    Locate MathML expressions embedded in XML/HTML text and turn them into
    symbol trees.

    All entry points are classmethods; the class-level attributes below are
    the regular-expression fragments and compiled patterns they share.
    """

    def __init__(self):
        pass

    # Optional namespace prefix of a tag name, e.g. the "m:" of "<m:math>".
    # "/" is excluded so that a closing tag such as "</m:math>" can never be
    # mistaken for an opening tag whose prefix is "/m:".
    namespace = r"(?:[^>/ :]*:)?"
    # Optional attribute list: a space followed by anything up to the ">".
    attributes = r"(?: [^>]*)?"
    # A whole math element, from its opening tag to the nearest closing tag.
    math_expr = "<" + namespace + "math" + attributes + r">.*?</" + namespace + "math>"
    dollars = r"(?<!\\)\$+"
    latex_expr = dollars + ".{1,200}?" + dollars  # converted to math_expr in cleaned text
    # latex could also be surrounded by \(..\) or \[..\], but these are ignored for now (FWT)
    text_token = r"[^<\s]+"

    math_pattern = re.compile(math_expr, re.DOTALL)  # TODO: allow for LaTeX as well
    # split_pattern = re.compile(math_expr+"|"+latex_expr+"|"+text_token, re.DOTALL)
    # Rightmost <*:math element; math_expr already begins with "<", so no
    # extra "<" may be prepended here.
    inner_math = re.compile(".*(" + math_expr + ")", re.DOTALL)
    open_tag = re.compile("<(?!/)(?!mws:qvar)" + namespace, re.DOTALL)  # up to and including namespace
    close_tag = re.compile("</(?!mws:qvar)" + namespace, re.DOTALL)  # but keep qvar namespace

    ## @classmethod
    ## def get_string_tokenized(cls, content):
    ##     return cls.split_pattern.findall(content)

    @classmethod
    def math_tokens(cls, content):
        """
        Extract math expressions from an XML (incl. HTML) document.

        :param content: XML document
        :type  content: string
        :rtype: list(string)
        :return: the embedded math expressions, each a MathML string whose
                 tag namespace prefixes have been dropped (mws:qvar is kept)
        """
        math = []
        for token in cls.math_pattern.findall(content):
            if token.endswith("math>"):  # MathML token
                ## Does not handle the case where one math expression is nested
                ## inside another (likely with different namespaces).
                ## N.B. Skipping that check speeds up processing significantly (FWT)
                token = cls.close_tag.sub("</", token)  # drop namespaces (FWT)
                token = cls.open_tag.sub("<", token)
                math.append(token)
            else:  # LaTeX math expression
                # Not reachable while math_pattern matches MathML only; kept
                # for the planned LaTeX support.
                tex = token.strip("$")  # TODO: handle other latex delimiters
                math.append(LatexToMathML.convert_to_mathml(tex))
        return math

    @classmethod
    def isolate_pmml(cls, tree):
        """
        Extract the Presentation MathML from a MathML expression.

        :param tree: MathML expression
        :type  tree: string
        :rtype: string
        :return: Presentation MathML, rooted at a <math> element that carries
                 the MathML default namespace
        :raises ValueError: if no <math> element can be found in tree
        """
        parsed_xml = BeautifulSoup(tree, "lxml")
        math_root = parsed_xml.find("math")  # namespaces have been removed (FWT)
        if math_root is None:
            raise ValueError("no <math> element found in expression")

        # The embedded LaTeX source annotation is not part of the markup.
        application_tex = math_root.find("annotation", {"encoding": "application/x-tex"})
        if application_tex:
            application_tex.decompose()

        pmml_markup = math_root.find("annotation-xml", {"encoding": "MathML-Presentation"})
        if pmml_markup:
            # Presentation markup is held in an annotation: promote it to be
            # the <math> element that is returned.
            pmml_markup.name = "math"
        else:
            pmml_markup = math_root
            cmml_markup = math_root.find("annotation-xml", {"encoding": "MathML-Content"})
            if cmml_markup:
                cmml_markup.decompose()  # delete any Content MML

        pmml_markup['xmlns'] = "http://www.w3.org/1998/Math/MathML"  # set the default namespace
        return str(pmml_markup)

    @staticmethod
    def _parse_mathml_root(elem):
        """Parse a MathML string and return the root element of its XML tree."""
        return xml.etree.ElementTree.fromstring(elem)

    @classmethod
    def convert_to_mathsymbol(cls, elem):
        """
        Parse an expression from MathML.

        :param elem: mathml
        :type  elem: string
        :rtype: MathSymbol or None
        :return: root of the symbol tree, or None when elem is empty
        """
        if not elem:
            return None
        root = cls._parse_mathml_root(elem)
        return MathSymbol.parse_from_mathml(root)

    @classmethod
    def convert_and_link_mathml(cls, elem, document=None, position=None):
        """
        Parse an expression from MathML, keeping the link to the original
        MathML for visualization purposes.

        :param elem: mathml
        :type  elem: string
        :param document: passed through to SymbolTree
        :param position: passed through to SymbolTree
        :rtype: SymbolTree or None
        :return: symbol tree instance, or None when elem is empty
        """
        if not elem:
            return None
        root = cls._parse_mathml_root(elem)
        symbol_root = MathSymbol.parse_from_mathml(root)
        return SymbolTree(symbol_root, document, position, root)

    @classmethod
    def parse_from_tex(cls, tex, file_id=-1, position=None):
        """
        Parse an expression from a TeX string, converting it to presentation
        markup through LatexToMathML.

        :param tex: tex string
        :type  tex: string
        :param file_id: file identifier
        :type  file_id: int
        :param position: positions of the expression; defaults to a fresh [0]
        :type  position: list(int) or None
        :rtype: SymbolTree
        :return: equivalent SymbolTree
        """
        if position is None:
            # A new list per call: the list is handed to SymbolTree, so a
            # shared default would leak mutations between unrelated trees.
            position = [0]
        mathml = LatexToMathML.convert_to_mathml(tex)
        pmml = cls.isolate_pmml(mathml)
        return SymbolTree(cls.convert_to_mathsymbol(pmml), file_id, position)

    @classmethod
    def parse_from_xml(cls, content, content_id, missing_tags=None, problem_files=None):
        """
        Parse the math expressions found in an XML file.

        Expressions whose symbol trees print identically are merged into one
        SymbolTree that lists every position at which they occur.

        :param content: XML content to be parsed
        :type  content: string
        :param content_id: fileid for indexing or querynum for querying
        :type  content_id: int
        :param missing_tags: dictionary to collect tag errors (optional)
        :type  missing_tags: dictionary(tag->set((content_id, position)))
        :param problem_files: dictionary to collect parsing errors
                              (accepted for compatibility; not used here)
        :type  problem_files: dictionary(str->set(content_id))
        :rtype: list(SymbolTree)
        :return: list of symbol trees found in content identified by
                 content_id; an empty list if an unknown tag stops parsing
        :raises Exception: on any other parsing error, carrying the original
                           error text and chained to the original exception
        """
        idx = -1
        tree = None  # keeps the error report below valid if tokenizing itself fails
        try:
            trees_by_slt = {}
            for idx, tree in enumerate(cls.math_tokens(content)):
                pmml = cls.isolate_pmml(tree)
                symbol_root = cls.convert_to_mathsymbol(pmml)
                if not symbol_root:
                    continue
                key = symbol_root.tostring()
                if key in trees_by_slt:
                    trees_by_slt[key].position.append(idx)
                else:
                    trees_by_slt[key] = SymbolTree(symbol_root, content_id, [idx])
            return list(trees_by_slt.values())

        except UnknownTagException as e:
            print("Unknown tag in file or query "+str(content_id)+": "+e.tag, file=sys.stderr)
            if missing_tags is not None:
                # A tuple, not a list: set members must be hashable.
                missing_tags.setdefault(e.tag, set()).add((content_id, idx))
            return []

        except Exception as err:
            reason = str(err)
            print("Parse error in file or query "+str(content_id)+": "+reason+": "+str(tree), file=sys.stderr)
            # pass on the exception to identify the document or query
            raise Exception(reason) from err
## @classmethod
## def convert_to_presentation_mml(cls, tex):
## url = 'http://halifax.cs.rit.edu:8324/'
## data = pickle.dumps(tex)
## r = req.get(url, data=data)
## # load the numpy array
## try:
## output = pickle.loads(r.text)
## output = output.decode('utf-8')
## return output
## except Exception as e:
## z = 9