/
XmlFile.java
337 lines (313 loc) · 12.3 KB
/
XmlFile.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
/*
* Copyright (C) 2015 - present by OpenGamma Inc. and the OpenGamma group of companies
*
* Please see distribution for license.
*/
package com.opengamma.strata.collect.io;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.Map;
import java.util.function.ToIntFunction;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.ByteSource;
import com.opengamma.strata.collect.ArgChecker;
import com.opengamma.strata.collect.Unchecked;
/**
* An XML file.
* <p>
* Represents an XML file together with the ability to parse it from a {@link ByteSource}.
* <p>
* This uses the standard StAX API to parse the file.
* Once parsed, the XML is represented as a DOM-like structure, see {@link XmlElement}.
* This approach is suitable for XML files where the size of the parsed XML file is
* known to be manageable in memory.
* <p>
* Note that the {@link XmlElement} representation does not express all XML features.
* No support is provided for processing instructions, comments or mixed content.
* In addition, it is not possible to determine the difference between empty content and no children.
* <p>
* There is no support for namespaces.
* All namespace prefixes are dropped.
* There are cases where this can be a problem, but most of the time lenient parsing is helpful.
*/
public final class XmlFile {
/**
 * The root element of the parsed XML document.
 */
private final XmlElement root;
/**
 * The map of references, keyed by the value of the reference attribute.
 * <p>
 * Each value is the parsed element on which that attribute value was found.
 */
private final ImmutableMap<String, XmlElement> refs;
//-----------------------------------------------------------------------
/**
 * Parses the specified source as an XML file to an in-memory DOM-like structure.
 * <p>
 * The bytes of the source are expected to be in XML file format.
 * Once parsed, the root element can be obtained from the returned instance.
 * <p>
 * This delegates to {@link #of(ByteSource, String)}, passing the empty string
 * as the reference attribute name.
 *
 * @param source the XML source data
 * @return the parsed file
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public static XmlFile of(ByteSource source) {
  // the empty string is used as the reference attribute name
  String refAttrName = "";
  return of(source, refAttrName);
}
/**
 * Parses the specified source as an XML file to an in-memory DOM-like structure.
 * <p>
 * This parses the specified byte source expecting an XML file format.
 * The resulting instance can be queried for the root element.
 * <p>
 * This supports capturing attribute references, such as an id/href pair.
 * Wherever the parser finds an attribute with the specified name, the element is added
 * to the internal map, accessible by calling {@link #getReferences()}.
 * <p>
 * For example, if one part of the XML has {@code <foo id="fooId">}, the references map will
 * contain an entry mapping "fooId" to the parsed element {@code <foo>}.
 * <p>
 * Any {@code XMLStreamException} raised while creating or closing the StAX reader
 * is converted to an {@code IllegalArgumentException}, in the same way as an
 * exception raised while walking the document.
 *
 * @param source the XML source data
 * @param refAttrName the attribute name that should be parsed as a reference
 * @return the parsed file
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public static XmlFile of(ByteSource source, String refAttrName) {
  ArgChecker.notNull(source, "source");
  return Unchecked.wrap(() -> {
    try (InputStream in = source.openBufferedStream()) {
      XMLStreamReader xmlReader = xmlInputFactory().createXMLStreamReader(in);
      try {
        // mutable map populated during parsing, copied by the XmlFile constructor
        Map<String, XmlElement> refs = new HashMap<>();
        XmlElement root = parse(xmlReader, refAttrName, refs);
        return new XmlFile(root, refs);
      } finally {
        // XMLStreamReader is not AutoCloseable, so it is closed explicitly
        xmlReader.close();
      }
    } catch (XMLStreamException ex) {
      // reader creation and close can both fail with XMLStreamException,
      // report them using the documented exception type
      throw new IllegalArgumentException(ex);
    }
  });
}
/**
 * Parses the element names and structure from the specified XML, filtering to reduce memory usage.
 * <p>
 * This parses the specified byte source expecting an XML file format.
 * Only element names and nesting are captured, attributes and text content are not retained.
 * <p>
 * The filter function takes the element name and decides how many child levels should be returned in the response.
 * Always returning {@code Integer.MAX_VALUE} will not filter the children.
 * For example, a function could check if the name is "trade" and return only the immediate children by returning 1.
 * A negative result from the filter function is treated as zero.
 * <p>
 * Any {@code XMLStreamException} raised while creating or closing the StAX reader
 * is converted to an {@code IllegalArgumentException}, in the same way as an
 * exception raised while walking the document.
 *
 * @param source the XML source data
 * @param filterFn the filter function to use
 * @return the root element of the parsed structure
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public static XmlElement parseElements(ByteSource source, ToIntFunction<String> filterFn) {
  ArgChecker.notNull(source, "source");
  ArgChecker.notNull(filterFn, "filterFn");
  // clamp negative levels to zero so the level arithmetic only sees non-negative values
  ToIntFunction<String> safeFilterFn = name -> Math.max(filterFn.applyAsInt(name), 0);
  return Unchecked.wrap(() -> {
    try (InputStream in = source.openBufferedStream()) {
      XMLStreamReader xmlReader = xmlInputFactory().createXMLStreamReader(in);
      try {
        // Integer.MAX_VALUE marks the root as unfiltered
        return parseElements(xmlReader, safeFilterFn, Integer.MAX_VALUE);
      } finally {
        // XMLStreamReader is not AutoCloseable, so it is closed explicitly
        xmlReader.close();
      }
    } catch (XMLStreamException ex) {
      // reader creation and close can both fail with XMLStreamException,
      // report them using the documented exception type
      throw new IllegalArgumentException(ex);
    }
  });
}
//-------------------------------------------------------------------------
/**
 * Parses one element, and everything nested within it, from the StAX stream reader.
 * <p>
 * The reader should be created using the factory returned from {@link #xmlInputFactory()}.
 * <p>
 * The reader is advanced to the next start element if it is not already on one.
 * Events are then consumed up to the matching end element, recursing for each child element.
 * If the element has at least one child element, any text content is discarded.
 * <p>
 * Attribute references, such as an id/href pair, can be captured.
 * Whenever an element has an attribute with the specified name, the element is stored
 * in the specified map under the attribute value. Note that the map is mutated.
 *
 * @param reader the StAX stream reader, positioned at or before the element to be parsed
 * @param refAttr the attribute name that should be parsed as a reference, null if not applicable
 * @param refs the mutable map of references to update, null if not applicable
 * @return the parsed element
 * @throws IllegalArgumentException if the input cannot be parsed
 */
private static XmlElement parse(XMLStreamReader reader, String refAttr, Map<String, XmlElement> refs) {
  try {
    // locate the start element, then capture its name and attributes
    String name = parseElementName(reader);
    ImmutableMap<String, String> attributes = parseAttributes(reader);

    // walk the events up to the end of this element
    ImmutableList.Builder<XmlElement> childElements = ImmutableList.builder();
    StringBuilder text = new StringBuilder();
    for (int type = reader.next(); type != XMLStreamConstants.END_ELEMENT; type = reader.next()) {
      if (type == XMLStreamConstants.START_ELEMENT) {
        // nested element, the recursive call consumes through its end element
        childElements.add(parse(reader, refAttr, refs));
      } else if (type == XMLStreamConstants.CHARACTERS || type == XMLStreamConstants.CDATA) {
        // the factory sets IS_COALESCING=true, so a single text event is expected here
        text.append(reader.getText());
      }
      // all other event types are ignored
    }

    // an element is represented either by its children or by its text content
    ImmutableList<XmlElement> children = childElements.build();
    XmlElement element;
    if (children.isEmpty()) {
      element = XmlElement.ofContent(name, attributes, text.toString());
    } else {
      element = XmlElement.ofChildren(name, attributes, children);
    }

    // record the element if it carries the reference attribute
    String refKey = attributes.get(refAttr);
    if (refKey != null) {
      refs.put(refKey, element);
    }
    return element;

  } catch (XMLStreamException ex) {
    throw new IllegalArgumentException(ex);
  }
}
// parses the element names and nesting from the input, dropping children as directed by the filter
// a level of Integer.MAX_VALUE means "not yet filtered", in which case the filter function is
// asked (using the name of the current element) how many levels of children to keep
// otherwise the level simply counts down by one for each level of nesting
// children are always parsed, so that the reader advances, but only kept if their level is positive
private static XmlElement parseElements(XMLStreamReader reader, ToIntFunction<String> filterFn, int currentLevel) {
  try {
    // locate the start element
    String name = parseElementName(reader);

    // walk the events up to the end of this element, only start elements are of interest
    ImmutableList.Builder<XmlElement> retained = ImmutableList.builder();
    for (int type = reader.next(); type != XMLStreamConstants.END_ELEMENT; type = reader.next()) {
      if (type != XMLStreamConstants.START_ELEMENT) {
        continue;
      }
      int childLevel;
      if (currentLevel == Integer.MAX_VALUE) {
        childLevel = filterFn.applyAsInt(name);
      } else {
        childLevel = currentLevel - 1;
      }
      XmlElement child = parseElements(reader, filterFn, childLevel);
      if (childLevel > 0) {
        retained.add(child);
      }
    }

    // no retained children results in an element with empty content
    ImmutableList<XmlElement> children = retained.build();
    if (children.isEmpty()) {
      return XmlElement.ofContent(name, "");
    }
    return XmlElement.ofChildren(name, children);

  } catch (XMLStreamException ex) {
    throw new IllegalArgumentException(ex);
  }
}
// advances the reader to a start element, unless it is already on one, and returns the local name
// using the local name means that any namespace prefix is not included
private static String parseElementName(XMLStreamReader reader) throws XMLStreamException {
  if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) {
    while (reader.next() != XMLStreamConstants.START_ELEMENT) {
      // keep advancing until a start element is reached
    }
  }
  return reader.getLocalName();
}
// collects the attributes of the current start element into a map of local name to value
// namespace declarations are counted when deciding on the fast path, but are never added to the map
private static ImmutableMap<String, String> parseAttributes(XMLStreamReader reader) {
  int attrCount = reader.getAttributeCount();
  // fast path when the element declares neither attributes nor namespaces
  if (attrCount + reader.getNamespaceCount() == 0) {
    return ImmutableMap.of();
  }
  ImmutableMap.Builder<String, String> collected = ImmutableMap.builder();
  for (int index = 0; index < attrCount; index++) {
    String localName = reader.getAttributeLocalName(index);
    String value = reader.getAttributeValue(index);
    collected.put(localName, value);
  }
  return collected.build();
}
//-------------------------------------------------------------------------
// creates the XML input factory, recreated each time to avoid JDK-8028111
// this also provides some protection against hackers attacking XML
// (DTD support and external entity support are both switched off below)
private static XMLInputFactory xmlInputFactory() {
// see https://bugs.openjdk.java.net/browse/JDK-8183519 where JDK deprecated the wrong method
// to avoid a warning on 9 this code uses newInstance() even though newFactory() is more correct
// there is no difference in behavior between the two methods
XMLInputFactory factory = XMLInputFactory.newInstance();
// coalescing is relied on by the content parsing, which expects a single text event per element
factory.setProperty(XMLInputFactory.IS_COALESCING, true);
factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true);
// hardening: no external entities and no DTD support
factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
return factory;
}
//-------------------------------------------------------------------------
/**
 * Restricted constructor.
 * <p>
 * The root is passed through {@code ArgChecker.notNull} and the reference map
 * is copied using {@code ImmutableMap.copyOf}, so later changes to the
 * specified map do not affect this instance.
 *
 * @param root the root element
 * @param refs the map of references, which is copied
 */
private XmlFile(XmlElement root, Map<String, XmlElement> refs) {
this.root = ArgChecker.notNull(root, "root");
this.refs = ImmutableMap.copyOf(refs);
}
//-------------------------------------------------------------------------
/**
 * Gets the root element of this file.
 * <p>
 * The root element is the entry point to the parsed DOM-like structure.
 *
 * @return the root element
 */
public XmlElement getRoot() {
  return this.root;
}
/**
 * Gets the reference map of id to element.
 * <p>
 * The map is used to decode references, such as an id/href pair.
 * The key is the value of the reference attribute and the value is the element it was found on.
 * <p>
 * For example, if one part of the XML has {@code <foo id="fooId">}, the map will
 * contain an entry mapping "fooId" to the parsed element {@code <foo>}.
 *
 * @return the map of id to element
 */
public ImmutableMap<String, XmlElement> getReferences() {
  return this.refs;
}
//-------------------------------------------------------------------------
/**
 * Checks if this file equals another.
 * <p>
 * Two files are equal if both the root element and the reference map are equal.
 *
 * @param obj the other object, null returns false
 * @return true if equal
 */
@Override
public boolean equals(Object obj) {
  // same instance
  if (this == obj) {
    return true;
  }
  // null or a different type
  if (!(obj instanceof XmlFile)) {
    return false;
  }
  XmlFile that = (XmlFile) obj;
  return root.equals(that.root) && refs.equals(that.refs);
}
/**
 * Returns a suitable hash code for the file.
 * <p>
 * The hash code combines the hash codes of the root element and the reference map.
 *
 * @return the hash code
 */
@Override
public int hashCode() {
  int rootHash = root.hashCode();
  int refsHash = refs.hashCode();
  return rootHash ^ refsHash;
}
/**
 * Returns a string describing the file.
 * <p>
 * The description is that of the root element, the reference map is not included.
 *
 * @return the descriptive string
 */
@Override
public String toString() {
  String description = root.toString();
  return description;
}
}