' == input.charAt(end + 1)) {
break;
} else if ('>' == ch || '=' == ch
- || Character.isWhitespace(ch)) {
+ || isAsciiWhitespace(ch)) {
break;
} else if ('"' == ch || '\'' == ch) {
if (end + 1 < limit) {
char ch2 = input.charAt(end + 1);
- if (Character.isWhitespace(ch2)
+ if (isAsciiWhitespace(ch2)
|| ch2 == '>' || ch2 == '/') {
++end;
break;
@@ -554,7 +554,7 @@ private HtmlToken parseToken() {
} else {
// We skip whitespace tokens inside tag bodies.
type = HtmlTokenType.IGNORABLE;
- while (end < limit && Character.isWhitespace(input.charAt(end))) {
+ while (end < limit && isAsciiWhitespace(input.charAt(end))) {
++end;
}
}
@@ -604,7 +604,7 @@ private HtmlToken parseToken() {
ch = input.charAt(end);
switch (state) {
case TAGNAME:
- if (Character.isWhitespace(ch)
+ if (isAsciiWhitespace(ch)
|| '>' == ch || '/' == ch || '<' == ch) {
// End processing of an escape exempt block when we see
// a corresponding end tag.
@@ -749,6 +749,17 @@ private String canonicalElementName(int start, int end) {
return HtmlLexer.canonicalElementName(input.substring(start, end));
}
+ /**
+ * Test if a character is an ASCII whitespace according to the HTML rules. Other Unicode whitespace characters do not count.
+ *
+ * @param ch the character to test
+ *
+ * @return true if it is one of TAB, LF, FF, CR or SPACE
+ */
+ private static boolean isAsciiWhitespace(int ch) {
+ return (ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\r') || (ch == '\f');
+ }
+
private static boolean isIdentStart(char ch) {
return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
}
diff --git a/src/test/java/org/owasp/html/ElidedCharactersTest.java b/src/test/java/org/owasp/html/ElidedCharactersTest.java
new file mode 100644
index 00000000..21e98d83
--- /dev/null
+++ b/src/test/java/org/owasp/html/ElidedCharactersTest.java
@@ -0,0 +1,143 @@
+package org.owasp.html;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import junit.framework.TestCase;
+import org.junit.Test;
+
+/**
+ * Some characters should not appear in HTML documents, present risks for log-file injection, or are otherwise discouraged from sanitized HTML. This set of
+ * unit tests verifies that the inclusion of such characters does not allow dangerous code to slip through.
+ *
+ * There are two requirements:
+ *
+ * 1) The Encoding.encodePcdataOnto method should remove discouraged characters.
+ * 2) Sanitized HTML should not change
+ *
+ * @author Simon Greatrix on 25/01/2021.
+ */
+public class ElidedCharactersTest extends TestCase {
+
+ /** List of all characters that are discouraged in HTML. */
+ static List DISCOURAGED;
+
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromTagStart() throws IOException {
+ //
is an unrecognised tag and an unmatched end tag
+ for (String d : DISCOURAGED) {
+ String test = "<" + d+"h1>";
+ String html = Sanitizers.BLOCKS.sanitize(test);
+ String m = String.format("Use in of U+%06x", d.codePointAt(0));
+ assertEquals(m, "<h1>", html);
+ }
+
+ String html = Sanitizers.BLOCKS.sanitize("");
+ assertEquals("",html);
+ }
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromInsideTag() throws IOException {
+ //
is an unrecognised tag and an unmatched end tag
+ for (String d : DISCOURAGED) {
+ String test = "";
+ String html = Sanitizers.BLOCKS.sanitize(test);
+ String m = String.format("Use in of U+%06x", d.codePointAt(0));
+ assertEquals(m, "", html);
+ }
+
+ String html = Sanitizers.BLOCKS.sanitize("");
+ assertEquals("",html);
+ }
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromTagEnd() throws IOException {
+ //
is an unrecognised tag and an unmatched end tag
+ for (String d : DISCOURAGED) {
+ String test = "";
+ String html = Sanitizers.BLOCKS.sanitize(test);
+ String m = String.format("Use in of U+%06x", d.codePointAt(0));
+ assertEquals(m, "", html);
+ }
+
+ String html = Sanitizers.BLOCKS.sanitize("");
+ assertEquals("",html);
+ }
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromEndWhenEncoding() throws IOException {
+ for (String d : DISCOURAGED) {
+ String test = "Hello" + d;
+ StringBuilder builder = new StringBuilder();
+ Encoding.encodePcdataOnto(test, builder);
+ String m = String.format("Elision of U+%06x", d.codePointAt(0));
+ assertEquals(m, "Hello", builder.toString());
+ }
+ }
+
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromMiddleWhenEncoding() throws IOException {
+ for (String d : DISCOURAGED) {
+ String test = "Hel" + d + "lo";
+ StringBuilder builder = new StringBuilder();
+ Encoding.encodePcdataOnto(test, builder);
+ String m = String.format("Elision of U+%06x", d.codePointAt(0));
+ assertEquals(m, "Hello", builder.toString());
+ }
+ }
+
+
+ @Test
+ public static final void testRemoveDiscouragedCharacterFromStartWhenEncoding() throws IOException {
+ for (String d : DISCOURAGED) {
+ String test = d + "Hello";
+ StringBuilder builder = new StringBuilder();
+ Encoding.encodePcdataOnto(test, builder);
+ String m = String.format("Elision of U+%06x", d.codePointAt(0));
+ assertEquals(m, "Hello", builder.toString());
+ }
+ }
+
+
+ static {
+ ArrayList list = new ArrayList();
+
+ // C0 control characters banned by XML, except for the four HTML whitespace characters (TAB, LF, FF and CR)
+ for (char i = 0; i <= 0x1f; i++) {
+ if (i != 0x9 && i != 0xa && i != 0xd && i!=0xc) {
+ list.add(Character.toString(i));
+ }
+ }
+
+ // Delete character and C1 escapes which are discouraged by XML and banned as HTML numeric escapes. Also discouraging the U+0085 NEL characters.
+ for (char i = 0x7f; i <= 0x9f; i++) {
+ list.add(Character.toString(i));
+ }
+
+ // Isolated surrogates. NB Must also test that valid non-isolated surrogates are retained.
+ for (char i = 0xd800; i <= 0xdfff; i++) {
+ list.add(Character.toString(i));
+ }
+
+ // Non-characters U+FDD0 to U+FDEF from the Basic Multilingual Plane.
+ for (char i = 0xfdd0; i <= 0xfdef; i++) {
+ list.add(Character.toString(i));
+ }
+
+ list.add(Character.toString((char) 0xfffe));
+ list.add(Character.toString((char) 0xffff));
+
+ // Non-characters from the supplemental planes
+ for (int i = 1; i <= 16; i++) {
+ list.add(new String(Character.toChars(0x10000 * i + 0xfffe)));
+ list.add(new String(Character.toChars(0x10000 * i + 0xffff)));
+ }
+
+ DISCOURAGED = Collections.unmodifiableList(list);
+ }
+
+}
diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java
index eea7769a..86223db3 100644
--- a/src/test/java/org/owasp/html/EncodingTest.java
+++ b/src/test/java/org/owasp/html/EncodingTest.java
@@ -28,6 +28,11 @@
package org.owasp.html;
+import java.io.IOException;
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.HashSet;
+
import org.junit.Test;
import junit.framework.TestCase;
@@ -207,6 +212,29 @@ public static final void testDecodeHtml() {
assertEquals(
"&bogus;",
Encoding.decodeHtml("&bogus;"));
+
+ assertEquals(
+ "lt<",
+ Encoding.decodeHtml("lt<"));
+ assertEquals(
+ "ltlt;",
+ Encoding.decodeHtml("ltlt;"));
+ assertEquals(
+ "lt<",
+ Encoding.decodeHtml("lt<"));
+ assertEquals(
+ "lt&<",
+ Encoding.decodeHtml("lt&<"));
+
+ assertEquals(
+ "lt&<gt",
+ Encoding.decodeHtml("\ufdddlt&&l\ufffet;\udc9c\ud835gt"));
+ assertEquals(
+ "lt&<",
+ Encoding.decodeHtml("lt&<\udc9c"));
+ assertEquals(
+ "lt&<",
+ Encoding.decodeHtml("lt&<\ud835"));
}
@Test
@@ -214,9 +242,10 @@ public static final void testAppendNumericEntityAndEncodeOnto()
throws Exception {
StringBuilder sb = new StringBuilder();
StringBuilder cps = new StringBuilder();
+ // Test with a set of legal code points
for (int codepoint : new int[] {
- 0, 9, '\n', '@', 0x80, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xffff,
- 0x10000, Character.MAX_CODE_POINT }) {
+ 9, '\n', '@', 0xa0, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xfffd,
+ 0x10000, Character.MAX_CODE_POINT-2 }) {
Encoding.appendNumericEntity(codepoint, sb);
sb.append(' ');
@@ -224,18 +253,43 @@ public static final void testAppendNumericEntityAndEncodeOnto()
}
assertEquals(
- "
@ ÿ Ā က "
- + "ሺ 𐀀 ",
+ "
@ ÿ Ā က "
+ + "ሺ � 𐀀 ",
sb.toString());
StringBuilder out = new StringBuilder();
Encoding.encodeHtmlAttribOnto(cps.toString(), out);
assertEquals(
- " \t \n @ \u0080 \u00ff \u0100 \u0fff \u1000 "
- + "\u123a 𐀀 ",
+ "\t \n @ \u00a0 \u00ff \u0100 \u0fff \u1000 "
+ + "\u123a \ufffd 𐀀 ",
out.toString());
}
+ @Test
+ public static final void testAppendIllegalNumericEntityAndEncodeOnto()
+ throws Exception {
+ StringBuilder sb = new StringBuilder();
+ StringBuilder cps = new StringBuilder();
+ // Test with a set of illegal code points, each of which appendNumericEntity must reject
+ for (int codepoint : new int[] { 8, '\r', 0x7f, 0x85, 0xfdd0, 0xfffe, 0x1fffe, 0x3ffff }) {
+ try {
+ Encoding.appendNumericEntity(codepoint, sb);
+ fail("Illegal character was accepted: "+codepoint);
+ } catch ( IllegalArgumentException e ) {
+ // expected behaviour
+ }
+
+ cps.appendCodePoint(codepoint).append(',');
+ }
+
+ assertEquals("", sb.toString());
+
+ StringBuilder out = new StringBuilder();
+ Encoding.encodeHtmlAttribOnto(cps.toString(), out);
+ assertEquals(
+ ",\n,,,,,,,",
+ out.toString());
+ }
@Test
public static final void testAngularJsBracesInTextNode() throws Exception {
StringBuilder sb = new StringBuilder();
@@ -276,9 +330,21 @@ public static final void testStripBannedCodeunits() {
assertStripped("foo\ud800\udc00bar", "foo\udc00\ud800\udc00bar");
assertStripped("foo\ud834\udd1ebar", "foo\ud834\udd1ebar");
assertStripped("foo\ud834\udd1e", "foo\ud834\udd1e");
- assertStripped("\uffef\ufffd", "\uffef\ufffd\ufffe\uffff");
+
+ // Check stripping of non-characters from all planes
+ for(int i=0;i<=16;i++) {
+ int o = 0x10000 * i;
+ String s = new StringBuilder().append(String.format("%02x",i)).appendCodePoint(o+0xffef).appendCodePoint(o+0xfffd)
+ .appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).toString();
+ String t = s.substring(0,(i==0)?4:6);
+ assertStripped(t,s);
+
+ s = new StringBuilder().append("foo").appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).append("bar").toString();
+ assertStripped("foobar",s);
+ }
}
+
@Test
public static final
void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes()
@@ -305,4 +371,66 @@ void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes()
Encoding.encodeHtmlAttribOnto("a nonce=xyz ", attrib);
assertEquals("a nonce=xyz ", attrib.toString());
}
+
+ @Test
+ public static final void testRiskyNormalizationSetContents() {
+ // Test that the risky normalization set contains the expected values
+ for(char toTest='\u0080'; toTest<'\ufffe'; toTest++) {
+ boolean isRisky = false;
+ String decomposed = Normalizer.normalize(Character.toString(toTest), Form.NFKD);
+ for(int i=0;i\u200C\u09C1",
- "\u09B8\u09CD\u09B0\u09C1",
- },
- {
- "\u0C1C\u0C4D\u0C1E\u200C\u0C3E",
- "\u0C1C\u0C4D\u0C1E\u0C3E",
- },
- {
- "\u09B8\u09CD\u09B0\u200C\u09C1",
- "\u09B8\u09CD\u09B0\u09C1",
- },
- {
- "జ్ఞా",
- "\u0C1C\u0C4D\u0C1E\u0C3E",
- },
- {
- "జ్ఞా",
- "\u0C1C\u0C4D\u0C1E\u0C3E",
- },
- {
- "স্রু",
- "\u09B8\u09CD\u09B0\u09C1",
- },
- {
- "স্রু",
- "\u09B8\u09CD\u09B0\u09C1",
- },
- {
- "\u0915\u094D\u0930\u200C\u093E",
- "\u0915\u094D\u0930\u093E",
- },
- };
-
- for (int i = 0, n = tests.length; i < n; ++i) {
- String[] test = tests[i];
- assertEquals(i + " : " + test[0], test[1], sanitize(test[0]));
- }
- }
private static String sanitize(@Nullable String html) {
StringBuilder sb = new StringBuilder();
diff --git a/src/test/java/org/owasp/html/SanitizersTest.java b/src/test/java/org/owasp/html/SanitizersTest.java
index c75fbcb4..32092d20 100644
--- a/src/test/java/org/owasp/html/SanitizersTest.java
+++ b/src/test/java/org/owasp/html/SanitizersTest.java
@@ -313,7 +313,10 @@ public static final void testScriptInTable() {
.and(Sanitizers.STYLES)
.and(Sanitizers.IMAGES)
.and(Sanitizers.TABLES);
- assertEquals("Hallo\r\n\nEnde\n\r", pf.sanitize(input));
+ // The CRLF after "Hallo" becomes LF
+ // The LF before "Ende" becomes LF
+ // The LF CR after "Ende" becomes LF LF
+ assertEquals("Hallo\n\nEnde\n\n", pf.sanitize(input));
}
@Test