diff --git a/parent/pom.xml b/parent/pom.xml index 7ef44a68..e80abc22 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -88,7 +88,7 @@ application while protecting against XSS. UTF-8 UTF-8 - 27.1-jre + 30.1-jre diff --git a/scripts/build_for_travis.sh b/scripts/build_for_travis.sh index 5400a2f5..1aec8fd0 100755 --- a/scripts/build_for_travis.sh +++ b/scripts/build_for_travis.sh @@ -35,7 +35,7 @@ if [ -n "$IS_LEGACY" ]; then else # Build the whole kit-n-kaboodle. mvn -f aggregate/pom.xml source:jar javadoc:jar verify $COMMON_FLAGS \ - && mvn -Dguava.version=27.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \ + && mvn -Dguava.version=30.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \ && mvn jacoco:report coveralls:report \ && mvn org.sonatype.ossindex.maven:ossindex-maven-plugin:audit -f aggregate $COMMON_FLAGS fi diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 4a2a601f..94fbde98 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -29,7 +29,9 @@ package org.owasp.html; import java.io.IOException; - +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import javax.annotation.Nullable; /** Encoders and decoders for HTML. */ @@ -94,6 +96,7 @@ static void stripBannedCodeunits(StringBuilder sb) { stripBannedCodeunits(sb, 0); } + @TCB private static void stripBannedCodeunits(StringBuilder sb, int start) { int k = start; @@ -108,13 +111,16 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) { if (i+1 < n) { char next = sb.charAt(i+1); if (Character.isSurrogatePair(ch, next)) { - sb.setCharAt(k++, ch); - sb.setCharAt(k++, next); + // The last two code points in each plane are non-characters that should be elided. 
+ if ((ch & 0xfc3f) != 0xd83f || (next & 0xfffe) != 0xdffe) { + sb.setCharAt(k++, ch); + sb.setCharAt(k++, next); + } ++i; } } continue; - } else if ((ch & 0xfffe) == 0xfffe) { + } else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) { continue; } } @@ -139,19 +145,34 @@ private static int longestPrefixOfGoodCodeunits(String s) { } } else if (0xd800 <= ch) { if (ch <= 0xdfff) { - if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) { - ++i; // Skip over low surrogate since we know it's ok. + if (i + 1 < n ) { + // could be a surrogate pair + char cn = s.charAt(i+1); + if( Character.isSurrogatePair(ch,cn) ) { + int cp = Character.toCodePoint(ch, cn); + // Could be a non-character + if ((cp & 0xfffe) == 0xfffe) { + // not valid + return i; + } + + // skip over trailing surrogate since we know it is OK + i++; + } else { + // not a surrogate pair + return i; + } } else { + // isolated surrogate at end of string return i; } - } else if ((ch & 0xfffe) == 0xfffe) { + } else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) { return i; } } } return -1; } - /** * Appends an encoded form of plainText to output where the encoding is * sufficient to prevent an HTML parser from interpreting any characters in @@ -196,6 +217,7 @@ static void encodePcdataOnto(String plainText, Appendable output) encodeHtmlOnto(plainText, output, "{"); } + /** * Appends an encoded form of plainText to putput where the encoding is * sufficient to prevent an HTML parser from transitioning out of the @@ -240,127 +262,94 @@ private static void encodeHtmlOnto( char ch = plainText.charAt(i); if (ch < REPLACEMENTS.length) { // Handles all ASCII. 
String repl = REPLACEMENTS[ch]; - if (ch == '{' && repl == null) { - if (i + 1 == n || plainText.charAt(i + 1) == '{') { - repl = braceReplacement; + if( repl==null ) { + if (ch == '{') { + if (i + 1 == n || plainText.charAt(i + 1) == '{') { + // "{{" detected, so use the brace replacement + repl = braceReplacement; + } + } + if (ch == '\r') { + // If this CR is followed by a LF, just remove it. Otherwise replace it with a LF. + if (i + 1 == n || plainText.charAt(i + 1) != '\n' ) { + // CR not followed by LF, so turn into LF + repl = "\n"; + } else { + // CRLF, so remove CR + repl = ""; + } } } if (repl != null) { output.append(plainText, pos, i).append(repl); pos = i + 1; } - } else if ((0x93A <= ch && ch <= 0xC4C) - && ( - // Devanagari vowel - ch <= 0x94F - // Benagli vowels - || 0x985 <= ch && ch <= 0x994 - || 0x9BE <= ch && ch < 0x9CC // 0x9CC (Bengali AU) is ok - || 0x9E0 <= ch && ch <= 0x9E3 - // Telugu vowels - || 0xC05 <= ch && ch <= 0xC14 - || 0xC3E <= ch && ch != 0xC48 /* 0xC48 (Telugu AI) is ok */)) { - // https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/ - // > So, ultimately, the full set of cases that cause the crash are: - // > Any sequence - // > in Devanagari, Bengali, and Telugu, where: ... - - // TODO: This is needed as of February 2018, but hopefully not long after that. - // We eliminate the ZWNJ which seems the minimally damaging thing to do to - // Telugu rendering per the article above: - // > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts. - - if (pos < i) { - if (plainText.charAt(i - 1) == 0x200C /* ZWNJ */) { - output.append(plainText, pos, i - 1); - // Drop the ZWNJ on the floor. 
- pos = i; - } - } else if (output instanceof StringBuilder) { - StringBuilder sb = (StringBuilder) output; - int len = sb.length(); - if (len != 0) { - if (sb.charAt(len - 1) == 0x200C /* ZWNJ */) { - sb.setLength(len - 1); - } - } - } - } else if (((char) 0xd800) <= ch) { - if (ch <= ((char) 0xdfff)) { - char next; - if (i + 1 < n - && Character.isSurrogatePair( - ch, next = plainText.charAt(i + 1))) { - // Emit supplemental codepoints as entity so that they cannot - // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper - // and get involved in UTF-16/UCS-2 confusion. - int codepoint = Character.toCodePoint(ch, next); - output.append(plainText, pos, i); + } else if (RISKY_NORMALIZATION.contains(ch)) { + // Application of unicode compatibility normalization produces a risky character. + output.append(plainText, pos, i); + pos = i + 1; + appendNumericEntity(ch,output); + } else if ((ch <= 0x9f) || (0xfdd0 <= ch && ch <= 0xfdef) || ((ch & 0xfffe) == 0xfffe)) { + // Elide C1 escapes and BMP non-characters. + output.append(plainText, pos, i); + pos = i + 1; + } else if (0xd800 <= ch && ch <= 0xdfff) { + // handle surrogates + char next; + if (i + 1 < n && Character.isSurrogatePair(ch, next = plainText.charAt(i + 1))) { + // Emit supplemental codepoints as entity so that they cannot + // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper + // and get involved in UTF-16/UCS-2 confusion. + int codepoint = Character.toCodePoint(ch, next); + output.append(plainText, pos, i); + // do not append 0xfffe and 0xffff from any plane + if( (codepoint & 0xfffe) != 0xfffe ) { appendNumericEntity(codepoint, output); - ++i; - pos = i + 1; - } else { - output.append(plainText, pos, i); - // Elide the orphaned surrogate. - pos = i + 1; } - } else if (0xfe60 <= ch) { - // Is a control character or possible full-width version of a - // special character, a BOM, or one of the FE60 block that might - // be elided or normalized to an HTML special character. 
- // Running - // cat NormalizationText.txt \ - // | perl -pe 's/ ?#.*//' \ - // | egrep '(;003C(;|$)|003E|0026|0022|0027|0060)' - // dumps a list of code-points that can normalize to HTML special - // characters. + ++i; + pos = i + 1; + } else { output.append(plainText, pos, i); + // Elide the orphaned surrogate. pos = i + 1; - if ((ch & 0xfffe) == 0xfffe) { - // Elide since not an the XML Character. - } else { - appendNumericEntity(ch, output); - } } - } else if (ch == '\u1FEF') { // Normalizes to backtick. - output.append(plainText, pos, i).append("`"); - pos = i + 1; } } output.append(plainText, pos, n); } + + /** + * Append a codepoint to the output as a numeric entity. + * + * @param codepoint the codepoint + * @param output the output + * + * @throws IOException if the output cannot be written to + * @throws IllegalArgumentException if the codepoint cannot be represented as a numeric escape. + */ @TCB static void appendNumericEntity(int codepoint, Appendable output) throws IOException { + if (((codepoint <= 0x1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) { + throw new IllegalArgumentException("Illegal numeric escape. Cannot represent control code: " + codepoint); + } + if ((0xfdd0 <= codepoint && codepoint <= 0xfdef) || ((codepoint & 0xfffe) == 0xfffe)) { + throw new IllegalArgumentException("Illegal numeric escape. Cannot represent non-character: " + codepoint); + } + output.append("&#"); if (codepoint < 100) { - // TODO: is this dead code due to REPLACEMENTS above. - if (codepoint < 10) { - output.append((char) ('0' + codepoint)); - } else { - output.append((char) ('0' + (codepoint / 10))); - output.append((char) ('0' + (codepoint % 10))); - } + // Below 100, a decimal representation is shortest + output.append(Integer.toString(codepoint)); } else { - int nDigits = (codepoint < 0x1000 - ? codepoint < 0x100 ? 2 : 3 - : (codepoint < 0x10000 ? 4 - : codepoint < 0x100000 ? 
5 : 6)); + // Append a hexadecimal value output.append('x'); - for (int digit = nDigits; --digit >= 0;) { - int hexDigit = (codepoint >>> (digit << 2)) & 0xf; - output.append(HEX_NUMERAL[hexDigit]); - } + output.append(Integer.toHexString(codepoint)); } output.append(";"); } - private static final char[] HEX_NUMERAL = { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', - }; - /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */ private static final String[] REPLACEMENTS = new String[0x80]; static { @@ -385,17 +374,41 @@ static void appendNumericEntity(int codepoint, Appendable output) REPLACEMENTS['>'] = ">"; // HTML special. REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. + REPLACEMENTS['\u007f'] = ""; // Elide delete } /** * IS_BANNED_ASCII[i] where is an ASCII control character codepoint (< 0x20) * is true for control characters that are not allowed in an XML source text. */ - private static boolean[] IS_BANNED_ASCII = new boolean[0x20]; + private static final boolean[] IS_BANNED_ASCII = new boolean[0x20]; static { for (int i = 0; i < IS_BANNED_ASCII.length; ++i) { IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r'); } } + /** Set of all Unicode characters which when processed with unicode compatibility decomposition will include a non-alphanumeric ascii character. */ + static final Set RISKY_NORMALIZATION; + static { + HashSet set = new HashSet(); + + // These characters all decompose riskily + String singles = "\u037e\u1fef\u203c\u207a\u208a\u2100\u2101\u2105\u2106\u2260\u226e\u226f\u33c2\u33c7\u33d8\ufb29\ufe10\ufe19\ufe30\ufe47\ufe48\ufe52"; + for(char ch : singles.toCharArray()) { + set.add(ch); + } + + // This string is composed of pairs of characters defining inclusive start and end ranges. 
+ String pairs = + "\u2024\u2026\u2047\u2049\u207c\u207e\u208c\u208e\u2474\u24b5\u2a74\u2a76\u3200\u321e\u3220\u3243\ufe13\ufe16\ufe33" + + "\ufe38\ufe4d\ufe50\ufe54\ufe57\ufe59\ufe5c\ufe5f\ufe66\ufe68\ufe6b\uff01\uff0f\uff1a\uff20\uff3b\uff40\uff5b\uff5e"; + for(int i=0;i' == input.charAt(end + 1)) { break; } else if ('>' == ch || '=' == ch - || Character.isWhitespace(ch)) { + || isAsciiWhitespace(ch)) { break; } else if ('"' == ch || '\'' == ch) { if (end + 1 < limit) { char ch2 = input.charAt(end + 1); - if (Character.isWhitespace(ch2) + if (isAsciiWhitespace(ch2) || ch2 == '>' || ch2 == '/') { ++end; break; @@ -554,7 +554,7 @@ private HtmlToken parseToken() { } else { // We skip whitespace tokens inside tag bodies. type = HtmlTokenType.IGNORABLE; - while (end < limit && Character.isWhitespace(input.charAt(end))) { + while (end < limit && isAsciiWhitespace(input.charAt(end))) { ++end; } } @@ -604,7 +604,7 @@ private HtmlToken parseToken() { ch = input.charAt(end); switch (state) { case TAGNAME: - if (Character.isWhitespace(ch) + if (isAsciiWhitespace(ch) || '>' == ch || '/' == ch || '<' == ch) { // End processing of an escape exempt block when we see // a corresponding end tag. @@ -749,6 +749,17 @@ private String canonicalElementName(int start, int end) { return HtmlLexer.canonicalElementName(input.substring(start, end)); } + /** + * Test if a character is an ASCII whitespace according to the HTML rules. Other Unicode whitespace characters do not count. 
+ * + * @param ch the character to test + * + * @return true if it is one of TAB, LF, FF, CR or SPACE + */ + private static boolean isAsciiWhitespace(int ch) { + return (ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\r') || (ch == '\f'); + } + private static boolean isIdentStart(char ch) { return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); } diff --git a/src/test/java/org/owasp/html/ElidedCharactersTest.java b/src/test/java/org/owasp/html/ElidedCharactersTest.java new file mode 100644 index 00000000..21e98d83 --- /dev/null +++ b/src/test/java/org/owasp/html/ElidedCharactersTest.java @@ -0,0 +1,143 @@ +package org.owasp.html; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import junit.framework.TestCase; +import org.junit.Test; + +/** + * Some characters should not appear in HTML documents, present risks for log-file injection, or are otherwise discouraged from sanitized HTML. This set of + * unit tests verifies that the inclusion of such characters does not allow dangerous code to slip through. + *

+ * There are two requirements: + *

+ * 1) The Encoding.encodeRcdataOnto method should remove discouraged characters. + * 2) Sanitized HTML should not change when a discouraged character is present in the input. + * + * @author Simon Greatrix on 25/01/2021. + */ +public class ElidedCharactersTest extends TestCase { + + /** List of all characters that are discouraged in HTML. */ + static List DISCOURAGED; + + + @Test + public static final void testRemoveDiscouragedCharacterFromTagStart() throws IOException { + //

is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = "<" + d+"h1>"; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in

of U+%06x", d.codePointAt(0)); + assertEquals(m, "<h1>", html); + } + + String html = Sanitizers.BLOCKS.sanitize("

"); + assertEquals("

",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromInsideTag() throws IOException { + // is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = ""; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in

of U+%06x", d.codePointAt(0)); + assertEquals(m, "", html); + } + + String html = Sanitizers.BLOCKS.sanitize("

"); + assertEquals("

",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromTagEnd() throws IOException { + // is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = ""; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in

of U+%06x", d.codePointAt(0)); + assertEquals(m, "", html); + } + + String html = Sanitizers.BLOCKS.sanitize("

"); + assertEquals("

",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromEndWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = "Hello" + d; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + @Test + public static final void testRemoveDiscouragedCharacterFromMiddleWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = "Hel" + d + "lo"; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + @Test + public static final void testRemoveDiscouragedCharacterFromStartWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = d + "Hello"; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + static { + ArrayList list = new ArrayList(); + + // C0 control characters banned by XML, excluding the whitespace characters TAB (0x9), LF (0xa), FF (0xc) and CR (0xd) + for (char i = 0; i <= 0x1f; i++) { + if (i != 0x9 && i != 0xa && i != 0xd && i!=0xc) { + list.add(Character.toString(i)); + } + } + + // Delete character and C1 escapes which are discouraged by XML and banned as HTML numeric escapes. Also discouraging the U+0085 NEL characters. + for (char i = 0x7f; i <= 0x9f; i++) { + list.add(Character.toString(i)); + } + + // Isolated surrogates. NB Must also test that valid non-isolated surrogates are retained. + for (char i = 0xd800; i <= 0xdfff; i++) { + list.add(Character.toString(i)); + } + + // BMP non-characters U+FDD0 to U+FDEF.
+ for (char i = 0xfdd0; i <= 0xfdef; i++) { + list.add(Character.toString(i)); + } + + list.add(Character.toString((char) 0xfffe)); + list.add(Character.toString((char) 0xffff)); + + // Non-characters from the supplemental planes + for (int i = 1; i <= 16; i++) { + list.add(new String(Character.toChars(0x10000 * i + 0xfffe))); + list.add(new String(Character.toChars(0x10000 * i + 0xffff))); + } + + DISCOURAGED = Collections.unmodifiableList(list); + } + +} diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java index eea7769a..86223db3 100644 --- a/src/test/java/org/owasp/html/EncodingTest.java +++ b/src/test/java/org/owasp/html/EncodingTest.java @@ -28,6 +28,11 @@ package org.owasp.html; +import java.io.IOException; +import java.text.Normalizer; +import java.text.Normalizer.Form; +import java.util.HashSet; + import org.junit.Test; import junit.framework.TestCase; @@ -207,6 +212,29 @@ public static final void testDecodeHtml() { assertEquals( "&bogus;", Encoding.decodeHtml("&bogus;")); + + assertEquals( + "lt<", + Encoding.decodeHtml("lt<")); + assertEquals( + "ltlt;", + Encoding.decodeHtml("ltlt;")); + assertEquals( + "lt<", + Encoding.decodeHtml("lt&lt;")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<")); + + assertEquals( + "lt&<gt", + Encoding.decodeHtml("\ufdddlt&&l\ufffet;\udc9c\ud835gt")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<\udc9c")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<\ud835")); } @Test @@ -214,9 +242,10 @@ public static final void testAppendNumericEntityAndEncodeOnto() throws Exception { StringBuilder sb = new StringBuilder(); StringBuilder cps = new StringBuilder(); + // Test with a set of legal code points for (int codepoint : new int[] { - 0, 9, '\n', '@', 0x80, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xffff, - 0x10000, Character.MAX_CODE_POINT }) { + 9, '\n', '@', 0xa0, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xfffd, + 0x10000, Character.MAX_CODE_POINT-2 }) { 
Encoding.appendNumericEntity(codepoint, sb); sb.append(' '); @@ -224,18 +253,43 @@ public static final void testAppendNumericEntityAndEncodeOnto() } assertEquals( - "� @ € ÿ Ā ࿿ က " - + "ሺ ￿ 𐀀 􏿿 ", + " @   ÿ Ā ࿿ က " + + "ሺ � 𐀀 􏿽 ", sb.toString()); StringBuilder out = new StringBuilder(); Encoding.encodeHtmlAttribOnto(cps.toString(), out); assertEquals( - " \t \n @ \u0080 \u00ff \u0100 \u0fff \u1000 " - + "\u123a 𐀀 􏿿 ", + "\t \n @ \u00a0 \u00ff \u0100 \u0fff \u1000 " + + "\u123a \ufffd 𐀀 􏿽 ", out.toString()); } + @Test + public static final void testAppendIllegalNumericEntityAndEncodeOnto() + throws Exception { + StringBuilder sb = new StringBuilder(); + StringBuilder cps = new StringBuilder(); + // Test with a set of legal code points + for (int codepoint : new int[] { 8, '\r', 0x7f, 0x85, 0xfdd0, 0xfffe, 0x1fffe, 0x3ffff }) { + try { + Encoding.appendNumericEntity(codepoint, sb); + fail("Illegal character was accepted: "+codepoint); + } catch ( IllegalArgumentException e ) { + // expected behaviour + } + + cps.appendCodePoint(codepoint).append(','); + } + + assertEquals("", sb.toString()); + + StringBuilder out = new StringBuilder(); + Encoding.encodeHtmlAttribOnto(cps.toString(), out); + assertEquals( + ",\n,,,,,,,", + out.toString()); + } @Test public static final void testAngularJsBracesInTextNode() throws Exception { StringBuilder sb = new StringBuilder(); @@ -276,9 +330,21 @@ public static final void testStripBannedCodeunits() { assertStripped("foo\ud800\udc00bar", "foo\udc00\ud800\udc00bar"); assertStripped("foo\ud834\udd1ebar", "foo\ud834\udd1ebar"); assertStripped("foo\ud834\udd1e", "foo\ud834\udd1e"); - assertStripped("\uffef\ufffd", "\uffef\ufffd\ufffe\uffff"); + + // Check stripping of non-characters from all planes + for(int i=0;i<=16;i++) { + int o = 0x10000 * i; + String s = new StringBuilder().append(String.format("%02x",i)).appendCodePoint(o+0xffef).appendCodePoint(o+0xfffd) + .appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).toString(); + 
String t = s.substring(0,(i==0)?4:6); + assertStripped(t,s); + + s = new StringBuilder().append("foo").appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).append("bar").toString(); + assertStripped("foobar",s); + } } + @Test public static final void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes() @@ -305,4 +371,66 @@ void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes() Encoding.encodeHtmlAttribOnto("a nonce=xyz ", attrib); assertEquals("a nonce=xyz ", attrib.toString()); } + + @Test + public static final void testRiskyNormalizationSetContents() { + // Test that the risky normalization set contains the expected values + for(char toTest='\u0080'; toTest<'\ufffe'; toTest++) { + boolean isRisky = false; + String decomposed = Normalizer.normalize(Character.toString(toTest), Form.NFKD); + for(int i=0;i\u200C\u09C1", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "\u0C1C\u0C4D\u0C1E\u200C\u0C3E", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "\u09B8\u09CD\u09B0\u200C\u09C1", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "జ్ఞ‌ా", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "జ్ఞ‌ా", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "স্র‌ু", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "স্র‌ু", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "\u0915\u094D\u0930\u200C\u093E", - "\u0915\u094D\u0930\u093E", - }, - }; - - for (int i = 0, n = tests.length; i < n; ++i) { - String[] test = tests[i]; - assertEquals(i + " : " + test[0], test[1], sanitize(test[0])); - } - } private static String sanitize(@Nullable String html) { StringBuilder sb = new StringBuilder(); diff --git a/src/test/java/org/owasp/html/SanitizersTest.java b/src/test/java/org/owasp/html/SanitizersTest.java index c75fbcb4..32092d20 100644 --- a/src/test/java/org/owasp/html/SanitizersTest.java +++ b/src/test/java/org/owasp/html/SanitizersTest.java @@ -313,7 +313,10 @@ public static final void testScriptInTable() { .and(Sanitizers.STYLES) .and(Sanitizers.IMAGES) .and(Sanitizers.TABLES); - assertEquals("
Hallo\r\n\nEnde\n\r", pf.sanitize(input)); + // The CRLF after "Hallo" becomes LF + // The LF before "Ende" becomes LF + // The LF CR after "Ende" becomes LF LF + assertEquals("
Hallo\n\nEnde\n\n", pf.sanitize(input)); } @Test