From 4077b1990a46179415597f56090729013a4b2a60 Mon Sep 17 00:00:00 2001 From: Simon Greatrix Date: Fri, 29 Jan 2021 18:08:10 +0000 Subject: [PATCH 1/8] end-of-day --- src/main/java/org/owasp/html/Encoding.java | 262 ++++++++++-------- src/main/java/org/owasp/html/HtmlLexer.java | 21 +- .../org/owasp/html/ElidedCharactersTest.java | 143 ++++++++++ .../java/org/owasp/html/EncodingTest.java | 26 +- .../org/owasp/html/HtmlSanitizerTest.java | 47 ---- 5 files changed, 328 insertions(+), 171 deletions(-) create mode 100644 src/test/java/org/owasp/html/ElidedCharactersTest.java diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 4a2a601f..d6c38aa8 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -29,7 +29,9 @@ package org.owasp.html; import java.io.IOException; - +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import javax.annotation.Nullable; /** Encoders and decoders for HTML. */ @@ -45,7 +47,9 @@ public final class Encoding { public static String decodeHtml(String s) { int firstAmp = s.indexOf('&'); int safeLimit = longestPrefixOfGoodCodeunits(s); - if ((firstAmp & safeLimit) < 0) { return s; } + if ((firstAmp & safeLimit) < 0) { + return s; + } StringBuilder sb; { @@ -65,9 +69,9 @@ public static String decodeHtml(String s) { stripBannedCodeunits( sb, firstAmp < 0 - ? safeLimit : safeLimit < 0 - ? firstAmp : Math.min(firstAmp, safeLimit)); - + ? safeLimit : safeLimit < 0 + ? firstAmp : Math.min(firstAmp, safeLimit) + ); return sb.toString(); } @@ -94,6 +98,7 @@ static void stripBannedCodeunits(StringBuilder sb) { stripBannedCodeunits(sb, 0); } + @TCB private static void stripBannedCodeunits(StringBuilder sb, int start) { int k = start; @@ -105,11 +110,14 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) { } } else if (0xd800 <= ch) { if (ch <= 0xdfff) { - if (i+1 < n) { - char next = sb.charAt(i+1); + if (i + 1 < n) { + char next = sb.charAt(i + 1); if (Character.isSurrogatePair(ch, next)) { - sb.setCharAt(k++, ch); - sb.setCharAt(k++, next); + // The last two code points in each plane are non-characters that should be elided. + if ((ch & 0xfc3f) != 0xd83f || (next & 0xfffe) != 0xdffe) { + sb.setCharAt(k++, ch); + sb.setCharAt(k++, next); + } ++i; } } @@ -139,29 +147,44 @@ private static int longestPrefixOfGoodCodeunits(String s) { } } else if (0xd800 <= ch) { if (ch <= 0xdfff) { - if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) { - ++i; // Skip over low surrogate since we know it's ok. + if (i + 1 < n ) { + // could be a surrogate pair + char cn = s.charAt(i+1); + if( Character.isSurrogatePair(ch,cn) ) { + int cp = Character.toCodePoint(ch, cn); + // Could be a non-character + if ((cp & 0xfffe) == 0xfffe) { + // not valid + return i; + } + + // skip over trailing surrogate since we know it is OK + i++; + } else { + // not a surrogate pair + return i; + } } else { + // isolated surrogate at end of string return i; } - } else if ((ch & 0xfffe) == 0xfffe) { + } else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) { return i; } } } return -1; } - /** * Appends an encoded form of plainText to output where the encoding is * sufficient to prevent an HTML parser from interpreting any characters in * the appended chunk as part of an attribute or tag boundary. * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * ends after the open-quote of an attribute value and does not yet contain - * a corresponding close quote. - * Modified in place. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * ends after the open-quote of an attribute value and does not yet contain + * a corresponding close quote. + * Modified in place. */ static void encodeHtmlAttribOnto(String plainText, Appendable output) throws IOException { @@ -180,11 +203,11 @@ static void encodeHtmlAttribOnto(String plainText, Appendable output) * step 4.) * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * would leave an HTML parser in the Data state if it were to encounter a space - * character as the next character. In practice this means that the buffer - * does not contain partial tags or comments, and does not have an unclosed - * element with a special content model. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * would leave an HTML parser in the Data state if it were to encounter a space + * character as the next character. In practice this means that the buffer + * does not contain partial tags or comments, and does not have an unclosed + * element with a special content model. */ static void encodePcdataOnto(String plainText, Appendable output) throws IOException { @@ -196,6 +219,7 @@ static void encodePcdataOnto(String plainText, Appendable output) encodeHtmlOnto(plainText, output, "{"); } + /** * Appends an encoded form of plainText to putput where the encoding is * sufficient to prevent an HTML parser from transitioning out of the @@ -206,11 +230,11 @@ static void encodePcdataOnto(String plainText, Appendable output) * {@code } element outside foreign content. * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * would leave an HTML parser in the Data state if it were to encounter a space - * character as the next character. In practice this means that the buffer - * does not contain partial tags or comments, and the most recently opened - * element is `<textarea>` or `<title>` and that element is still open. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * would leave an HTML parser in the Data state if it were to encounter a space + * character as the next character. In practice this means that the buffer + * does not contain partial tags or comments, and the most recently opened + * element is `<textarea>` or `<title>` and that element is still open. */ public static void encodeRcdataOnto(String plainText, Appendable output) throws IOException { @@ -232,8 +256,9 @@ public static void encodeRcdataOnto(String plainText, Appendable output) */ @TCB private static void encodeHtmlOnto( - String plainText, Appendable output, @Nullable String braceReplacement) - throws IOException { + String plainText, Appendable output, @Nullable String braceReplacement + ) + throws IOException { int n = plainText.length(); int pos = 0; for (int i = 0; i < n; ++i) { @@ -249,54 +274,25 @@ private static void encodeHtmlOnto( output.append(plainText, pos, i).append(repl); pos = i + 1; } - } else if ((0x93A <= ch && ch <= 0xC4C) - && ( - // Devanagari vowel - ch <= 0x94F - // Benagli vowels - || 0x985 <= ch && ch <= 0x994 - || 0x9BE <= ch && ch < 0x9CC // 0x9CC (Bengali AU) is ok - || 0x9E0 <= ch && ch <= 0x9E3 - // Telugu vowels - || 0xC05 <= ch && ch <= 0xC14 - || 0xC3E <= ch && ch != 0xC48 /* 0xC48 (Telugu AI) is ok */)) { - // https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/ - // > So, ultimately, the full set of cases that cause the crash are: - // > Any sequence <consonant1, virama, consonant2, ZWNJ, vowel> - // > in Devanagari, Bengali, and Telugu, where: ... - - // TODO: This is needed as of February 2018, but hopefully not long after that. - // We eliminate the ZWNJ which seems the minimally damaging thing to do to - // Telugu rendering per the article above: - // > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts. - - if (pos < i) { - if (plainText.charAt(i - 1) == 0x200C /* ZWNJ */) { - output.append(plainText, pos, i - 1); - // Drop the ZWNJ on the floor. - pos = i; - } - } else if (output instanceof StringBuilder) { - StringBuilder sb = (StringBuilder) output; - int len = sb.length(); - if (len != 0) { - if (sb.charAt(len - 1) == 0x200C /* ZWNJ */) { - sb.setLength(len - 1); - } - } - } + } else if ((ch <= 0x9f) || (0xfdd0 <= ch && ch <= 0xfdef) || ((ch & 0xfffe) == 0xfffe)) { + // Elide C1 escapes and BMP non-characters. + output.append(plainText, pos, i); + pos = i + 1; } else if (((char) 0xd800) <= ch) { if (ch <= ((char) 0xdfff)) { char next; if (i + 1 < n && Character.isSurrogatePair( - ch, next = plainText.charAt(i + 1))) { + ch, next = plainText.charAt(i + 1))) { // Emit supplemental codepoints as entity so that they cannot // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper // and get involved in UTF-16/UCS-2 confusion. int codepoint = Character.toCodePoint(ch, next); output.append(plainText, pos, i); - appendNumericEntity(codepoint, output); + // do not append 0xfffe and 0xffff from any plane + if( (codepoint & 0xfffe) != 0xfffe ) { + appendNumericEntity(codepoint, output); + } ++i; pos = i + 1; } else { @@ -304,63 +300,59 @@ private static void encodeHtmlOnto( // Elide the orphaned surrogate. pos = i + 1; } - } else if (0xfe60 <= ch) { - // Is a control character or possible full-width version of a - // special character, a BOM, or one of the FE60 block that might - // be elided or normalized to an HTML special character. - // Running - // cat NormalizationText.txt \ - // | perl -pe 's/ ?#.*//' \ - // | egrep '(;003C(;|$)|003E|0026|0022|0027|0060)' - // dumps a list of code-points that can normalize to HTML special - // characters. + } else if ((0xfe50 <= ch && ch<=0xfe6f) || (0xff01 <= ch && ch <= 0xff5e)) { + // The Unicode block U+FE50 to U+FE6F contains the "small form variants" of '<', '>', '&', ';', '=', '#' and others. + // To prevent these characters being interpreted as their normal forms, we encode the entire block as numeric escapes. + // + // The Unicode range U+FF01 to U+FF5E is the part of the "Halfwidth and Fullwidth forms" which contains the fullwidth version + // of every ASCII character from '!' to '~'. To prevent these characters being interpreted as their normal forms, we encode this + // part of the block as numeric escapes. output.append(plainText, pos, i); pos = i + 1; - if ((ch & 0xfffe) == 0xfffe) { - // Elide since not an the XML Character. - } else { - appendNumericEntity(ch, output); - } + appendNumericEntity(ch, output); } - } else if (ch == '\u1FEF') { // Normalizes to backtick. - output.append(plainText, pos, i).append("`"); + } else if (BAD_NORMALIZATION.contains(Character.valueOf(ch))) { + // Application of a unicode normalization produces a risky character + output.append(plainText, pos, i); pos = i + 1; + appendNumericEntity(ch,output); } } output.append(plainText, pos, n); } + + /** + * Append a codepoint to the output as a numeric entity. + * + * @param codepoint the codepoint + * @param output the output + * + * @throws IOException if the output cannot be written to + * @throws IllegalArgumentException if the codepoint cannot be represented as a numeric escape. + */ @TCB static void appendNumericEntity(int codepoint, Appendable output) throws IOException { + if (((codepoint <= 1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) { + throw new IllegalArgumentException("Illegal numeric escape. Cannot represent control code: " + codepoint); + } + if ((0xfdd0 <= codepoint && codepoint <= 0xfdef) || ((codepoint & 0xfffe) == 0xfffe)) { + throw new IllegalArgumentException("Illegal numeric escape. Cannot represent non-character: " + codepoint); + } + output.append("&#"); if (codepoint < 100) { - // TODO: is this dead code due to REPLACEMENTS above. - if (codepoint < 10) { - output.append((char) ('0' + codepoint)); - } else { - output.append((char) ('0' + (codepoint / 10))); - output.append((char) ('0' + (codepoint % 10))); - } + // Below 100, a decimal representation is shortest + output.append(Integer.toString(codepoint)); } else { - int nDigits = (codepoint < 0x1000 - ? codepoint < 0x100 ? 2 : 3 - : (codepoint < 0x10000 ? 4 - : codepoint < 0x100000 ? 5 : 6)); + // Append a hexadecimal value output.append('x'); - for (int digit = nDigits; --digit >= 0;) { - int hexDigit = (codepoint >>> (digit << 2)) & 0xf; - output.append(HEX_NUMERAL[hexDigit]); - } + output.append(Integer.toHexString(codepoint)); } output.append(";"); } - private static final char[] HEX_NUMERAL = { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', - }; - /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */ private static final String[] REPLACEMENTS = new String[0x80]; static { @@ -375,16 +367,17 @@ static void appendNumericEntity(int codepoint, Appendable output) } } // """ is shorter than """ - REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. - REPLACEMENTS['&'] = "&"; // HTML special. + REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. + REPLACEMENTS['&'] = "&"; // HTML special. // We don't use ' since that is not in the intersection of HTML&XML. REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";"; // Attribute delimiter. - REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. - REPLACEMENTS['<'] = "<"; // HTML special. - REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. - REPLACEMENTS['>'] = ">"; // HTML special. - REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. - REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. + REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. + REPLACEMENTS['<'] = "<"; // HTML special. + REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. + REPLACEMENTS['>'] = ">"; // HTML special. + REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. + REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. + REPLACEMENTS['\u007f'] = ""; // Elide DELETE } /** @@ -398,4 +391,49 @@ static void appendNumericEntity(int codepoint, Appendable output) } } + /** Set of all Unicode characters which when normalized may contain one of the ASCII characters that require a numeric escape. */ + private static Set<Character> BAD_NORMALIZATION; + static { + // This list was produced by attempting all forms of normalization on all Unicode code points and identifying those whose normalized form contains one of + // these nine characters: '"', ''', '`', '&', '<', '>', '+', '=', ';', '/'. + HashSet<Character> set = new HashSet<Character>(); + set.add('\u037e'); // GREEK QUESTION MARK normalizes to ';' + set.add('\u1fef'); // GREEK VARIA normalizes to '`' + set.add('\u207a'); // SUPERSCRIPT PLUS SIGN normalizes to '+' + set.add('\u207c'); // SUPERSCRIPT EQUALS SIGN normalizes to '=' + set.add('\u208a'); // SUBSCRIPT PLUS SIGN normalizes to '+' + set.add('\u208c'); // SUBSCRIPT EQUALS SIGN normalizes to '=' + set.add('\u2100'); // ACCOUNT OF normalizes to '/' + set.add('\u2101'); // ADDRESSED TO THE SUBJECT normalizes to '/' + set.add('\u2105'); // CARE OF normalizes to '/' + set.add('\u2106'); // CADA UNA normalizes to '/' + set.add('\u2260'); // NOT EQUAL TO normalizes to '=' + set.add('\u226e'); // NOT LESS-THAN normalizes to '<' + set.add('\u226f'); // NOT GREATER-THAN normalizes to '>' + set.add('\u2a74'); // DOUBLE COLON EQUAL normalizes to '=' + set.add('\u2a75'); // TWO CONSECUTIVE EQUALS SIGNS normalizes to '=' + set.add('\u2a76'); // THREE CONSECUTIVE EQUALS SIGNS normalizes to '=' + set.add('\ufb29'); // HEBREW LETTER ALTERNATIVE PLUS SIGN normalizes to '+' + set.add('\ufe14'); // PRESENTATION FORM FOR VERTICAL SEMICOLON normalizes to ';' + set.add('\ufe54'); // SMALL SEMICOLON normalizes to ';' + set.add('\ufe5f'); // SMALL NUMBER SIGN normalizes to '#' + set.add('\ufe60'); // SMALL AMPERSAND normalizes to '&' + set.add('\ufe62'); // SMALL PLUS SIGN normalizes to '+' + set.add('\ufe64'); // SMALL LESS-THAN SIGN normalizes to '<' + set.add('\ufe65'); // SMALL GREATER-THAN SIGN normalizes to '>' + set.add('\ufe66'); // SMALL EQUALS SIGN normalizes to '=' + set.add('\uff02'); // FULLWIDTH QUOTATION MARK normalizes to '"' + set.add('\uff03'); // FULLWIDTH NUMBER SIGN normalizes to '#' + set.add('\uff06'); // FULLWIDTH AMPERSAND normalizes to '&' + set.add('\uff07'); // FULLWIDTH APOSTROPHE normalizes to ''' + set.add('\uff0b'); // FULLWIDTH PLUS SIGN normalizes to '+' + set.add('\uff0f'); // FULLWIDTH SOLIDUS normalizes to '/' + set.add('\uff1b'); // FULLWIDTH SEMICOLON normalizes to ';' + set.add('\uff1c'); // FULLWIDTH LESS-THAN SIGN normalizes to '<' + set.add('\uff1d'); // FULLWIDTH EQUALS SIGN normalizes to '=' + set.add('\uff1e'); // FULLWIDTH GREATER-THAN SIGN normalizes to '>' + set.add('\uff40'); // FULLWIDTH GRAVE ACCENT normalizes to '`' + + BAD_NORMALIZATION = Collections.unmodifiableSet(set); + } } diff --git a/src/main/java/org/owasp/html/HtmlLexer.java b/src/main/java/org/owasp/html/HtmlLexer.java index 00fcc7dc..376c939c 100644 --- a/src/main/java/org/owasp/html/HtmlLexer.java +++ b/src/main/java/org/owasp/html/HtmlLexer.java @@ -527,7 +527,7 @@ private HtmlToken parseToken() { break; } } - } else if (!Character.isWhitespace(ch)) { + } else if (!isAsciiWhitespace(ch)) { type = HtmlTokenType.TEXT; for (; end < limit; ++end) { ch = input.charAt(end); @@ -538,12 +538,12 @@ private HtmlToken parseToken() { && '>' == input.charAt(end + 1)) { break; } else if ('>' == ch || '=' == ch - || Character.isWhitespace(ch)) { + || isAsciiWhitespace(ch)) { break; } else if ('"' == ch || '\'' == ch) { if (end + 1 < limit) { char ch2 = input.charAt(end + 1); - if (Character.isWhitespace(ch2) + if (isAsciiWhitespace(ch2) || ch2 == '>' || ch2 == '/') { ++end; break; @@ -554,7 +554,7 @@ private HtmlToken parseToken() { } else { // We skip whitespace tokens inside tag bodies. type = HtmlTokenType.IGNORABLE; - while (end < limit && Character.isWhitespace(input.charAt(end))) { + while (end < limit && isAsciiWhitespace(input.charAt(end))) { ++end; } } @@ -604,7 +604,7 @@ private HtmlToken parseToken() { ch = input.charAt(end); switch (state) { case TAGNAME: - if (Character.isWhitespace(ch) + if (isAsciiWhitespace(ch) || '>' == ch || '/' == ch || '<' == ch) { // End processing of an escape exempt block when we see // a corresponding end tag. @@ -749,6 +749,17 @@ private String canonicalElementName(int start, int end) { return HtmlLexer.canonicalElementName(input.substring(start, end)); } + /** + * Test if a character is an ASCII whitespace according to the HTML rules. Other Unicode whitespace characters do not count. + * + * @param ch the character to test + * + * @return true if it is one of TAB, LF, FF, CR or SPACE + */ + private static boolean isAsciiWhitespace(int ch) { + return (ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\r') || (ch == '\f'); + } + private static boolean isIdentStart(char ch) { return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); } diff --git a/src/test/java/org/owasp/html/ElidedCharactersTest.java b/src/test/java/org/owasp/html/ElidedCharactersTest.java new file mode 100644 index 00000000..21e98d83 --- /dev/null +++ b/src/test/java/org/owasp/html/ElidedCharactersTest.java @@ -0,0 +1,143 @@ +package org.owasp.html; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import junit.framework.TestCase; +import org.junit.Test; + +/** + * Some characters should not appear in HTML documents, present risks for log-file injection, or are otherwise discouraged from sanitized HTML. This set of + * unit tests verifies that the inclusion of such characters does not allow dangerous code to slip through. + * <p> + * There are two requirements: + * <p> + * 1) The Encoding.encodeRcdataOnto method should remove discouraged characters. + * 2) Sanitized HTML should not change + * + * @author Simon Greatrix on 25/01/2021. + */ +public class ElidedCharactersTest extends TestCase { + + /** List of all characters that are discouraged in HTML. */ + static List<String> DISCOURAGED; + + + @Test + public static final void testRemoveDiscouragedCharacterFromTagStart() throws IOException { + // <Xp></p> is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = "<" + d+"h1></h1>"; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in <h1> of U+%06x", d.codePointAt(0)); + assertEquals(m, "<h1>", html); + } + + String html = Sanitizers.BLOCKS.sanitize("<h1></h1>"); + assertEquals("<h1></h1>",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromInsideTag() throws IOException { + // <h1X></h1> is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = "<h"+d+"1></h1>"; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in <h1> of U+%06x", d.codePointAt(0)); + assertEquals(m, "", html); + } + + String html = Sanitizers.BLOCKS.sanitize("<h1></h1>"); + assertEquals("<h1></h1>",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromTagEnd() throws IOException { + // <h1X></h1> is an unrecognised tag and an unmatched end tag + for (String d : DISCOURAGED) { + String test = "<h1"+ d+"></h1>"; + String html = Sanitizers.BLOCKS.sanitize(test); + String m = String.format("Use in <h1> of U+%06x", d.codePointAt(0)); + assertEquals(m, "", html); + } + + String html = Sanitizers.BLOCKS.sanitize("<h1></h1>"); + assertEquals("<h1></h1>",html); + } + + @Test + public static final void testRemoveDiscouragedCharacterFromEndWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = "Hello" + d; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + @Test + public static final void testRemoveDiscouragedCharacterFromMiddleWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = "Hel" + d + "lo"; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + @Test + public static final void testRemoveDiscouragedCharacterFromStartWhenEncoding() throws IOException { + for (String d : DISCOURAGED) { + String test = d + "Hello"; + StringBuilder builder = new StringBuilder(); + Encoding.encodePcdataOnto(test, builder); + String m = String.format("Elision of U+%06x", d.codePointAt(0)); + assertEquals(m, "Hello", builder.toString()); + } + } + + + static { + ArrayList<String> list = new ArrayList<String>(); + + // C0 characters banned by XML, except for the three official whitespace characters + for (char i = 0; i <= 0x1f; i++) { + if (i != 0x9 && i != 0xa && i != 0xd && i!=0xc) { + list.add(Character.toString(i)); + } + } + + // Delete character and C1 escapes which are discouraged by XML and banned as HTML numeric escapes. Also discouraging the U+0085 NEL characters. + for (char i = 0x7f; i <= 0x9f; i++) { + list.add(Character.toString(i)); + } + + // Isolated surrogates. NB Must also test that valid non-isolated surrogates are retained. + for (char i = 0xd800; i <= 0xdfff; i++) { + list.add(Character.toString(i)); + } + + // Isolated surrogates. NB Must also test that valid non-isolated surrogates are retained. + for (char i = 0xfdd0; i <= 0xfdef; i++) { + list.add(Character.toString(i)); + } + + list.add(Character.toString((char) 0xfffe)); + list.add(Character.toString((char) 0xffff)); + + // Non-characters from the supplemental planes + for (int i = 1; i <= 16; i++) { + list.add(new String(Character.toChars(0x10000 * i + 0xfffe))); + list.add(new String(Character.toChars(0x10000 * i + 0xffff))); + } + + DISCOURAGED = Collections.unmodifiableList(list); + } + +} diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java index eea7769a..dee3e2d3 100644 --- a/src/test/java/org/owasp/html/EncodingTest.java +++ b/src/test/java/org/owasp/html/EncodingTest.java @@ -215,8 +215,8 @@ public static final void testAppendNumericEntityAndEncodeOnto() StringBuilder sb = new StringBuilder(); StringBuilder cps = new StringBuilder(); for (int codepoint : new int[] { - 0, 9, '\n', '@', 0x80, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xffff, - 0x10000, Character.MAX_CODE_POINT }) { + 9, '\n', '@', 0xa0, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xfffd, + 0x10000, Character.MAX_CODE_POINT-2 }) { Encoding.appendNumericEntity(codepoint, sb); sb.append(' '); @@ -224,15 +224,15 @@ public static final void testAppendNumericEntityAndEncodeOnto() } assertEquals( - "� @ € ÿ Ā ࿿ က " - + "ሺ ￿ 𐀀 􏿿 ", + " @   ÿ Ā ࿿ က " + + "ሺ � 𐀀 􏿽 ", sb.toString()); StringBuilder out = new StringBuilder(); Encoding.encodeHtmlAttribOnto(cps.toString(), out); assertEquals( - " \t \n @ \u0080 \u00ff \u0100 \u0fff \u1000 " - + "\u123a 𐀀 􏿿 ", + "\t \n @ \u00a0 \u00ff \u0100 \u0fff \u1000 " + + "\u123a \ufffd 𐀀 􏿽 ", out.toString()); } @@ -276,9 +276,21 @@ public static final void testStripBannedCodeunits() { assertStripped("foo\ud800\udc00bar", "foo\udc00\ud800\udc00bar"); assertStripped("foo\ud834\udd1ebar", "foo\ud834\udd1ebar"); assertStripped("foo\ud834\udd1e", "foo\ud834\udd1e"); - assertStripped("\uffef\ufffd", "\uffef\ufffd\ufffe\uffff"); + + // Check stripping of non-characters from all planes + for(int i=0;i<=16;i++) { + int o = 0x10000 * i; + String s = new StringBuilder().append(String.format("%02x",i)).appendCodePoint(o+0xffef).appendCodePoint(o+0xfffd) + .appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).toString(); + String t = s.substring(0,(i==0)?4:6); + assertStripped(t,s); + + s = new StringBuilder().append("foo").appendCodePoint(o+0xfffe).appendCodePoint(o+0xffff).append("bar").toString(); + assertStripped("foobar",s); + } } + @Test public static final void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes() diff --git a/src/test/java/org/owasp/html/HtmlSanitizerTest.java b/src/test/java/org/owasp/html/HtmlSanitizerTest.java index 53ff9270..0ce72f68 100644 --- a/src/test/java/org/owasp/html/HtmlSanitizerTest.java +++ b/src/test/java/org/owasp/html/HtmlSanitizerTest.java @@ -392,53 +392,6 @@ public static final void testNbsps() { codeUnits)); } - @Test - public static final void testMacOSAndIOSQueryOfDeath() { - // https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/ - String[][] tests = { - { - "\u0C1C\u0C4D\u0C1E\u200C\u0C3E", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "\u09B8\u09CD\u09B0<interrupted>\u200C\u09C1", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "\u0C1C\u0C4D\u0C1E\u200C\u0C3E", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "\u09B8\u09CD\u09B0\u200C<interrupted>\u09C1", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "జ్ఞ‌ా", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "జ్ఞ<interrupted>‌ా", - "\u0C1C\u0C4D\u0C1E\u0C3E", - }, - { - "স্র‌ু", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "স্র‌<interrupted>ু", - "\u09B8\u09CD\u09B0\u09C1", - }, - { - "\u0915\u094D\u0930\u200C\u093E", - "\u0915\u094D\u0930\u093E", - }, - }; - - for (int i = 0, n = tests.length; i < n; ++i) { - String[] test = tests[i]; - assertEquals(i + " : " + test[0], test[1], sanitize(test[0])); - } - } private static String sanitize(@Nullable String html) { StringBuilder sb = new StringBuilder(); From 348b7aa24d8127627428c9cab22d036eb72d9cf6 Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 17:33:59 +0000 Subject: [PATCH 2/8] End of day --- src/main/java/org/owasp/html/Encoding.java | 21 ++++++++++++--- .../java/org/owasp/html/EncodingTest.java | 27 +++++++++++++++++++ .../java/org/owasp/html/SanitizersTest.java | 5 +++- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index d6c38aa8..80b4454f 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -265,9 +265,22 @@ private static void encodeHtmlOnto( char ch = plainText.charAt(i); if (ch < REPLACEMENTS.length) { // Handles all ASCII. String repl = REPLACEMENTS[ch]; - if (ch == '{' && repl == null) { - if (i + 1 == n || plainText.charAt(i + 1) == '{') { - repl = braceReplacement; + if( repl==null ) { + if (ch == '{') { + if (i + 1 == n || plainText.charAt(i + 1) == '{') { + // "{{" detected, so use the brace replacement + repl = braceReplacement; + } + } + if (ch == '\r') { + // If this CR is followed by a LF, just remove it. Otherwise replace it with a LF. + if (i + 1 == n || plainText.charAt(i + 1) != '\n' ) { + // CR not followed by LF, so turn into LF + repl = "\n"; + } else { + // CRLF, so remove CR + repl = ""; + } } } if (repl != null) { @@ -334,7 +347,7 @@ private static void encodeHtmlOnto( @TCB static void appendNumericEntity(int codepoint, Appendable output) throws IOException { - if (((codepoint <= 1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) { + if (((codepoint <= 0x1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) { throw new IllegalArgumentException("Illegal numeric escape. Cannot represent control code: " + codepoint); } if ((0xfdd0 <= codepoint && codepoint <= 0xfdef) || ((codepoint & 0xfffe) == 0xfffe)) { diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java index dee3e2d3..cecd4e06 100644 --- a/src/test/java/org/owasp/html/EncodingTest.java +++ b/src/test/java/org/owasp/html/EncodingTest.java @@ -214,6 +214,7 @@ public static final void testAppendNumericEntityAndEncodeOnto() throws Exception { StringBuilder sb = new StringBuilder(); StringBuilder cps = new StringBuilder(); + // Test with a set of legal code points for (int codepoint : new int[] { 9, '\n', '@', 0xa0, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xfffd, 0x10000, Character.MAX_CODE_POINT-2 }) { @@ -236,6 +237,31 @@ public static final void testAppendNumericEntityAndEncodeOnto() out.toString()); } + @Test + public static final void testAppendIllegalNumericEntityAndEncodeOnto() + throws Exception { + StringBuilder sb = new StringBuilder(); + StringBuilder cps = new StringBuilder(); + // Test with a set of legal code points + for (int codepoint : new int[] { 8, '\r', 0x7f, 0x85, 0xfdd0, 0xfffe, 0x1fffe, 0x3ffff }) { + try { + Encoding.appendNumericEntity(codepoint, sb); + fail("Illegal character was accepted: "+codepoint); + } catch ( IllegalArgumentException e ) { + // expected behaviour + } + + cps.appendCodePoint(codepoint).append(','); + } + + assertEquals("", sb.toString()); + + StringBuilder out = new StringBuilder(); + Encoding.encodeHtmlAttribOnto(cps.toString(), out); + assertEquals( + ",\n,,,,,,,", + out.toString()); + } @Test public static final void testAngularJsBracesInTextNode() throws Exception { StringBuilder sb = new StringBuilder(); @@ -317,4 +343,5 @@ void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes() Encoding.encodeHtmlAttribOnto("a nonce=xyz ", attrib); assertEquals("a nonce=xyz ", attrib.toString()); } + } diff --git a/src/test/java/org/owasp/html/SanitizersTest.java b/src/test/java/org/owasp/html/SanitizersTest.java index c75fbcb4..32092d20 100644 --- a/src/test/java/org/owasp/html/SanitizersTest.java +++ b/src/test/java/org/owasp/html/SanitizersTest.java @@ -313,7 +313,10 @@ public static final void testScriptInTable() { .and(Sanitizers.STYLES) .and(Sanitizers.IMAGES) .and(Sanitizers.TABLES); - assertEquals("<table></table>Hallo\r\n\nEnde\n\r", pf.sanitize(input)); + // The CRLF after "Hallo" becomes LF + // The LF before "Ende" becomes LF + // The LF CR after "Ende" becomes LF LF + assertEquals("<table></table>Hallo\n\nEnde\n\n", pf.sanitize(input)); } @Test From dedafc78ebe6c9b9e493afcaa215dc51a31b761a Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 20:57:06 +0000 Subject: [PATCH 3/8] Finalizing encoding changes and tests --- src/main/java/org/owasp/html/Encoding.java | 121 +++++++----------- .../java/org/owasp/html/EncodingTest.java | 89 +++++++++++++ 2 files changed, 133 insertions(+), 77 deletions(-) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 80b4454f..10c037af 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -29,6 +29,8 @@ package org.owasp.html; import java.io.IOException; +import java.text.Normalizer; +import java.text.Normalizer.Form; import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -122,7 +124,7 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) { } } continue; - } else if ((ch & 0xfffe) == 0xfffe) { + } else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) { continue; } } @@ -287,48 +289,35 @@ private static void encodeHtmlOnto( output.append(plainText, pos, i).append(repl); pos = i + 1; } + } else if (RISKY_NORMALIZATION.contains(ch)) { + // Application of unicode compatibility normalization produces a risky character. + output.append(plainText, pos, i); + pos = i + 1; + appendNumericEntity(ch,output); } else if ((ch <= 0x9f) || (0xfdd0 <= ch && ch <= 0xfdef) || ((ch & 0xfffe) == 0xfffe)) { // Elide C1 escapes and BMP non-characters. output.append(plainText, pos, i); pos = i + 1; - } else if (((char) 0xd800) <= ch) { - if (ch <= ((char) 0xdfff)) { - char next; - if (i + 1 < n - && Character.isSurrogatePair( - ch, next = plainText.charAt(i + 1))) { - // Emit supplemental codepoints as entity so that they cannot - // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper - // and get involved in UTF-16/UCS-2 confusion. - int codepoint = Character.toCodePoint(ch, next); - output.append(plainText, pos, i); - // do not append 0xfffe and 0xffff from any plane - if( (codepoint & 0xfffe) != 0xfffe ) { - appendNumericEntity(codepoint, output); - } - ++i; - pos = i + 1; - } else { - output.append(plainText, pos, i); - // Elide the orphaned surrogate. - pos = i + 1; + } else if (0xd800 <= ch && ch <= 0xdfff) { + // handle surrogates + char next; + if (i + 1 < n && Character.isSurrogatePair(ch, next = plainText.charAt(i + 1))) { + // Emit supplemental codepoints as entity so that they cannot + // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper + // and get involved in UTF-16/UCS-2 confusion. + int codepoint = Character.toCodePoint(ch, next); + output.append(plainText, pos, i); + // do not append 0xfffe and 0xffff from any plane + if( (codepoint & 0xfffe) != 0xfffe ) { + appendNumericEntity(codepoint, output); } - } else if ((0xfe50 <= ch && ch<=0xfe6f) || (0xff01 <= ch && ch <= 0xff5e)) { - // The Unicode block U+FE50 to U+FE6F contains the "small form variants" of '<', '>', '&', ';', '=', '#' and others. - // To prevent these characters being interpreted as their normal forms, we encode the entire block as numeric escapes. - // - // The Unicode range U+FF01 to U+FF5E is the part of the "Halfwidth and Fullwidth forms" which contains the fullwidth version - // of every ASCII character from '!' to '~'. To prevent these characters being interpreted as their normal forms, we encode this - // part of the block as numeric escapes. + ++i; + pos = i + 1; + } else { output.append(plainText, pos, i); + // Elide the orphaned surrogate. pos = i + 1; - appendNumericEntity(ch, output); } - } else if (BAD_NORMALIZATION.contains(Character.valueOf(ch))) { - // Application of a unicode normalization produces a risky character - output.append(plainText, pos, i); - pos = i + 1; - appendNumericEntity(ch,output); } } output.append(plainText, pos, n); @@ -397,56 +386,34 @@ static void appendNumericEntity(int codepoint, Appendable output) * IS_BANNED_ASCII[i] where is an ASCII control character codepoint (< 0x20) * is true for control characters that are not allowed in an XML source text. */ - private static boolean[] IS_BANNED_ASCII = new boolean[0x20]; + private static final boolean[] IS_BANNED_ASCII = new boolean[0x20]; static { for (int i = 0; i < IS_BANNED_ASCII.length; ++i) { IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r'); } } - /** Set of all Unicode characters which when normalized may contain one of the ASCII characters that require a numeric escape. */ - private static Set<Character> BAD_NORMALIZATION; + /** Set of all Unicode characters which when processed with unicode compatibility decomposition will include a non-alphanumeric ascii character. */ + static final Set<Character> RISKY_NORMALIZATION; static { - // This list was produced by attempting all forms of normalization on all Unicode code points and identifying those whose normalized form contains one of - // these nine characters: '"', ''', '`', '&', '<', '>', '+', '=', ';', '/'. HashSet<Character> set = new HashSet<Character>(); - set.add('\u037e'); // GREEK QUESTION MARK normalizes to ';' - set.add('\u1fef'); // GREEK VARIA normalizes to '`' - set.add('\u207a'); // SUPERSCRIPT PLUS SIGN normalizes to '+' - set.add('\u207c'); // SUPERSCRIPT EQUALS SIGN normalizes to '=' - set.add('\u208a'); // SUBSCRIPT PLUS SIGN normalizes to '+' - set.add('\u208c'); // SUBSCRIPT EQUALS SIGN normalizes to '=' - set.add('\u2100'); // ACCOUNT OF normalizes to '/' - set.add('\u2101'); // ADDRESSED TO THE SUBJECT normalizes to '/' - set.add('\u2105'); // CARE OF normalizes to '/' - set.add('\u2106'); // CADA UNA normalizes to '/' - set.add('\u2260'); // NOT EQUAL TO normalizes to '=' - set.add('\u226e'); // NOT LESS-THAN normalizes to '<' - set.add('\u226f'); // NOT GREATER-THAN normalizes to '>' - set.add('\u2a74'); // DOUBLE COLON EQUAL normalizes to '=' - set.add('\u2a75'); // TWO CONSECUTIVE EQUALS SIGNS normalizes to '=' - set.add('\u2a76'); // THREE CONSECUTIVE EQUALS SIGNS normalizes to '=' - set.add('\ufb29'); // HEBREW LETTER ALTERNATIVE PLUS SIGN normalizes to '+' - set.add('\ufe14'); // PRESENTATION FORM FOR VERTICAL SEMICOLON normalizes to ';' - set.add('\ufe54'); // SMALL SEMICOLON normalizes to ';' - set.add('\ufe5f'); // SMALL NUMBER SIGN normalizes to '#' - set.add('\ufe60'); // SMALL AMPERSAND normalizes to '&' - set.add('\ufe62'); // SMALL PLUS SIGN normalizes to '+' - set.add('\ufe64'); // SMALL LESS-THAN SIGN normalizes to '<' - set.add('\ufe65'); // SMALL GREATER-THAN SIGN normalizes to '>' - set.add('\ufe66'); // SMALL EQUALS SIGN normalizes to '=' - set.add('\uff02'); // FULLWIDTH QUOTATION MARK normalizes to '"' - set.add('\uff03'); // FULLWIDTH NUMBER SIGN normalizes to '#' - set.add('\uff06'); // FULLWIDTH AMPERSAND normalizes to '&' - set.add('\uff07'); // FULLWIDTH APOSTROPHE normalizes to ''' - set.add('\uff0b'); // FULLWIDTH PLUS SIGN normalizes to '+' - set.add('\uff0f'); // FULLWIDTH SOLIDUS normalizes to '/' - set.add('\uff1b'); // FULLWIDTH SEMICOLON normalizes to ';' - set.add('\uff1c'); // FULLWIDTH LESS-THAN SIGN normalizes to '<' - set.add('\uff1d'); // FULLWIDTH EQUALS SIGN normalizes to '=' - set.add('\uff1e'); // FULLWIDTH GREATER-THAN SIGN normalizes to '>' - set.add('\uff40'); // FULLWIDTH GRAVE ACCENT normalizes to '`' - BAD_NORMALIZATION = Collections.unmodifiableSet(set); + // These characters all decompose riskily + String singles = "\u037e\u1fef\u203c\u207a\u208a\u2100\u2101\u2105\u2106\u2260\u226e\u226f\u33c2\u33c7\u33d8\ufb29\ufe10\ufe19\ufe30\ufe47\ufe48\ufe52"; + for(char ch : singles.toCharArray()) { + set.add(ch); + } + + // This string is composed of pairs of characters defining inclusive start and end ranges. + String pairs = + "\u2024\u2026\u2047\u2049\u207c\u207e\u208c\u208e\u2474\u24b5\u2a74\u2a76\u3200\u321e\u3220\u3243\ufe13\ufe16\ufe33" + + "\ufe38\ufe4d\ufe50\ufe54\ufe57\ufe59\ufe5c\ufe5f\ufe66\ufe68\ufe6b\uff01\uff0f\uff1a\uff20\uff3b\uff40\uff5b\uff5e"; + for(int i=0;i<pairs.length();i+=2) { + for(char ch=pairs.charAt(i);ch<=pairs.charAt(i+1);ch++) { + set.add(ch); + } + } + + RISKY_NORMALIZATION = Collections.unmodifiableSet(set); } } diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java index cecd4e06..86223db3 100644 --- a/src/test/java/org/owasp/html/EncodingTest.java +++ b/src/test/java/org/owasp/html/EncodingTest.java @@ -28,6 +28,11 @@ package org.owasp.html; +import java.io.IOException; +import java.text.Normalizer; +import java.text.Normalizer.Form; +import java.util.HashSet; + import org.junit.Test; import junit.framework.TestCase; @@ -207,6 +212,29 @@ public static final void testDecodeHtml() { assertEquals( "&bogus;", Encoding.decodeHtml("&bogus;")); + + assertEquals( + "lt<", + Encoding.decodeHtml("lt<")); + assertEquals( + "ltlt;", + Encoding.decodeHtml("ltlt;")); + assertEquals( + "lt<", + Encoding.decodeHtml("lt&lt;")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<")); + + assertEquals( + "lt&<gt", + Encoding.decodeHtml("\ufdddlt&&l\ufffet;\udc9c\ud835gt")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<\udc9c")); + assertEquals( + "lt&<", + Encoding.decodeHtml("lt&<\ud835")); } @Test @@ -344,4 +372,65 @@ void testBadlyDonePostProcessingWillnotAllowInsertingNonceAttributes() assertEquals("a nonce=xyz ", attrib.toString()); } + @Test + public static final void testRiskyNormalizationSetContents() { + // Test that the risky normalization set contains the expected values + for(char toTest='\u0080'; toTest<'\ufffe'; toTest++) { + boolean isRisky = false; + String decomposed = Normalizer.normalize(Character.toString(toTest), Form.NFKD); + for(int i=0;i<decomposed.length();i++) { + char ch = decomposed.charAt(i); + if( (' '<ch && ch<'0') || ('9'<ch && ch<'A') || ('Z'<ch && ch<'a') || ('z'<ch && ch<'\u007f') ) { + // Contains a non-alpha-numeric ASCII printable character, so we consider it a risky decomposition. + isRisky = true; + break; + } + } + + if( isRisky ) { + assertTrue(Encoding.RISKY_NORMALIZATION.contains(toTest)); + } else { + assertFalse(Encoding.RISKY_NORMALIZATION.contains(toTest)); + } + } + } + + + @Test + public static final void testRiskyNormalization() throws IOException { + StringBuilder attrib = new StringBuilder(); + Encoding.encodeRcdataOnto("Small Less-than Sign : \ufe64",attrib); + assertEquals("Small Less-than Sign : ﹤",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("Fullwidth Quotation Mark : \uff02",attrib); + assertEquals("Fullwidth Quotation Mark : "",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("Greek Varia : \u1fef",attrib); + assertEquals("Greek Varia : `",attrib.toString()); + } + + @Test + public static final void testNewLineNormalization() throws IOException { + StringBuilder attrib = new StringBuilder(); + Encoding.encodeRcdataOnto("\rone\ntwo\r",attrib); + assertEquals("\none\ntwo\n",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("\none\rtwo\n",attrib); + assertEquals("\none\ntwo\n",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("\r\none\r\ntwo\r\n",attrib); + assertEquals("\none\ntwo\n",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("\n\rone\n\rtwo\n\r",attrib); + assertEquals("\n\none\n\ntwo\n\n",attrib.toString()); + + attrib.setLength(0); + Encoding.encodeRcdataOnto("\r\rone\n\ntwo\r\r",attrib); + assertEquals("\n\none\n\ntwo\n\n",attrib.toString()); + } } From 5d711d09d3537144dae0e24ce2bbc3bf70384839 Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 21:01:50 +0000 Subject: [PATCH 4/8] Remove unwanted imports --- src/main/java/org/owasp/html/Encoding.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 10c037af..cef1d869 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -29,8 +29,6 @@ package org.owasp.html; import java.io.IOException; -import java.text.Normalizer; -import java.text.Normalizer.Form; import java.util.Collections; import java.util.HashSet; import java.util.Set; From 2964e5916c433b56b155971b7c463deb14a78732 Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 21:07:08 +0000 Subject: [PATCH 5/8] Fixing whitespace in Encoding --- src/main/java/org/owasp/html/Encoding.java | 64 ++++++++++------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index cef1d869..8b5e3b41 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -47,9 +47,7 @@ public final class Encoding { public static String decodeHtml(String s) { int firstAmp = s.indexOf('&'); int safeLimit = longestPrefixOfGoodCodeunits(s); - if ((firstAmp & safeLimit) < 0) { - return s; - } + if ((firstAmp & safeLimit) < 0) { return s; } StringBuilder sb; { @@ -69,9 +67,9 @@ public static String decodeHtml(String s) { stripBannedCodeunits( sb, firstAmp < 0 - ? safeLimit : safeLimit < 0 - ? firstAmp : Math.min(firstAmp, safeLimit) - ); + ? safeLimit : safeLimit < 0 + ? firstAmp : Math.min(firstAmp, safeLimit)); + return sb.toString(); } @@ -110,8 +108,8 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) { } } else if (0xd800 <= ch) { if (ch <= 0xdfff) { - if (i + 1 < n) { - char next = sb.charAt(i + 1); + if (i+1 < n) { + char next = sb.charAt(i+1); if (Character.isSurrogatePair(ch, next)) { // The last two code points in each plane are non-characters that should be elided. if ((ch & 0xfc3f) != 0xd83f || (next & 0xfffe) != 0xdffe) { @@ -181,10 +179,10 @@ private static int longestPrefixOfGoodCodeunits(String s) { * the appended chunk as part of an attribute or tag boundary. * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * ends after the open-quote of an attribute value and does not yet contain - * a corresponding close quote. - * Modified in place. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * ends after the open-quote of an attribute value and does not yet contain + * a corresponding close quote. + * Modified in place. */ static void encodeHtmlAttribOnto(String plainText, Appendable output) throws IOException { @@ -203,11 +201,11 @@ static void encodeHtmlAttribOnto(String plainText, Appendable output) * step 4</a>.) * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * would leave an HTML parser in the Data state if it were to encounter a space - * character as the next character. In practice this means that the buffer - * does not contain partial tags or comments, and does not have an unclosed - * element with a special content model. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * would leave an HTML parser in the Data state if it were to encounter a space + * character as the next character. In practice this means that the buffer + * does not contain partial tags or comments, and does not have an unclosed + * element with a special content model. */ static void encodePcdataOnto(String plainText, Appendable output) throws IOException { @@ -230,11 +228,11 @@ static void encodePcdataOnto(String plainText, Appendable output) * {@code <title>} element outside foreign content. * * @param plainText text/plain - * @param output a buffer of text/html that has a well-formed HTML prefix that - * would leave an HTML parser in the Data state if it were to encounter a space - * character as the next character. In practice this means that the buffer - * does not contain partial tags or comments, and the most recently opened - * element is `<textarea>` or `<title>` and that element is still open. + * @param output a buffer of text/html that has a well-formed HTML prefix that + * would leave an HTML parser in the Data state if it were to encounter a space + * character as the next character. In practice this means that the buffer + * does not contain partial tags or comments, and the most recently opened + * element is `<textarea>` or `<title>` and that element is still open. */ public static void encodeRcdataOnto(String plainText, Appendable output) throws IOException { @@ -256,9 +254,8 @@ public static void encodeRcdataOnto(String plainText, Appendable output) */ @TCB private static void encodeHtmlOnto( - String plainText, Appendable output, @Nullable String braceReplacement - ) - throws IOException { + String plainText, Appendable output, @Nullable String braceReplacement) + throws IOException { int n = plainText.length(); int pos = 0; for (int i = 0; i < n; ++i) { @@ -367,17 +364,16 @@ static void appendNumericEntity(int codepoint, Appendable output) } } // """ is shorter than """ - REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. - REPLACEMENTS['&'] = "&"; // HTML special. + REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. + REPLACEMENTS['&'] = "&"; // HTML special. // We don't use ' since that is not in the intersection of HTML&XML. REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";"; // Attribute delimiter. - REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. - REPLACEMENTS['<'] = "<"; // HTML special. - REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. - REPLACEMENTS['>'] = ">"; // HTML special. - REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. - REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. - REPLACEMENTS['\u007f'] = ""; // Elide DELETE + REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. + REPLACEMENTS['<'] = "<"; // HTML special. + REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. + REPLACEMENTS['>'] = ">"; // HTML special. + REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. + REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. } /** From 594cc4d84a4eaedef02ca9165b477934c3f655fb Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 23:42:42 +0000 Subject: [PATCH 6/8] Delete was missing from the required character elisions --- src/main/java/org/owasp/html/Encoding.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 8b5e3b41..94fbde98 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -374,6 +374,7 @@ static void appendNumericEntity(int codepoint, Appendable output) REPLACEMENTS['>'] = ">"; // HTML special. REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. + REPLACEMENTS['\u007f'] = ""; // Elide delete } /** From 121c6c0628c725528c358904e83228059c62f4f5 Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 23:54:12 +0000 Subject: [PATCH 7/8] Increased Guava version to 30.1 to avoid security issue --- parent/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parent/pom.xml b/parent/pom.xml index 7ef44a68..e80abc22 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -88,7 +88,7 @@ application while protecting against XSS. <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> - <guava.version>27.1-jre</guava.version> + <guava.version>30.1-jre</guava.version> </properties> <build> From d78fc8a562bda505e0f5e35e005b27f1be7f5f9d Mon Sep 17 00:00:00 2001 From: Simon Greatrix <simon.greatrix@setl.io> Date: Sun, 7 Feb 2021 23:59:21 +0000 Subject: [PATCH 8/8] Additional usage of guava 27.1-jre replaced with 30.1-jre --- scripts/build_for_travis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_for_travis.sh b/scripts/build_for_travis.sh index 5400a2f5..1aec8fd0 100755 --- a/scripts/build_for_travis.sh +++ b/scripts/build_for_travis.sh @@ -35,7 +35,7 @@ if [ -n "$IS_LEGACY" ]; then else # Build the whole kit-n-kaboodle. mvn -f aggregate/pom.xml source:jar javadoc:jar verify $COMMON_FLAGS \ - && mvn -Dguava.version=27.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \ + && mvn -Dguava.version=30.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \ && mvn jacoco:report coveralls:report \ && mvn org.sonatype.ossindex.maven:ossindex-maven-plugin:audit -f aggregate $COMMON_FLAGS fi