Skip to content

Commit

Permalink
Merge d78fc8a into 020d5d0
Browse files Browse the repository at this point in the history
  • Loading branch information
simon-greatrix committed Feb 7, 2021
2 parents 020d5d0 + d78fc8a commit 2d10ece
Show file tree
Hide file tree
Showing 8 changed files with 416 additions and 165 deletions.
2 changes: 1 addition & 1 deletion parent/pom.xml
Expand Up @@ -88,7 +88,7 @@ application while protecting against XSS.
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<guava.version>27.1-jre</guava.version>
<guava.version>30.1-jre</guava.version>
</properties>

<build>
Expand Down
2 changes: 1 addition & 1 deletion scripts/build_for_travis.sh
Expand Up @@ -35,7 +35,7 @@ if [ -n "$IS_LEGACY" ]; then
else
# Build the whole kit-n-kaboodle.
mvn -f aggregate/pom.xml source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn -Dguava.version=27.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn -Dguava.version=30.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn jacoco:report coveralls:report \
&& mvn org.sonatype.ossindex.maven:ossindex-maven-plugin:audit -f aggregate $COMMON_FLAGS
fi
219 changes: 116 additions & 103 deletions src/main/java/org/owasp/html/Encoding.java
Expand Up @@ -29,7 +29,9 @@
package org.owasp.html;

import java.io.IOException;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import javax.annotation.Nullable;

/** Encoders and decoders for HTML. */
Expand Down Expand Up @@ -94,6 +96,7 @@ static void stripBannedCodeunits(StringBuilder sb) {
stripBannedCodeunits(sb, 0);
}


@TCB
private static void stripBannedCodeunits(StringBuilder sb, int start) {
int k = start;
Expand All @@ -108,13 +111,16 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) {
if (i+1 < n) {
char next = sb.charAt(i+1);
if (Character.isSurrogatePair(ch, next)) {
sb.setCharAt(k++, ch);
sb.setCharAt(k++, next);
// The last two code points in each plane are non-characters that should be elided.
if ((ch & 0xfc3f) != 0xd83f || (next & 0xfffe) != 0xdffe) {
sb.setCharAt(k++, ch);
sb.setCharAt(k++, next);
}
++i;
}
}
continue;
} else if ((ch & 0xfffe) == 0xfffe) {
} else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) {
continue;
}
}
Expand All @@ -139,19 +145,34 @@ private static int longestPrefixOfGoodCodeunits(String s) {
}
} else if (0xd800 <= ch) {
if (ch <= 0xdfff) {
if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
++i; // Skip over low surrogate since we know it's ok.
if (i + 1 < n ) {
// could be a surrogate pair
char cn = s.charAt(i+1);
if( Character.isSurrogatePair(ch,cn) ) {
int cp = Character.toCodePoint(ch, cn);
// Could be a non-character
if ((cp & 0xfffe) == 0xfffe) {
// not valid
return i;
}

// skip over trailing surrogate since we know it is OK
i++;
} else {
// not a surrogate pair
return i;
}
} else {
// isolated surrogate at end of string
return i;
}
} else if ((ch & 0xfffe) == 0xfffe) {
} else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) {
return i;
}
}
}
return -1;
}

/**
* Appends an encoded form of plainText to output where the encoding is
* sufficient to prevent an HTML parser from interpreting any characters in
Expand Down Expand Up @@ -196,6 +217,7 @@ static void encodePcdataOnto(String plainText, Appendable output)
encodeHtmlOnto(plainText, output, "{<!-- -->");
}


/**
* Appends an encoded form of plainText to output where the encoding is
* sufficient to prevent an HTML parser from transitioning out of the
Expand Down Expand Up @@ -240,127 +262,94 @@ private static void encodeHtmlOnto(
char ch = plainText.charAt(i);
if (ch < REPLACEMENTS.length) { // Handles all ASCII.
String repl = REPLACEMENTS[ch];
if (ch == '{' && repl == null) {
if (i + 1 == n || plainText.charAt(i + 1) == '{') {
repl = braceReplacement;
if( repl==null ) {
if (ch == '{') {
if (i + 1 == n || plainText.charAt(i + 1) == '{') {
// "{{" detected, so use the brace replacement
repl = braceReplacement;
}
}
if (ch == '\r') {
// If this CR is followed by a LF, just remove it. Otherwise replace it with a LF.
if (i + 1 == n || plainText.charAt(i + 1) != '\n' ) {
// CR not followed by LF, so turn into LF
repl = "\n";
} else {
// CRLF, so remove CR
repl = "";
}
}
}
if (repl != null) {
output.append(plainText, pos, i).append(repl);
pos = i + 1;
}
} else if ((0x93A <= ch && ch <= 0xC4C)
&& (
// Devanagari vowel
ch <= 0x94F
// Bengali vowels
|| 0x985 <= ch && ch <= 0x994
|| 0x9BE <= ch && ch < 0x9CC // 0x9CC (Bengali AU) is ok
|| 0x9E0 <= ch && ch <= 0x9E3
// Telugu vowels
|| 0xC05 <= ch && ch <= 0xC14
|| 0xC3E <= ch && ch != 0xC48 /* 0xC48 (Telugu AI) is ok */)) {
// https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/
// > So, ultimately, the full set of cases that cause the crash are:
// > Any sequence <consonant1, virama, consonant2, ZWNJ, vowel>
// > in Devanagari, Bengali, and Telugu, where: ...

// TODO: This is needed as of February 2018, but hopefully not long after that.
// We eliminate the ZWNJ which seems the minimally damaging thing to do to
// Telugu rendering per the article above:
// > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts.

if (pos < i) {
if (plainText.charAt(i - 1) == 0x200C /* ZWNJ */) {
output.append(plainText, pos, i - 1);
// Drop the ZWNJ on the floor.
pos = i;
}
} else if (output instanceof StringBuilder) {
StringBuilder sb = (StringBuilder) output;
int len = sb.length();
if (len != 0) {
if (sb.charAt(len - 1) == 0x200C /* ZWNJ */) {
sb.setLength(len - 1);
}
}
}
} else if (((char) 0xd800) <= ch) {
if (ch <= ((char) 0xdfff)) {
char next;
if (i + 1 < n
&& Character.isSurrogatePair(
ch, next = plainText.charAt(i + 1))) {
// Emit supplemental codepoints as entity so that they cannot
// be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
// and get involved in UTF-16/UCS-2 confusion.
int codepoint = Character.toCodePoint(ch, next);
output.append(plainText, pos, i);
} else if (RISKY_NORMALIZATION.contains(ch)) {
// Application of unicode compatibility normalization produces a risky character.
output.append(plainText, pos, i);
pos = i + 1;
appendNumericEntity(ch,output);
} else if ((ch <= 0x9f) || (0xfdd0 <= ch && ch <= 0xfdef) || ((ch & 0xfffe) == 0xfffe)) {
// Elide C1 escapes and BMP non-characters.
output.append(plainText, pos, i);
pos = i + 1;
} else if (0xd800 <= ch && ch <= 0xdfff) {
// handle surrogates
char next;
if (i + 1 < n && Character.isSurrogatePair(ch, next = plainText.charAt(i + 1))) {
// Emit supplemental codepoints as entity so that they cannot
// be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
// and get involved in UTF-16/UCS-2 confusion.
int codepoint = Character.toCodePoint(ch, next);
output.append(plainText, pos, i);
// do not append 0xfffe and 0xffff from any plane
if( (codepoint & 0xfffe) != 0xfffe ) {
appendNumericEntity(codepoint, output);
++i;
pos = i + 1;
} else {
output.append(plainText, pos, i);
// Elide the orphaned surrogate.
pos = i + 1;
}
} else if (0xfe60 <= ch) {
// Is a control character or possible full-width version of a
// special character, a BOM, or one of the FE60 block that might
// be elided or normalized to an HTML special character.
// Running
// cat NormalizationText.txt \
// | perl -pe 's/ ?#.*//' \
// | egrep '(;003C(;|$)|003E|0026|0022|0027|0060)'
// dumps a list of code-points that can normalize to HTML special
// characters.
++i;
pos = i + 1;
} else {
output.append(plainText, pos, i);
// Elide the orphaned surrogate.
pos = i + 1;
if ((ch & 0xfffe) == 0xfffe) {
// Elide since not an XML Character.
} else {
appendNumericEntity(ch, output);
}
}
} else if (ch == '\u1FEF') { // Normalizes to backtick.
output.append(plainText, pos, i).append("&#8175;");
pos = i + 1;
}
}
output.append(plainText, pos, n);
}


/**
 * Appends a codepoint to the output as a numeric character reference.
 *
 * <p>Codepoints below 100 are written in decimal ({@code &#NN;}) because that is the
 * shortest representation; every other codepoint is written in lowercase hexadecimal
 * ({@code &#xHHHH;}) without leading zeros.
 *
 * @param codepoint the codepoint
 * @param output the output
 *
 * @throws IOException if the output cannot be written to
 * @throws IllegalArgumentException if the codepoint cannot be represented as a numeric escape:
 *     any value at or below 0x1f other than TAB (9) and LF (0xa), any value in 0x7f-0x9f,
 *     any value in 0xfdd0-0xfdef, or any value whose low 16 bits are 0xfffe or 0xffff.
 */
@TCB
static void appendNumericEntity(int codepoint, Appendable output)
    throws IOException {
  if (((codepoint <= 0x1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) {
    throw new IllegalArgumentException("Illegal numeric escape. Cannot represent control code: " + codepoint);
  }
  if ((0xfdd0 <= codepoint && codepoint <= 0xfdef) || ((codepoint & 0xfffe) == 0xfffe)) {
    throw new IllegalArgumentException("Illegal numeric escape. Cannot represent non-character: " + codepoint);
  }

  output.append("&#");
  if (codepoint < 100) {
    // Below 100, a decimal representation is shortest.
    output.append(Integer.toString(codepoint));
  } else {
    // Append a hexadecimal value. The digits must be written exactly once: emitting them
    // through both a manual loop and Integer.toHexString would corrupt the reference.
    output.append('x');
    output.append(Integer.toHexString(codepoint));
  }
  output.append(";");
}

/** Lowercase hexadecimal digit characters, indexed by the value (0-15) of the nibble they represent. */
private static final char[] HEX_NUMERAL = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
};

/** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
private static final String[] REPLACEMENTS = new String[0x80];
static {
Expand All @@ -385,17 +374,41 @@ static void appendNumericEntity(int codepoint, Appendable output)
REPLACEMENTS['>'] = "&gt;"; // HTML special.
REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation.
REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter.
REPLACEMENTS['\u007f'] = ""; // Elide delete
}

/**
 * {@code IS_BANNED_ASCII[i]}, where {@code i} is an ASCII control character codepoint (&lt; 0x20),
 * is true for control characters that are not allowed in an XML source text.
 * Only TAB, LF and CR are left unbanned.
 */
private static final boolean[] IS_BANNED_ASCII = new boolean[0x20];
static {
  for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
    IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
  }
}

/**
 * Unmodifiable set of the Unicode characters whose compatibility decomposition includes a
 * non-alphanumeric ASCII character.
 */
static final Set<Character> RISKY_NORMALIZATION;
static {
  // Individual characters that decompose riskily.
  final String isolated = "\u037e\u1fef\u203c\u207a\u208a\u2100\u2101\u2105\u2106\u2260\u226e\u226f\u33c2\u33c7\u33d8\ufb29\ufe10\ufe19\ufe30\ufe47\ufe48\ufe52";

  // Consecutive (first, last) character pairs; each pair is an inclusive range.
  final String bounds =
      "\u2024\u2026\u2047\u2049\u207c\u207e\u208c\u208e\u2474\u24b5\u2a74\u2a76\u3200\u321e\u3220\u3243\ufe13\ufe16\ufe33"
      + "\ufe38\ufe4d\ufe50\ufe54\ufe57\ufe59\ufe5c\ufe5f\ufe66\ufe68\ufe6b\uff01\uff0f\uff1a\uff20\uff3b\uff40\uff5b\uff5e";

  Set<Character> risky = new HashSet<Character>();

  for (int idx = 0; idx < isolated.length(); idx++) {
    risky.add(isolated.charAt(idx));
  }

  int cursor = 0;
  while (cursor < bounds.length()) {
    int first = bounds.charAt(cursor);
    int last = bounds.charAt(cursor + 1);
    for (int code = first; code <= last; code++) {
      risky.add((char) code);
    }
    cursor += 2;
  }

  RISKY_NORMALIZATION = Collections.unmodifiableSet(risky);
}
}
21 changes: 16 additions & 5 deletions src/main/java/org/owasp/html/HtmlLexer.java
Expand Up @@ -527,7 +527,7 @@ private HtmlToken parseToken() {
break;
}
}
} else if (!Character.isWhitespace(ch)) {
} else if (!isAsciiWhitespace(ch)) {
type = HtmlTokenType.TEXT;
for (; end < limit; ++end) {
ch = input.charAt(end);
Expand All @@ -538,12 +538,12 @@ private HtmlToken parseToken() {
&& '>' == input.charAt(end + 1)) {
break;
} else if ('>' == ch || '=' == ch
|| Character.isWhitespace(ch)) {
|| isAsciiWhitespace(ch)) {
break;
} else if ('"' == ch || '\'' == ch) {
if (end + 1 < limit) {
char ch2 = input.charAt(end + 1);
if (Character.isWhitespace(ch2)
if (isAsciiWhitespace(ch2)
|| ch2 == '>' || ch2 == '/') {
++end;
break;
Expand All @@ -554,7 +554,7 @@ private HtmlToken parseToken() {
} else {
// We skip whitespace tokens inside tag bodies.
type = HtmlTokenType.IGNORABLE;
while (end < limit && Character.isWhitespace(input.charAt(end))) {
while (end < limit && isAsciiWhitespace(input.charAt(end))) {
++end;
}
}
Expand Down Expand Up @@ -604,7 +604,7 @@ private HtmlToken parseToken() {
ch = input.charAt(end);
switch (state) {
case TAGNAME:
if (Character.isWhitespace(ch)
if (isAsciiWhitespace(ch)
|| '>' == ch || '/' == ch || '<' == ch) {
// End processing of an escape exempt block when we see
// a corresponding end tag.
Expand Down Expand Up @@ -749,6 +749,17 @@ private String canonicalElementName(int start, int end) {
return HtmlLexer.canonicalElementName(input.substring(start, end));
}

/**
 * Reports whether a character is whitespace under the HTML rules, which recognise only a small set of ASCII
 * characters. Unicode whitespace outside that set is not counted.
 *
 * @param ch the character to test
 *
 * @return true for SPACE, TAB, LF, CR or FF; false for every other value
 */
private static boolean isAsciiWhitespace(int ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}

/**
 * Reports whether {@code ch} is an ASCII letter.
 *
 * @param ch the character to test
 * @return true if {@code ch} lies in {@code A-Z} or {@code a-z}
 */
private static boolean isIdentStart(char ch) {
  boolean upperCase = 'A' <= ch && ch <= 'Z';
  boolean lowerCase = 'a' <= ch && ch <= 'z';
  return upperCase || lowerCase;
}
Expand Down

0 comments on commit 2d10ece

Please sign in to comment.