Skip to content
Permalink
Browse files

added preprocessing and post-processing options to HtmlPolicyBuilder …

…so that clients no longer are tempted to do search/replace on sanitized output
  • Loading branch information...
mikesamuel committed May 25, 2016
1 parent bb7f71e commit 6337aba5fc95d7080cf419a201564a48fb03973c
@@ -172,6 +172,10 @@
private final Set<String> skipIfEmpty = Sets.newLinkedHashSet(
DEFAULT_SKIP_IF_EMPTY);
private final Map<String, Boolean> textContainers = Maps.newLinkedHashMap();
private HtmlStreamEventProcessor postprocessor =
HtmlStreamEventProcessor.Processors.IDENTITY;
private HtmlStreamEventProcessor preprocessor =
HtmlStreamEventProcessor.Processors.IDENTITY;
private boolean requireRelNofollowOnLinks;

/**
@@ -445,6 +449,34 @@ public HtmlPolicyBuilder allowStyling(CssSchema whitelist) {
return this;
}

/**
 * Adds a pre-processor to the pipeline, between the lexer and the policy.
 * Because pre-processors see HTML events before the policy does, anything
 * they add is still subject to the policy.  They cannot bypass the policy,
 * so they are not part of the TCB.
 * <p>
 * Repeated calls accumulate: {@code pp} is composed with the pre-processor
 * already held by this builder instead of replacing it.
 *
 * @param pp the pre-processor to add.
 * @return {@code this}, to allow call chaining.
 */
public HtmlPolicyBuilder withPreprocessor(HtmlStreamEventProcessor pp) {
  HtmlStreamEventProcessor combined =
      HtmlStreamEventProcessor.Processors.compose(this.preprocessor, pp);
  this.preprocessor = combined;
  return this;
}

/**
 * Adds a post-processor to the pipeline, between the policy and the output
 * sink.
 * Events that a post-processor inserts into the stream are not vetted by the
 * policy, which means post-processors are part of the TCB.
 * <p>
 * Prefer a pre-processor when it can do the job.  Either kind of processor
 * is, however, a much better choice than doing search/replace on an already
 * sanitized string.
 * <p>
 * Repeated calls accumulate: {@code pp} is composed with the post-processor
 * already held by this builder instead of replacing it.
 *
 * @param pp the post-processor to add.
 * @return {@code this}, to allow call chaining.
 */
public HtmlPolicyBuilder withPostprocessor(HtmlStreamEventProcessor pp) {
  HtmlStreamEventProcessor combined =
      HtmlStreamEventProcessor.Processors.compose(this.postprocessor, pp);
  this.postprocessor = combined;
  return this;
}

/**
* Names of attributes from HTML 4 whose values are URLs.
* Other attributes, e.g. <code>style</code> may contain URLs even though
@@ -499,7 +531,8 @@ public PolicyFactory toFactory() {
}
}
return new PolicyFactory(compilePolicies(), textContainerSet.build(),
ImmutableMap.copyOf(globalAttrPolicies));
ImmutableMap.copyOf(globalAttrPolicies),
preprocessor, postprocessor);
}

// Speed up subsequent builds by caching the compiled policies.
@@ -99,33 +99,41 @@
* {@link HtmlStreamRenderer} after filtering.
* {@link HtmlPolicyBuilder} provides an easy way to create policies.
*/
public static void sanitize(@Nullable String html, final Policy policy) {
if (html == null) { html = ""; }

TagBalancingHtmlStreamEventReceiver balancer
= new TagBalancingHtmlStreamEventReceiver(policy);

// According to Opera the maximum table nesting depth seen in the wild is
// 795, but 99.99% of documents have a table nesting depth of less than 22.
// Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
// document depth of 90 (incl. HTML & BODY).
// Obviously table nesting depth is not the same as whole document depth,
// but it is the best proxy I have available.
// See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
// the original data.
public static void sanitize(
    @Nullable String html, final Policy policy) {
  // No pre-processor was supplied, so delegate to the three-argument
  // overload with the identity processor.
  HtmlStreamEventProcessor noPreprocessing =
      HtmlStreamEventProcessor.Processors.IDENTITY;
  sanitize(html, policy, noPreprocessing);
}

// Webkit defines the maximum HTML parser tree depth as 512.
// http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
// static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
/**
* Sanitizes the given HTML by applying the given policy to it.
*
* <p>
* This method is not in the TCB.
*
* <p>
* This method has no return value since policies are assumed to render things
* they accept and do nothing on things they reject.
* Use {@link HtmlStreamRenderer} to render content to an output buffer.
*
* @param html A snippet of HTML to sanitize. {@code null} is treated as the
* empty string and will not result in a {@code NullPointerException}.
* @param policy The Policy that will receive events based on the tokens in
* HTML. Typically, this policy ends up routing the events to an
* {@link HtmlStreamRenderer} after filtering.
* {@link HtmlPolicyBuilder} provides an easy way to create policies.
* @param preprocessor A processor that may wrap the policy to reinterpret
* parse events.
* Since the policy encapsulates its output buffer, this is not in the
* policy's TCB.
*/
public static void sanitize(
@Nullable String html, final Policy policy,
HtmlStreamEventProcessor preprocessor) {
if (html == null) { html = ""; }

// The first number gives us a lower bound on the nesting depth we allow,
// 90, and the second gives us an upper bound: 512.
// We do not want to bump right up against that limit.
// 256 is substantially larger than the lower bound and well clear of the
// upper bound.
balancer.setNestingLimit(256);
HtmlStreamEventReceiver receiver = initializePolicy(policy, preprocessor);

balancer.openDocument();
receiver.openDocument();

HtmlLexer lexer = new HtmlLexer(html);
// Use a linked list so that policies can use Iterator.remove() in an O(1)
@@ -135,16 +143,16 @@ public static void sanitize(@Nullable String html, final Policy policy) {
HtmlToken token = lexer.next();
switch (token.type) {
case TEXT:
balancer.text(
receiver.text(
Encoding.decodeHtml(html.substring(token.start, token.end)));
break;
case UNESCAPED:
balancer.text(Encoding.stripBannedCodeunits(
receiver.text(Encoding.stripBannedCodeunits(
html.substring(token.start, token.end)));
break;
case TAGBEGIN:
if (html.charAt(token.start + 1) == '/') { // A close tag.
balancer.closeTag(HtmlLexer.canonicalName(
receiver.closeTag(HtmlLexer.canonicalName(
html.substring(token.start + 2, token.end)));
while (lexer.hasNext()
&& lexer.next().type != HtmlTokenType.TAGEND) {
@@ -182,7 +190,7 @@ public static void sanitize(@Nullable String html, final Policy policy) {
if (!attrsReadyForName) {
attrs.add(attrs.getLast());
}
balancer.openTag(
receiver.openTag(
HtmlLexer.canonicalName(
html.substring(token.start + 1, token.end)),
attrs);
@@ -195,7 +203,7 @@ public static void sanitize(@Nullable String html, final Policy policy) {
}
}

balancer.closeDocument();
receiver.closeDocument();
}

private static String stripQuotes(String encodedAttributeValue) {
@@ -216,4 +224,31 @@ private static String stripQuotes(String encodedAttributeValue) {
return encodedAttributeValue;
}


/**
 * Builds the receiver that the lexer loop feeds: a tag balancer in front of
 * {@code policy}, with a fixed nesting limit, wrapped by
 * {@code preprocessor}.
 *
 * @param policy the policy that the tag balancer is constructed around.
 * @param preprocessor the processor used to wrap the tag balancer.
 * @return whatever {@code preprocessor.wrap} returns for the tag balancer.
 */
private static HtmlStreamEventReceiver initializePolicy(
    Policy policy, HtmlStreamEventProcessor preprocessor) {
  TagBalancingHtmlStreamEventReceiver tagBalancer =
      new TagBalancingHtmlStreamEventReceiver(policy);

  // Rationale for the nesting limit chosen below.
  //
  // Lower bound: according to Opera, the deepest table nesting seen in the
  // wild is 795, but 99.99% of documents nest tables fewer than 22 deep.
  // Each table contributes a depth of 4 (incl. TBODY), which gives a
  // document depth of 90 (incl. HTML & BODY).
  // Table nesting depth is obviously not the same thing as whole document
  // depth, but it is the best proxy available.
  // Original data:
  // http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm
  //
  // Upper bound: Webkit defines the maximum HTML parser tree depth as 512.
  // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
  // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
  //
  // So we must allow at least 90 and should stay under 512 without bumping
  // right up against it.  256 is substantially above the lower bound and
  // well clear of the upper bound.
  tagBalancer.setNestingLimit(256);

  HtmlStreamEventReceiver wrapped = preprocessor.wrap(tagBalancer);
  return wrapped;
}
}
@@ -0,0 +1,53 @@
package org.owasp.html;

/**
 * A hook that is handed an output sink and may return a wrapper around it,
 * so that user code can process events on their way to that sink.
 */
public interface HtmlStreamEventProcessor {
  /**
   * @param sink an HTML stream event receiver that can take events from a
   *     sanitizer policy to build a safe output on an appropriate buffer.
   * @return an HTML stream event receiver that can take events from a
   *     sanitizer policy and builds a safe output on an appropriate buffer
   *     by sending events to {@code sink}.
   */
  HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink);

  /** The identity processor and a combinator for chaining processors. */
  public static final class Processors {
    /**
     * A processor that does no additional work: {@code wrap} hands back the
     * very sink it was given.
     */
    public static final HtmlStreamEventProcessor IDENTITY =
        new HtmlStreamEventProcessor() {

          public HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink) {
            return sink;
          }

          @Override
          public String toString() {
            return "[identity]";
          }
        };

    /**
     * Combines two processors into one.
     * If either argument is {@link #IDENTITY} the other argument is returned
     * as-is, so no extra wrapper object is allocated.
     *
     * @param g the processor applied last, i.e. the one whose receiver ends
     *     up outermost.
     * @param f the processor applied first, directly to the sink.
     * @return a processor whose {@code wrap(sink)} yields
     *     {@code g.wrap(f.wrap(sink))}.
     */
    public static HtmlStreamEventProcessor compose(
        final HtmlStreamEventProcessor g, final HtmlStreamEventProcessor f) {
      // Composing with the identity changes nothing; skip the wrapper.
      if (g == IDENTITY) { return f; }
      if (f == IDENTITY) { return g; }

      return new HtmlStreamEventProcessor() {
        public HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink) {
          HtmlStreamEventReceiver inner = f.wrap(sink);
          return g.wrap(inner);
        }

        @Override
        public String toString() {
          // Rendered in function-composition notation with a ring operator.
          return new StringBuilder()
              .append("(")
              .append(g)
              .append(" \u2218 ")
              .append(f)
              .append(")")
              .toString();
        }
      };
    }
  }
}
@@ -56,20 +56,26 @@
private final ImmutableMap<String, ElementAndAttributePolicies> policies;
private final ImmutableMap<String, AttributePolicy> globalAttrPolicies;
private final ImmutableSet<String> textContainers;
private final HtmlStreamEventProcessor preprocessor;
private final HtmlStreamEventProcessor postprocessor;

PolicyFactory(
ImmutableMap<String, ElementAndAttributePolicies> policies,
ImmutableSet<String> textContainers,
ImmutableMap<String, AttributePolicy> globalAttrPolicies) {
ImmutableMap<String, AttributePolicy> globalAttrPolicies,
HtmlStreamEventProcessor preprocessor,
HtmlStreamEventProcessor postprocessor) {
this.policies = policies;
this.textContainers = textContainers;
this.globalAttrPolicies = globalAttrPolicies;
this.preprocessor = preprocessor;
this.postprocessor = postprocessor;
}

/** Produces a sanitizer that emits tokens to {@code out}. */
public HtmlSanitizer.Policy apply(@Nonnull HtmlStreamEventReceiver out) {
return new ElementAndAttributePolicyBasedSanitizerPolicy(
out, policies, textContainers);
postprocessor.wrap(out), policies, textContainers);
}

/**
@@ -120,8 +126,11 @@ public String sanitize(@Nullable String html) {
StringBuilder out = new StringBuilder(html.length());
HtmlSanitizer.sanitize(
html,
apply(HtmlStreamRenderer.create(out, Handler.DO_NOTHING),
listener, context));
apply(
HtmlStreamRenderer.create(out, Handler.DO_NOTHING),
listener,
context),
preprocessor);
return out.toString();
}

@@ -193,6 +202,14 @@ public PolicyFactory and(PolicyFactory f) {
}
allGlobalAttrPolicies = ab.build();
}
return new PolicyFactory(b.build(), allTextContainers, allGlobalAttrPolicies);
HtmlStreamEventProcessor compositionOfPreprocessors
= HtmlStreamEventProcessor.Processors.compose(
this.preprocessor, f.preprocessor);
HtmlStreamEventProcessor compositionOfPostprocessors
= HtmlStreamEventProcessor.Processors.compose(
this.postprocessor, f.postprocessor);
return new PolicyFactory(
b.build(), allTextContainers, allGlobalAttrPolicies,
compositionOfPreprocessors, compositionOfPostprocessors);
}
}

0 comments on commit 6337aba

Please sign in to comment.
You can’t perform that action at this time.