From 7dca1304a0210b346873c0448d9247bfe189aec0 Mon Sep 17 00:00:00 2001 From: Ween Jiann Lee Date: Mon, 7 Jan 2019 15:08:01 +0800 Subject: [PATCH 1/2] Updated AsyncFetcher to expose more HttpAsyncClientBuilder options --- .../preferred/venom/fetcher/AsyncFetcher.java | 107 ++++++++++++++++-- 1 file changed, 99 insertions(+), 8 deletions(-) diff --git a/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java b/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java index de3cba4..4cbaed1 100644 --- a/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java +++ b/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java @@ -33,6 +33,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.http.HttpHost; import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.RedirectStrategy; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; @@ -50,6 +51,7 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import javax.net.ssl.SSLContext; import javax.validation.constraints.NotNull; import java.io.IOException; import java.net.URI; @@ -174,14 +176,21 @@ private AsyncFetcher(final Builder builder) { final IOReactorConfig reactorConfig = IOReactorConfig.custom() .setIoThreadCount(builder.numIoThreads) .setSoKeepAlive(true) - .setTcpNoDelay(true) .setConnectTimeout(builder.connectTimeout) .setSoTimeout(builder.socketTimeout) .build(); final HttpAsyncClientBuilder clientBuilder = HttpAsyncClientBuilder.create() .setDefaultIOReactorConfig(reactorConfig) - .setThreadFactory(builder.threadFactory); + .setThreadFactory(builder.threadFactory) + .setMaxConnPerRoute(builder.maxRouteConnections) + .setMaxConnTotal(builder.maxConnections) + .setSSLContext(builder.sslContext) + .setRedirectStrategy(builder.redirectStrategy); + + if (builder.disableCookies) { + clientBuilder.disableCookieManagement(); + } 
if (builder.compressed) { clientBuilder.addInterceptorLast(new RequestAcceptEncoding()); @@ -422,6 +431,11 @@ public static class Builder { */ private final List callbacks; + /** + * Determines whether cookie storage is allowed. + */ + private boolean disableCookies; + /** * The file manager used to store raw responses. */ @@ -437,11 +451,26 @@ public static class Builder { */ private int numIoThreads; + /** + * The maximum number of connections allowed. + */ + private int maxConnections; + + /** + * The maximum number of connections allowed per route. + */ + private int maxRouteConnections; + /** * The proxy provider for proxies. */ private ProxyProvider proxyProvider; + /** + * The SSL context for a response. + */ + private SSLContext sslContext; + /** * A list of status code to stop retry. */ @@ -462,6 +491,11 @@ public static class Builder { */ private Validator validator; + /** + * The redirection strategy for a response. + */ + private RedirectStrategy redirectStrategy; + /** * The validator router used. */ @@ -492,8 +526,11 @@ public static class Builder { */ private Builder() { callbacks = new ArrayList<>(); + disableCookies = false; fileManager = null; headers = Collections.emptyMap(); + maxConnections = 0; + maxRouteConnections = 0; numIoThreads = Runtime.getRuntime().availableProcessors(); proxyProvider = null; stopCodes = Collections.emptySet(); @@ -526,6 +563,16 @@ public Builder register(final @NotNull Callback callback) { return this; } + /** + * Disables cookie storage. + * + * @return this + */ + public Builder disableCookies() { + this.disableCookies = true; + return this; + } + /** * Sets the FileManager to be used. Defaults to none. *

@@ -562,10 +609,33 @@ public Builder numIoThreads(final int numIoThreads) { return this; } + /** + * Sets the maximum allowable connections at an instance. + * + * @param maxConnections the max allowable connections. + * @return this + */ + public Builder setMaxConnections(int maxConnections) { + this.maxConnections = maxConnections; + return this; + } + + /** + * Sets the maximum allowable connections at an instance for + * a particular route (host). + * + * @param maxRouteConnections the max allowable connections per route. + * @return this + */ + public Builder setMaxRouteConnections(int maxRouteConnections) { + this.maxRouteConnections = maxRouteConnections; + return this; + } + /** * Sets the ProxyProvider to be used. Defaults to none. * - * @param proxyProvider proxy provider to be used + * @param proxyProvider proxy provider to be used. * @return this */ public Builder proxyProvider(final @NotNull ProxyProvider proxyProvider) { @@ -573,10 +643,21 @@ public Builder proxyProvider(final @NotNull ProxyProvider proxyProvider) { return this; } + /** + * Sets the ssl context for an encrypted response. + * + * @param sslContext SSLContext to be used. + * @return this + */ + public Builder setSslContext(SSLContext sslContext) { + this.sslContext = sslContext; + return this; + } + /** * Set a list of stop code that will interrupt crawling. * - * @param codes A list of stop codes + * @param codes A list of stop codes. * @return this */ public Builder stopCodes(final int... codes) { @@ -643,6 +724,17 @@ public Builder validator(final @NotNull Validator... validators) { return this; } + /** + * Sets the redirection strategy for a response received by the fetcher. + * + * @param redirectStrategy redirection strategy to be used. + * @return this + */ + public Builder setRedirectStrategy(RedirectStrategy redirectStrategy) { + this.redirectStrategy = redirectStrategy; + return this; + } + /** * Sets ValidatorRouter to be used. Defaults to none. 
* Validator rules set in validator will always be used. @@ -694,14 +786,13 @@ public Builder socketTimeout(final int socketTimeout) { } /** - * Set whether to request for compress pages and to decompress pages + * Disables request for compress pages and to decompress pages * after it is fetched. Defaults to true. * - * @param compressed should request for compress pages * @return this */ - public Builder compressed(final boolean compressed) { - this.compressed = compressed; + public Builder disableCompression() { + this.compressed = false; return this; } From 3af8aa60c1ab5fff1a82f4cfa382406a1e30dd1b Mon Sep 17 00:00:00 2001 From: Ween Jiann Lee Date: Mon, 7 Jan 2019 15:18:04 +0800 Subject: [PATCH 2/2] Updated AsyncFetcher to remove checkstyle warnings --- .../java/ai/preferred/venom/fetcher/AsyncFetcher.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java b/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java index 4cbaed1..7014014 100644 --- a/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java +++ b/src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java @@ -615,7 +615,7 @@ public Builder numIoThreads(final int numIoThreads) { * @param maxConnections the max allowable connections. * @return this */ - public Builder setMaxConnections(int maxConnections) { + public Builder setMaxConnections(final int maxConnections) { this.maxConnections = maxConnections; return this; } @@ -627,7 +627,7 @@ public Builder setMaxConnections(int maxConnections) { * @param maxRouteConnections the max allowable connections per route. 
* @return this */ - public Builder setMaxRouteConnections(int maxRouteConnections) { + public Builder setMaxRouteConnections(final int maxRouteConnections) { this.maxRouteConnections = maxRouteConnections; return this; } @@ -649,7 +649,7 @@ public Builder proxyProvider(final @NotNull ProxyProvider proxyProvider) { * @param sslContext SSLContext to be used. * @return this */ - public Builder setSslContext(SSLContext sslContext) { + public Builder setSslContext(final SSLContext sslContext) { this.sslContext = sslContext; return this; } @@ -730,7 +730,7 @@ public Builder validator(final @NotNull Validator... validators) { * @param redirectStrategy redirection strategy to be used. * @return this */ - public Builder setRedirectStrategy(RedirectStrategy redirectStrategy) { + public Builder setRedirectStrategy(final RedirectStrategy redirectStrategy) { this.redirectStrategy = redirectStrategy; return this; }