Skip to content

Commit

Permalink
Merge pull request #109 from maqzi/rewriteFetcher
Browse files Browse the repository at this point in the history
#95: HTTP Fetcher rewritten using OKHTTP3
  • Loading branch information
aecio committed Aug 9, 2017
2 parents 779f155 + 31ede49 commit 484de1a
Show file tree
Hide file tree
Showing 8 changed files with 702 additions and 6 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ dependencies {
//compile group: 'com.amazonaws', name: 'aws-java-sdk', version: '1.0.12'
compile group: 'com.amazonaws', name: 'aws-java-sdk', version: '1.11.150'
compile 'org.jsoup:jsoup:1.10.3'
compile 'com.squareup.okhttp3:okhttp:3.8.1'

// REST server dependencies
compile "com.sparkjava:spark-core:2.5.3"
Expand All @@ -61,6 +62,7 @@ dependencies {
testCompile 'junit:junit:4.12'
testCompile 'org.hamcrest:hamcrest-all:1.3'
testCompile 'org.eclipse.jetty:jetty-server:9.3.6.v20151106' // for tests of crawler commons library fork
testCompile 'com.squareup.okhttp3:mockwebserver:3.8.1'

// TODO fill these in from Maven Central instead of that local libs dir
compile files('libs/jsonic-1.2.0.jar') // recommend updating this, it's for langdetect.jar
Expand Down
3 changes: 3 additions & 0 deletions config/sample_config/ache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,5 +146,8 @@ crawler_manager.downloader.valid_mime_types:
- application/xhtml+xml
- application/vnd.wap.xhtml+xml

# Use the OkHttp3-based fetcher instead of the default SimpleHttpFetcher
crawler_manager.downloader.use_okhttp3_fetcher: true

# Discovery of new links using sitemap.xml protocol
link_storage.download_sitemap_xml: false
9 changes: 9 additions & 0 deletions docs/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ Is there any limit on number of crawled webpages per website?
There is no limit by default, but you can set a hard limit in the configuration file using the key ``link_storage.max_pages_per_domain``.
You can enable this so that the crawler doesn't get trapped by particular domains, and to favor crawling a larger number of domains as opposed to focusing on a few domains.

Why am I getting a *SSL Handshake Exception* for some sites?
----------------------------------------------------------------------------------------------
A ``javax.net.ssl.SSLHandshakeException : handshake_failure`` usually occurs when the server and ACHE cannot agree on a common cipher suite. This typically happens when the JVM is restricted to a limited security cipher suite. The easiest workaround is to use OpenJDK 8+, which ships with the Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy out of the box. To install the JCE on the Oracle JDK, follow the instructions `here <https://github.com/ViDA-NYU/ache/issues/95>`_.

Why am I getting a *SSL Protocol Exception* for some sites?
---------------------------------------------------------------------------------------------
A ``javax.net.ssl.SSLProtocolException : unrecognized_name`` indicates a server misconfiguration, most often on a website hosted on a virtual server. A simple workaround is to disable the SNI extension by adding ``-Djsse.enableSNIExtension=false`` to the JVM options when running ACHE. Keep in mind, however, that disabling SNI will cause certificate validation failures on sites that serve multiple hostnames from a shared IP.


Where to report bugs?
---------------------
We welcome feedback. Please submit any suggestions or bug reports using the Github issue tracker (https://github.com/ViDA-NYU/ache/issues)
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ public static class Cookie {
public String path = "/";
}

@JsonProperty("crawler_manager.downloader.use_okhttp3_fetcher")
private String okHttpFetcher = null;

@JsonProperty("crawler_manager.downloader.download_thread_pool_size")
private int downloadThreadPoolSize = 100;
Expand Down Expand Up @@ -50,6 +52,12 @@ public HttpDownloaderConfig() {
// Required for de-serialization
}

/**
 * Creates a downloader configuration that selects the OkHttp3-based fetcher.
 *
 * @param okHttpFetcher fetcher selector; the OkHttp fetcher is enabled only
 *                      when this is exactly {@code "okHttp"} — any other
 *                      value (including {@code null}) leaves the default
 *                      fetcher selected
 */
public HttpDownloaderConfig(String okHttpFetcher) {
    // Constant-first equals() avoids a NullPointerException when null is passed
    // (the original called okHttpFetcher.equals(...) and would crash on null).
    if ("okHttp".equals(okHttpFetcher)) {
        // FetcherFactory only checks this field for non-null; "True" mirrors
        // the value produced when de-serializing the YAML config key
        // crawler_manager.downloader.use_okhttp3_fetcher.
        this.okHttpFetcher = "True";
    }
}

/** @return the HTTP cookies configured for the downloader. */
public List<Cookie> getCookies() {
    return cookies;
}
Expand Down Expand Up @@ -94,4 +102,7 @@ public String getUserAgentEmail() {
return userAgentEmail;
}

/**
 * @return a non-null marker value when the OkHttp3-based fetcher should be
 *         used (FetcherFactory only tests this for non-null), or {@code null}
 *         to use the default fetcher
 */
public String getOkHttpFetcher() {
    return this.okHttpFetcher;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package focusedCrawler.crawler.async.fetcher;


import okhttp3.CipherSuite;

import java.util.Arrays;
import java.util.List;

/**
 * Provides an extended, fixed list of TLS cipher suites for the OkHttp-based
 * fetcher, broadening the set of servers the crawler can complete a TLS
 * handshake with (see the SSL handshake FAQ entry and issue #95).
 */
public class CustomCipherSuites {

    // Built once at class-load time. The original version reassigned this
    // static field from the instance constructor, which was misleading (every
    // construction rewrote shared state) and left the getter returning null
    // until at least one instance had been created.
    // NOTE(review): several ECDHE entries appear twice in this list (e.g.
    // TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256); duplicates are preserved here
    // to keep behavior identical — confirm whether deduplication is safe.
    private static final List<CipherSuite> CUSTOM_CIPHER_SUITES = Arrays.asList(
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_DHE_DSS_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_DHE_DSS_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_DHE_DSS_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_DHE_DSS_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_DHE_DSS_WITH_AES_256_CBC_SHA256,
            CipherSuite.TLS_DHE_DSS_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_DHE_RSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_DHE_RSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_DHE_RSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_DHE_RSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_DHE_RSA_WITH_AES_256_CBC_SHA256,
            CipherSuite.TLS_DHE_RSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_DH_anon_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_DH_anon_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_DH_anon_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_DH_anon_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_DH_anon_WITH_AES_256_CBC_SHA256,
            CipherSuite.TLS_DH_anon_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDHE_ECDSA_WITH_NULL_SHA,
            CipherSuite.TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384,
            CipherSuite.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDHE_RSA_WITH_NULL_SHA,
            CipherSuite.TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384,
            CipherSuite.TLS_ECDH_ECDSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDH_ECDSA_WITH_NULL_SHA,
            CipherSuite.TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384,
            CipherSuite.TLS_ECDH_RSA_WITH_AES_256_GCM_SHA384,
            CipherSuite.TLS_ECDH_RSA_WITH_NULL_SHA,
            CipherSuite.TLS_ECDH_anon_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_ECDH_anon_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_ECDH_anon_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_ECDH_anon_WITH_NULL_SHA,
            CipherSuite.TLS_EMPTY_RENEGOTIATION_INFO_SCSV,
            CipherSuite.TLS_KRB5_EXPORT_WITH_DES_CBC_40_MD5,
            CipherSuite.TLS_KRB5_EXPORT_WITH_DES_CBC_40_SHA,
            CipherSuite.TLS_KRB5_WITH_3DES_EDE_CBC_MD5,
            CipherSuite.TLS_KRB5_WITH_3DES_EDE_CBC_SHA,
            CipherSuite.TLS_KRB5_WITH_DES_CBC_MD5,
            CipherSuite.TLS_KRB5_WITH_DES_CBC_SHA,
            CipherSuite.TLS_RSA_WITH_AES_128_CBC_SHA,
            CipherSuite.TLS_RSA_WITH_AES_128_CBC_SHA256,
            CipherSuite.TLS_RSA_WITH_AES_128_GCM_SHA256,
            CipherSuite.TLS_RSA_WITH_AES_256_CBC_SHA,
            CipherSuite.TLS_RSA_WITH_AES_256_CBC_SHA256,
            CipherSuite.TLS_RSA_WITH_NULL_SHA256);

    // Really old and weak cipher suites. They would have to be added
    // separately if a server requires them; not many servers do.
    //
    // CipherSuite.SSL_DHE_DSS_EXPORT_WITH_DES40_CBC_SHA,
    // CipherSuite.SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA,
    // CipherSuite.SSL_DHE_DSS_WITH_DES_CBC_SHA,
    // CipherSuite.SSL_DHE_RSA_EXPORT_WITH_DES40_CBC_SHA,
    // CipherSuite.SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA,
    // CipherSuite.SSL_DHE_RSA_WITH_DES_CBC_SHA,
    // CipherSuite.SSL_DH_anon_EXPORT_WITH_DES40_CBC_SHA,
    // CipherSuite.SSL_DH_anon_WITH_3DES_EDE_CBC_SHA,
    // CipherSuite.SSL_DH_anon_WITH_DES_CBC_SHA,
    // CipherSuite.SSL_RSA_EXPORT_WITH_DES40_CBC_SHA,
    // CipherSuite.SSL_RSA_WITH_3DES_EDE_CBC_SHA,
    // CipherSuite.SSL_RSA_WITH_DES_CBC_SHA,
    // CipherSuite.SSL_RSA_WITH_NULL_MD5,
    // CipherSuite.SSL_RSA_WITH_NULL_SHA,

    /**
     * @return the fixed-size list of supported cipher suites; callers must
     *         not add or remove elements ({@code Arrays.asList} backing)
     */
    public List<CipherSuite> getCustomCipherSuites() {
        return CUSTOM_CIPHER_SUITES;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ public class FetcherFactory {
public static BaseFetcher createFetcher(HttpDownloaderConfig config) {
if(config.getTorProxy() != null) {
return createTorProxyFetcher(config);
} else if(config.getOkHttpFetcher() != null){
return createOkHttpFetcher(config);
} else {
return createSimpleHttpFetcher(config);
}
Expand Down Expand Up @@ -100,4 +102,26 @@ public static CookieStore createCookieStore(HttpDownloaderConfig config) {
return store;
}

// Builds the OkHttp3-backed fetcher from the downloader configuration.
// Selected by createFetcher() when getOkHttpFetcher() returns non-null
// (i.e. the use_okhttp3_fetcher config key is set).
public static OkHttpFetcher createOkHttpFetcher(HttpDownloaderConfig config){
// Identification sent with every request (agent name, contact email,
// web address, and/or a full User-Agent string).
UserAgent userAgent = new UserAgent.Builder()
.setAgentName(config.getUserAgentName())
.setEmailAddress(config.getUserAgentEmail())
.setWebAddress(config.getUserAgentUrl())
.setUserAgentString(config.getUserAgentString())
.build();
int connectionPoolSize = config.getConnectionPoolSize();
OkHttpFetcher httpFetcher = new OkHttpFetcher(connectionPoolSize, userAgent);
// NOTE(review): the redirect limit is fed from getMaxRetryCount() — this
// looks like a copy/paste slip; confirm whether a dedicated max-redirects
// setting was intended instead of the retry count.
httpFetcher.setMaxRedirects(config.getMaxRetryCount());
// NOTE(review): hard-coded to one connection per host — confirm this
// politeness throttle is intentional and should not be configurable.
httpFetcher.setMaxConnectionsPerHost(1);
// Cap downloaded response bodies at 51 MiB.
int defaultMaxContentSize = 51 * 1024 * 1024;
httpFetcher.setDefaultMaxContentSize(defaultMaxContentSize);

// Restrict downloads to the configured MIME types, when any are given.
if(config.getValidMimeTypes() != null) {
for (String mimeTypes : config.getValidMimeTypes()) {
httpFetcher.addValidMimeType(mimeTypes);
}
}
return httpFetcher;
}

}
Loading

0 comments on commit 484de1a

Please sign in to comment.