Merge branch 'master' into issue46
aecio committed Jul 13, 2017
2 parents cad5b0f + bb5a820 commit 8ec0f83
Showing 23 changed files with 309 additions and 111 deletions.
config/config_docker/ache.yml (6 changes: 4 additions & 2 deletions)
@@ -2,8 +2,10 @@
# Example of configuration for running a Focused Crawl
#

-# Configure indexing in Elasticsearch container
-target_storage.data_format.type: ELASTICSEARCH
+# Configure ELASTICSEARCH and FILES data formats
+target_storage.data_formats:
+  - ELASTICSEARCH
+  - FILES
target_storage.data_format.elasticsearch.rest.hosts:
  - http://elasticsearch:9200

config/sample_config/ache.yml (15 changes: 11 additions & 4 deletions)
@@ -40,17 +40,24 @@ target_storage.data_format.files.max_file_size: 134217728 # 128mb in bytes
#target_storage.data_format.elasticsearch.rest.socket_timeout: 30000
#target_storage.data_format.elasticsearch.rest.max_retry_timeout_millis: 90000

+# Instead of configuring a single data format, you can also configure multiple
+# data formats in a list as follows. The settings for each data format should
+# be configured independently, as if you were configuring a single data format.
+# In the following config, data will be pushed to both FILES and ELASTICSEARCH.
+#
+#target_storage.data_formats:
+# - FILES
+# - ELASTICSEARCH

# Performs hard focus or soft focus. When hard focus is enabled,
# the crawler only follows links from pages classified as relevant
target_storage.hard_focus: true

# Run bipartite crawler
target_storage.bipartite: false

# Relevance threshold for classified pages. Pages with probability of being
# relevant above this threshold are considered relevant
target_storage.relevance_threshold: 0.5

# Maximum number of pages to visit
target_storage.visited_page_limit: 10000000
src/main/java/focusedCrawler/link/LinkStorage.java (16 changes: 7 additions & 9 deletions)
@@ -1,8 +1,6 @@
package focusedCrawler.link;

import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -97,16 +95,16 @@ else if(obj instanceof SitemapXmlHandler.SitemapData) {
        }
        return null;
    }

    public void insert(RobotsTxtHandler.RobotsData robotsData) {
        if (disallowSitesInRobotsTxt) {
            this.insertRobotRules(robotsData.link, robotsData.robotRules);
        }
        if (insertSiteMaps) {
            for (String sitemap : robotsData.robotRules.getSitemaps()) {
                try {
-                   frontierManager.insert(new LinkRelevance(sitemap, 299, LinkRelevance.Type.SITEMAP));
-               } catch (MalformedURLException | FrontierPersistentException e) {
+                   frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
+               } catch (Exception e) {
                    logger.error("Failed to insert sitemap from robot: " + sitemap);
                }
            }
@@ -116,17 +114,17 @@ public void insert(RobotsTxtHandler.RobotsData robotsData) {
    public void insert(SitemapXmlHandler.SitemapData sitemapData) {
        for (String link : sitemapData.links) {
            try {
-               frontierManager.insert(new LinkRelevance(link, 1.0d, LinkRelevance.Type.FORWARD));
-           } catch (MalformedURLException | FrontierPersistentException e) {
+               frontierManager.insert(LinkRelevance.createForward(link, 1.0d));
+           } catch (Exception e) {
                logger.error("Failed to insert link into the frontier: "+link);
            }
        }
        logger.info("Added {} URLs from sitemap.", sitemapData.links.size());

        for (String sitemap : sitemapData.sitemaps) {
            try {
-               frontierManager.insert(new LinkRelevance(new URL(sitemap), 299, LinkRelevance.Type.SITEMAP));
-           } catch (MalformedURLException | FrontierPersistentException e) {
+               frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
+           } catch (Exception e) {
                logger.error("Failed to insert sitemap into the frontier: "+sitemap);
            }
        }
src/main/java/focusedCrawler/link/LinkStorageConfig.java (2 changes: 1 addition & 1 deletion)
@@ -96,7 +96,7 @@ public String getMozKey() {
    private int schedulerHostMinAccessInterval = 5000;

    @JsonProperty("link_storage.scheduler.max_links")
-   private int schedulerMaxLinks = 10000;
+   private int schedulerMaxLinks = 100000;

    @JsonProperty("link_storage.persistent_hashtable.backend")
    private String persistentHashtableBackend = "ROCKSDB";
src/main/java/focusedCrawler/link/PolitenessScheduler.java (15 changes: 13 additions & 2 deletions)
@@ -66,7 +66,9 @@ public void clear() {
 *
 */
public class PolitenessScheduler {

+   private static final int MIN_LINKS_PER_DOMAIN_TO_ALLOW_LOAD = 2000;
+
    private final PriorityQueue<DomainNode> domainsQueue;
    private final PriorityQueue<DomainNode> emptyDomainsQueue;
    private final Map<String, DomainNode> domains;
@@ -243,5 +245,14 @@ public boolean canDownloadNow(LinkRelevance link) {
            return isAvailable(domain);
        }
    }

+   public boolean canInsertNow(LinkRelevance link) {
+       DomainNode domain = domains.get(link.getTopLevelDomainName());
+       if (domain == null) {
+           return true;
+       } else {
+           return domain.size() < MIN_LINKS_PER_DOMAIN_TO_ALLOW_LOAD;
+       }
+   }

}
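The new canInsertNow check caps how many links a single top-level domain can occupy when the frontier is loaded into the in-memory scheduler. A minimal sketch of the idea follows, using a plain map in place of ACHE's DomainNode bookkeeping; the class and method names (DomainLoadCap, onInserted) are illustrative, not part of the project:

import java.util.HashMap;
import java.util.Map;

class DomainLoadCap {
    // Mirrors MIN_LINKS_PER_DOMAIN_TO_ALLOW_LOAD above: once a domain already
    // holds this many queued links, further loads from it are rejected.
    private static final int MAX_QUEUED_PER_DOMAIN = 2000;
    private final Map<String, Integer> queuedPerDomain = new HashMap<>();

    boolean canInsertNow(String topLevelDomain) {
        return queuedPerDomain.getOrDefault(topLevelDomain, 0) < MAX_QUEUED_PER_DOMAIN;
    }

    void onInserted(String topLevelDomain) {
        queuedPerDomain.merge(topLevelDomain, 1, Integer::sum);
    }
}

The cap keeps a handful of link-rich domains from crowding every other site out of the scheduler queue between politeness intervals.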
src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java (43 changes: 23 additions & 20 deletions)
@@ -82,32 +82,35 @@ private synchronized void loadQueue(int numberOfLinks) {
        this.startSelection(numberOfLinks);

        while (it.hasNext()) {
-           LinkRelevance link = it.next().getValue();
-
-           if (frontier.isDisallowedByRobots(link)) {
-               continue;
-           }
-
-           // Links already downloaded or not relevant
-           if (link.getRelevance() <= 0) {
-               if (recrawlSelector != null) {
-                   recrawlSelector.evaluateLink(link);
-               }
-               continue;
-           }
-
-           uncrawledLinks++;
-
-           // check whether link can be download now according to politeness constraints
-           if (scheduler.canDownloadNow(link)) {
-               // consider link to be downloaded
-               linkSelector.evaluateLink(link);
-               linksAvailable++;
-           } else {
-               rejectedLinks++;
-           }
+           try {
+               LinkRelevance link = it.next().getValue();
+
+               if (frontier.isDisallowedByRobots(link)) {
+                   continue;
+               }
+
+               // Links already downloaded or not relevant
+               if (link.getRelevance() <= 0) {
+                   if (recrawlSelector != null) {
+                       recrawlSelector.evaluateLink(link);
+                   }
+                   continue;
+               }
+
+               uncrawledLinks++;
+
+               // check whether link can be download now according to politeness constraints
+               if (scheduler.canInsertNow(link)) {
+                   // consider link to be downloaded
+                   linkSelector.evaluateLink(link);
+                   linksAvailable++;
+               } else {
+                   rejectedLinks++;
+               }
+           } catch (Exception e) {
+               // just log the exception and continue the load even when some link fails
+               logger.error("Failed to load link in frontier.", e);
+           }
        }

        this.addSelectedLinksToScheduler(recrawlSelector);
src/main/java/focusedCrawler/link/frontier/Frontier.java (7 changes: 5 additions & 2 deletions)
@@ -136,10 +136,13 @@ public void update(LinkRelevance linkRelevance) {
     * @throws FrontierPersistentException
     */
    public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
+       if (linkRelev == null) {
+           return false;
+       }
        boolean inserted = false;
        String url = linkRelev.getURL().toString();
-       Integer rel = exist(linkRelev);
-       if (rel == null && url.toString().length() < 210) {
+       Integer relevance = exist(linkRelev);
+       if (relevance == null) {
            urlRelevance.put(url, linkRelev);
            inserted = true;
        }
src/main/java/focusedCrawler/link/frontier/FrontierManager.java
@@ -132,6 +132,9 @@ public void insert(LinkRelevance[] linkRelevance) throws FrontierPersistentException {
    public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentException {
        Context timerContext = insertTimer.time();
        try {
+           if (linkRelevance == null) {
+               return false;
+           }
            boolean insert = isRelevant(linkRelevance);
            if (insert) {
                if (downloadRobots) {
@@ -140,8 +143,8 @@ public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentException {
                    if (!hostsManager.isKnown(hostName)) {
                        hostsManager.insert(hostName);
                        try {
-                           URL robotUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
-                           LinkRelevance sitemap = new LinkRelevance(robotUrl, 299, LinkRelevance.Type.ROBOTS);
+                           URL robotsUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
+                           LinkRelevance sitemap = LinkRelevance.createRobots(robotsUrl.toString(), 299);
                            frontier.insert(sitemap);
                        } catch (Exception e) {
                            logger.warn("Failed to insert robots.txt for host: " + hostName, e);
src/main/java/focusedCrawler/link/frontier/LinkRelevance.java (48 changes: 45 additions & 3 deletions)
@@ -28,6 +28,9 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Comparator;
+import java.util.regex.Pattern;
+
+import org.apache.commons.validator.routines.UrlValidator;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.core.JsonParser;
@@ -46,6 +49,11 @@ public class LinkRelevance implements Serializable {
    public static double DEFAULT_HUB_RELEVANCE = 100;
    public static double DEFAULT_AUTH_RELEVANCE = 200;

+   private static final UrlValidator validator = new UrlValidator(new String[] {"http","https"});
+   // .onion links aren't accepted by the validator
+   // Regex ".[^.]+" --> any string of at least 1 char without dot
+   private static final Pattern onionPattern = Pattern.compile("https?://.[^.]+\\.onion.*");
+
    public enum Type {
        FORWARD, ROBOTS, SITEMAP
    }
@@ -73,15 +81,15 @@ public LinkRelevance() {
        // required for JSON serialization
    }

-   public LinkRelevance(String string, double relevance) throws MalformedURLException {
-       this(new URL(string), relevance);
+   public LinkRelevance(String url, double relevance) throws MalformedURLException {
+       this(new URL(url), relevance);
    }

    public LinkRelevance(URL url, double relevance) {
        this(url, relevance, Type.FORWARD);
    }

-   public LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
+   private LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
        this(new URL(url), relevance, type);
    }

@@ -154,4 +162,38 @@ public URL deserialize(JsonParser parser, DeserializationContext ctxt) throws IOException {
        }
    }

+   public static LinkRelevance createForward(String url, double relevance) {
+       try {
+           if (isValid(url)) {
+               return new LinkRelevance(url, relevance, Type.FORWARD);
+           }
+       } catch (MalformedURLException e) {
+       }
+       return null;
+   }
+
+   public static LinkRelevance createSitemap(String url, double relevance) {
+       try {
+           if (isValid(url)) {
+               return new LinkRelevance(url, relevance, Type.SITEMAP);
+           }
+       } catch (MalformedURLException e) {
+       }
+       return null;
+   }
+
+   public static LinkRelevance createRobots(String url, double relevance) {
+       try {
+           if (isValid(url)) {
+               return new LinkRelevance(url, relevance, Type.ROBOTS);
+           }
+       } catch (MalformedURLException e) {
+       }
+       return null;
+   }
+
+   private static boolean isValid(String url) {
+       return validator.isValid(url) || onionPattern.matcher(url).matches();
+   }

}
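Combined with the null guards added to Frontier.insert and FrontierManager.insert above, these factories let call sites drop their URL-handling boilerplate: an invalid URL simply yields null, and inserting null is a no-op. An illustrative sketch (the URLs and the frontier variable are made-up examples, not from this commit):

LinkRelevance forward = LinkRelevance.createForward("http://example.com/page.html", 1.0d); // passes UrlValidator
LinkRelevance onion = LinkRelevance.createForward("http://exampleaddress.onion/", 1.0d);   // accepted via onionPattern
LinkRelevance broken = LinkRelevance.createForward("not a url", 1.0d);                     // fails validation, returns null
frontier.insert(broken); // now returns false instead of throwing an exception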