Replace uses of class BaseRobotRules with SimpleRobotRules
Replace uses of the abstract class BaseRobotRules with the concrete class
SimpleRobotRules to avoid deserialization problems when caching robots.txt
rules in the database.
aecio committed May 14, 2018
1 parent 54fbd19 commit 807ca23
Showing 4 changed files with 132 additions and 103 deletions.
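
Context for the change: the frontier caches robots.txt rules per host in a PersistentHashtable whose values are serialized into the backing database, and the Kryo imports added to the test below suggest Kryo is the serializer. Deserialization has to instantiate the declared value class, which cannot work when that class is the abstract BaseRobotRules; declaring the concrete SimpleRobotRules avoids the problem. The following is a minimal sketch, not part of the commit: the class name RobotRulesKryoSketch is made up, and it assumes the standard Kryo writeObject/readObject API rather than ACHE's PersistentHashtable internals.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesKryoSketch {
    public static void main(String[] args) {
        // Parse a trivial robots.txt, mirroring the new test added in this commit.
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        SimpleRobotRules rules = (SimpleRobotRules) parser.parseContent(
                "http://domain.com/robots.txt",
                "User-agent: *\r\nDisallow:".getBytes(StandardCharsets.UTF_8),
                "text/plain", "Any-darn-crawler");

        Kryo kryo = new Kryo();
        kryo.setRegistrationRequired(false); // registration is required by default in newer Kryo versions

        // Serialize the concrete SimpleRobotRules instance.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (Output output = new Output(bytes)) {
            kryo.writeObject(output, rules);
        }

        // Deserializing against the concrete class succeeds. Reading against the abstract
        // type, e.g. kryo.readObject(input, BaseRobotRules.class), would fail because an
        // abstract class cannot be instantiated, which is the problem the commit message describes.
        try (Input input = new Input(new ByteArrayInputStream(bytes.toByteArray()))) {
            SimpleRobotRules restored = kryo.readObject(input, SimpleRobotRules.class);
            System.out.println(restored.isAllowed("http://domain.com/anypage.html"));
        }
    }
}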
17 changes: 8 additions & 9 deletions src/main/java/focusedCrawler/crawler/async/RobotsTxtHandler.java
@@ -6,8 +6,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import crawlercommons.robots.BaseRobotRules;
-import crawlercommons.robots.BaseRobotsParser;
+import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
 import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
@@ -19,18 +18,18 @@ public class RobotsTxtHandler implements HttpDownloader.Callback {
     @SuppressWarnings("serial")
     public static class RobotsData implements Serializable {
 
-        public BaseRobotRules robotRules;
+        public SimpleRobotRules robotRules;
         public LinkRelevance link;
 
-        public RobotsData(LinkRelevance link, BaseRobotRules robotRules) {
+        public RobotsData(LinkRelevance link, SimpleRobotRules robotRules) {
            this.link = link;
            this.robotRules = robotRules;
        }
    }
 
    private static final Logger logger = LoggerFactory.getLogger(RobotsTxtHandler.class);
 
-    private BaseRobotsParser parser = new SimpleRobotRulesParser();
+    private SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    private LinkStorage linkStorage;
    private String userAgentName;
 
@@ -65,17 +64,17 @@ public void failed(LinkRelevance link, Exception e) {
 
    private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {
 
-        BaseRobotRules robotRules;
+        SimpleRobotRules robotRules;
        if(fetchFailed || response == null) {
-            robotRules = parser.failedFetch(HttpStatus.SC_GONE);
+            robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
        }
        else {
            String contentType = response.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((response.getNumRedirects() > 0) && !isPlainText) {
-                robotRules = parser.failedFetch(HttpStatus.SC_GONE);
+                robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
            } else {
-                robotRules = parser.parseContent(
+                robotRules = (SimpleRobotRules) parser.parseContent(
                    response.getFetchedUrl(),
                    response.getContent(),
                    response.getContentType(),
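
A note on the casts above: SimpleRobotRulesParser is a BaseRobotsParser, and its failedFetch and parseContent methods are declared to return the abstract BaseRobotRules even though the parser builds SimpleRobotRules instances, so the downcast recovers the concrete type before it is stored in the serializable RobotsData holder. A hypothetical condensation of that pattern, not part of the commit (the class and method names are made up):

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

class RobotsParsingSketch {
    // parseContent is declared to return BaseRobotRules; the cast recovers the
    // concrete SimpleRobotRules that SimpleRobotRulesParser actually produces.
    static SimpleRobotRules parse(SimpleRobotRulesParser parser, String url, byte[] content,
            String contentType, String userAgentName) {
        return (SimpleRobotRules) parser.parseContent(url, content, contentType, userAgentName);
    }
}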
4 changes: 2 additions & 2 deletions src/main/java/focusedCrawler/link/LinkStorage.java
@@ -9,7 +9,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
 import focusedCrawler.crawler.async.RobotsTxtHandler;
 import focusedCrawler.crawler.async.SitemapXmlHandler;
 import focusedCrawler.link.classifier.LinkClassifierFactory;
@@ -220,7 +220,7 @@ private static OnlineLearning createOnlineLearning(String dataPath, LinkStorageC
     * @throws NullPointerException
     *             when either of the argument is null
     */
-    public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
+    public void insertRobotRules(LinkRelevance link, SimpleRobotRules robotRules) {
        if (link == null || robotRules == null) {
            throw new NullPointerException("Link argument or robot rules argument cannot be null");
        }
184 changes: 92 additions & 92 deletions src/main/java/focusedCrawler/link/frontier/Frontier.java
@@ -1,38 +1,38 @@
-package focusedCrawler.link.frontier;
-
-import java.net.URLDecoder;
-import java.util.HashSet;
-import java.util.List;
-
-import crawlercommons.robots.BaseRobotRules;
-import focusedCrawler.util.persistence.PersistentHashtable;
-import focusedCrawler.util.persistence.PersistentHashtable.DB;
-import focusedCrawler.util.persistence.Tuple;
+package focusedCrawler.link.frontier;
+
+import java.net.URLDecoder;
+import java.util.HashSet;
+import java.util.List;
+
+import crawlercommons.robots.SimpleRobotRules;
+import focusedCrawler.util.persistence.PersistentHashtable;
+import focusedCrawler.util.persistence.PersistentHashtable.DB;
+import focusedCrawler.util.persistence.Tuple;
 import focusedCrawler.util.persistence.TupleIterator;
 
 
 public class Frontier {
 
-    protected PersistentHashtable<LinkRelevance> urlRelevance;
-
-    private final PersistentHashtable<BaseRobotRules> robotRulesMap;
-
-    public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend) {
-        this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize,
-                LinkRelevance.class, persistentHashtableBackend);
-        this.robotRulesMap = new PersistentHashtable<>(directory + "_robots", maxCacheUrlsSize,
-                BaseRobotRules.class, persistentHashtableBackend);
-    }
-
-    public void commit() {
-        urlRelevance.commit();
-        robotRulesMap.commit();
-    }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+    protected PersistentHashtable<LinkRelevance> urlRelevance;
+
+    private final PersistentHashtable<SimpleRobotRules> robotRulesMap;
+
+    public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend) {
+        this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize,
+                LinkRelevance.class, persistentHashtableBackend);
+        this.robotRulesMap = new PersistentHashtable<>(directory + "_robots", maxCacheUrlsSize,
+                SimpleRobotRules.class, persistentHashtableBackend);
+    }
+
+    public void commit() {
+        urlRelevance.commit();
+        robotRulesMap.commit();
+    }
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> visitedAuths() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -45,19 +45,19 @@ public HashSet<String> visitedAuths() throws Exception {
         }
         return result;
     }
-
-    public void visitedLinks(Visitor<LinkRelevance> visitor) throws Exception {
-        urlRelevance.visitTuples((Tuple<LinkRelevance> tuple) -> {
-            if (tuple.getValue().getRelevance() < 0) {
-                visitor.visit(tuple.getValue());
-            }
-        });
-    }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    public void visitedLinks(Visitor<LinkRelevance> visitor) throws Exception {
+        urlRelevance.visitTuples((Tuple<LinkRelevance> tuple) -> {
+            if (tuple.getValue().getRelevance() < 0) {
+                visitor.visit(tuple.getValue());
+            }
+        });
+    }
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> unvisitedAuths() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -70,11 +70,11 @@ public HashSet<String> unvisitedAuths() throws Exception {
         }
         return result;
     }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> visitedHubs() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -87,11 +87,11 @@ public HashSet<String> visitedHubs() throws Exception {
         }
         return result;
     }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> unvisitedHubs() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -114,7 +114,7 @@ public void update(LinkRelevance linkRelevance) {
             }
         }
     }
-
+
     /**
     * This method inserts a new link into the frontier
     *
@@ -123,9 +123,9 @@ public void update(LinkRelevance linkRelevance) {
     * @throws FrontierPersistentException
     */
    public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
-        if (linkRelev == null) {
-            return false;
-        }
+        if (linkRelev == null) {
+            return false;
+        }
        boolean inserted = false;
        String url = linkRelev.getURL().toString();
        Double relevance = exist(linkRelev);
@@ -135,7 +135,7 @@ public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentExceptio
        }
 
        return inserted;
-    }
+    }
 
    /**
     * It verifies whether a given URL was already visited or does not belong to
@@ -148,10 +148,10 @@ public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentExceptio
    public Double exist(LinkRelevance linkRelev) throws FrontierPersistentException {
        LinkRelevance link = urlRelevance.get(linkRelev.getURL().toString());
        return link == null ? null : link.getRelevance();
-    }
-
-    public LinkRelevance get(String url) throws FrontierPersistentException {
-        return urlRelevance.get(url);
+    }
+
+    public LinkRelevance get(String url) throws FrontierPersistentException {
+        return urlRelevance.get(url);
    }
 
    /**
@@ -166,40 +166,40 @@ public void delete(LinkRelevance linkRelevance) throws FrontierPersistentExcepti
        if (exist(linkRelevance) != null) {
            // we don't want to delete the URL file, it is useful to avoid visiting an old url
            double relevance = linkRelevance.getRelevance();
-            double negativeRelevance = relevance > 0 ? -1*relevance : relevance;
-            urlRelevance.put(url, new LinkRelevance(linkRelevance.getURL(), negativeRelevance, linkRelevance.getType()));
+            double negativeRelevance = relevance > 0 ? -1*relevance : relevance;
+            urlRelevance.put(url, new LinkRelevance(linkRelevance.getURL(), negativeRelevance, linkRelevance.getType()));
        }
    }
 
    public void close() {
-        urlRelevance.close();
+        urlRelevance.close();
        robotRulesMap.close();
    }
 
    public TupleIterator<LinkRelevance> iterator() {
        return urlRelevance.iterator();
    }
 
-    /**
-     * Inserts the robot rules object into the HashMap
-     *
-     * @param link
-     * @param robotRules
-     * @throws NullPointerException
-     *             when either of the argument is null
-     */
-    public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
-        if (link == null || robotRules == null) {
-            throw new NullPointerException("Link argument or robot rules argument cannot be null");
-        }
-        String hostname = link.getURL().getHost();
-        robotRulesMap.put(hostname, robotRules);
-    }
-
-    public boolean isDisallowedByRobots(LinkRelevance link) {
-        String hostname = link.getURL().getHost();
-        BaseRobotRules rules = robotRulesMap.get(hostname);
-        return rules != null && !rules.isAllowed(link.getURL().toString());
-    }
-
-}
+    /**
+     * Inserts the robot rules object into the HashMap
+     *
+     * @param link
+     * @param robotRules
+     * @throws NullPointerException
+     *             when either of the argument is null
+     */
+    public void insertRobotRules(LinkRelevance link, SimpleRobotRules robotRules) {
+        if (link == null || robotRules == null) {
+            throw new NullPointerException("Link argument or robot rules argument cannot be null");
+        }
+        String hostname = link.getURL().getHost();
+        robotRulesMap.put(hostname, robotRules);
+    }
+
+    public boolean isDisallowedByRobots(LinkRelevance link) {
+        String hostname = link.getURL().getHost();
+        SimpleRobotRules rules = robotRulesMap.get(hostname);
+        return rules != null && !rules.isAllowed(link.getURL().toString());
+    }
+
+}
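
For orientation, a hypothetical usage sketch of the two methods above, not part of the commit: the class and helper names are made up, and the imports assume the package layout implied by the diff (LinkRelevance and FrontierPersistentException live alongside Frontier). Rules are cached under the link's hostname and consulted before the URL is inserted into the frontier.

import crawlercommons.robots.SimpleRobotRules;
import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.link.frontier.FrontierPersistentException;
import focusedCrawler.link.frontier.LinkRelevance;

class RobotsCacheUsageSketch {
    static void scheduleIfAllowed(Frontier frontier, LinkRelevance link, SimpleRobotRules rules)
            throws FrontierPersistentException {
        frontier.insertRobotRules(link, rules);      // cached under link.getURL().getHost()
        if (!frontier.isDisallowedByRobots(link)) {
            frontier.insert(link);                   // insert() reports whether the link was added
        }
    }
}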
30 changes: 30 additions & 0 deletions src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java
@@ -1,16 +1,26 @@
 package focusedCrawler.integration;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static java.util.Arrays.asList;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.CoreMatchers.nullValue;
 import static org.hamcrest.Matchers.lessThan;
 import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.file.Paths;
 import java.util.List;
 
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRulesParser;
+import focusedCrawler.util.persistence.PersistentHashtable;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Rule;
@@ -164,6 +174,26 @@ public void test2ToNotToDownloadSitesDisallowedOnRobotsWithSitemapsFalse() throw
        assertWasNotCrawled("http://127.0.0.1:1234/disallowed-link-2.html", frontier);
    }
 
+    @Test
+    public void testKryoSerializationAndDeserialization() throws IOException {
+        final String simpleRobotsTxt = "User-agent: *" + "\r\n" + "Disallow:";
+
+        SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+        SimpleRobotRules rules = (SimpleRobotRules) robotParser.parseContent("http://domain.com",
+                simpleRobotsTxt.getBytes(UTF_8), "text/plain", "Any-darn-crawler");
+
+        String outputPath = tempFolder.newFolder().toString();
+
+        PersistentHashtable<SimpleRobotRules> robotRulesMap = new PersistentHashtable<>(outputPath, 0,
+                SimpleRobotRules.class);
+        robotRulesMap.put("robots", rules);
+        robotRulesMap.commit();
+        rules = robotRulesMap.get("robots");
+
+        assertNotNull(rules);
+        assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+    }
+
    private void assertWasCrawled(String url, Frontier frontier) throws Exception {
        LinkRelevance link = LinkRelevance.create("http://127.0.0.1:1234/" + url);
        assertThat("URL=" + url, frontier.exist(link), is(lessThan(0d)));
