Replace uses of class BaseRobotRules with SimpleRobotRules
Replace uses of the abstract class BaseRobotRules with the concrete class
SimpleRobotRules to avoid deserialization problems when caching robots.txt
rules in the database.
aecio committed May 14, 2018
1 parent 54fbd19 commit 807ca23
Showing 4 changed files with 132 additions and 103 deletions.
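
Context for the change: the frontier caches robots.txt rules per host in a PersistentHashtable whose values are serialized into the backing database, and the Kryo imports added to the test below suggest Kryo is the serializer. Deserialization has to instantiate the declared value class, which cannot work when that class is the abstract BaseRobotRules; declaring the concrete SimpleRobotRules avoids the problem. The following is a minimal sketch, not part of the commit: the class name RobotRulesKryoSketch is made up, and it assumes the standard Kryo writeObject/readObject API rather than ACHE's PersistentHashtable internals.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesKryoSketch {
    public static void main(String[] args) {
        // Parse a trivial robots.txt, mirroring the new test added in this commit.
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        SimpleRobotRules rules = (SimpleRobotRules) parser.parseContent(
                "http://domain.com/robots.txt",
                "User-agent: *\r\nDisallow:".getBytes(StandardCharsets.UTF_8),
                "text/plain", "Any-darn-crawler");

        Kryo kryo = new Kryo();
        kryo.setRegistrationRequired(false); // registration is required by default in newer Kryo versions

        // Serialize the concrete SimpleRobotRules instance.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (Output output = new Output(bytes)) {
            kryo.writeObject(output, rules);
        }

        // Deserializing against the concrete class succeeds. Reading against the abstract
        // type, e.g. kryo.readObject(input, BaseRobotRules.class), would fail because an
        // abstract class cannot be instantiated, which is the problem the commit message describes.
        try (Input input = new Input(new ByteArrayInputStream(bytes.toByteArray()))) {
            SimpleRobotRules restored = kryo.readObject(input, SimpleRobotRules.class);
            System.out.println(restored.isAllowed("http://domain.com/anypage.html"));
        }
    }
}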
17 changes: 8 additions & 9 deletions src/main/java/focusedCrawler/crawler/async/RobotsTxtHandler.java
@@ -6,8 +6,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import crawlercommons.robots.BaseRobotRules;
-import crawlercommons.robots.BaseRobotsParser;
+import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
 import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
@@ -19,18 +18,18 @@ public class RobotsTxtHandler implements HttpDownloader.Callback {
     @SuppressWarnings("serial")
     public static class RobotsData implements Serializable {
 
-        public BaseRobotRules robotRules;
+        public SimpleRobotRules robotRules;
         public LinkRelevance link;
 
-        public RobotsData(LinkRelevance link, BaseRobotRules robotRules) {
+        public RobotsData(LinkRelevance link, SimpleRobotRules robotRules) {
            this.link = link;
            this.robotRules = robotRules;
        }
    }
 
    private static final Logger logger = LoggerFactory.getLogger(RobotsTxtHandler.class);
 
-    private BaseRobotsParser parser = new SimpleRobotRulesParser();
+    private SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    private LinkStorage linkStorage;
    private String userAgentName;
 
@@ -65,17 +64,17 @@ public void failed(LinkRelevance link, Exception e) {
 
    private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {
 
-        BaseRobotRules robotRules;
+        SimpleRobotRules robotRules;
        if(fetchFailed || response == null) {
-            robotRules = parser.failedFetch(HttpStatus.SC_GONE);
+            robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
        }
        else {
            String contentType = response.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((response.getNumRedirects() > 0) && !isPlainText) {
-                robotRules = parser.failedFetch(HttpStatus.SC_GONE);
+                robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
            } else {
-                robotRules = parser.parseContent(
+                robotRules = (SimpleRobotRules) parser.parseContent(
                    response.getFetchedUrl(),
                    response.getContent(),
                    response.getContentType(),
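
A note on the casts above: SimpleRobotRulesParser is a BaseRobotsParser, and its failedFetch and parseContent methods are declared to return the abstract BaseRobotRules even though the parser builds SimpleRobotRules instances, so the downcast recovers the concrete type before it is stored in the serializable RobotsData holder. A hypothetical condensation of that pattern, not part of the commit (the class and method names are made up):

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

class RobotsParsingSketch {
    // parseContent is declared to return BaseRobotRules; the cast recovers the
    // concrete SimpleRobotRules that SimpleRobotRulesParser actually produces.
    static SimpleRobotRules parse(SimpleRobotRulesParser parser, String url, byte[] content,
            String contentType, String userAgentName) {
        return (SimpleRobotRules) parser.parseContent(url, content, contentType, userAgentName);
    }
}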
4 changes: 2 additions & 2 deletions src/main/java/focusedCrawler/link/LinkStorage.java
@@ -9,7 +9,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
 import focusedCrawler.crawler.async.RobotsTxtHandler;
 import focusedCrawler.crawler.async.SitemapXmlHandler;
 import focusedCrawler.link.classifier.LinkClassifierFactory;
@@ -220,7 +220,7 @@ private static OnlineLearning createOnlineLearning(String dataPath, LinkStorageC
     * @throws NullPointerException
     *             when either of the argument is null
     */
-    public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
+    public void insertRobotRules(LinkRelevance link, SimpleRobotRules robotRules) {
        if (link == null || robotRules == null) {
            throw new NullPointerException("Link argument or robot rules argument cannot be null");
        }
184 changes: 92 additions & 92 deletions src/main/java/focusedCrawler/link/frontier/Frontier.java
@@ -1,38 +1,38 @@
-package focusedCrawler.link.frontier;
-
-import java.net.URLDecoder;
-import java.util.HashSet;
-import java.util.List;
-
-import crawlercommons.robots.BaseRobotRules;
-import focusedCrawler.util.persistence.PersistentHashtable;
-import focusedCrawler.util.persistence.PersistentHashtable.DB;
-import focusedCrawler.util.persistence.Tuple;
+package focusedCrawler.link.frontier;
+
+import java.net.URLDecoder;
+import java.util.HashSet;
+import java.util.List;
+
+import crawlercommons.robots.SimpleRobotRules;
+import focusedCrawler.util.persistence.PersistentHashtable;
+import focusedCrawler.util.persistence.PersistentHashtable.DB;
+import focusedCrawler.util.persistence.Tuple;
 import focusedCrawler.util.persistence.TupleIterator;
 
 
 public class Frontier {
 
-    protected PersistentHashtable<LinkRelevance> urlRelevance;
-
-    private final PersistentHashtable<BaseRobotRules> robotRulesMap;
-
-    public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend) {
-        this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize,
-                LinkRelevance.class, persistentHashtableBackend);
-        this.robotRulesMap = new PersistentHashtable<>(directory + "_robots", maxCacheUrlsSize,
-                BaseRobotRules.class, persistentHashtableBackend);
-    }
-
-    public void commit() {
-        urlRelevance.commit();
-        robotRulesMap.commit();
-    }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+    protected PersistentHashtable<LinkRelevance> urlRelevance;
+
+    private final PersistentHashtable<SimpleRobotRules> robotRulesMap;
+
+    public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend) {
+        this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize,
+                LinkRelevance.class, persistentHashtableBackend);
+        this.robotRulesMap = new PersistentHashtable<>(directory + "_robots", maxCacheUrlsSize,
+                SimpleRobotRules.class, persistentHashtableBackend);
+    }
+
+    public void commit() {
+        urlRelevance.commit();
+        robotRulesMap.commit();
+    }
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> visitedAuths() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -45,19 +45,19 @@ public HashSet<String> visitedAuths() throws Exception {
         }
         return result;
     }
-
-    public void visitedLinks(Visitor<LinkRelevance> visitor) throws Exception {
-        urlRelevance.visitTuples((Tuple<LinkRelevance> tuple) -> {
-            if (tuple.getValue().getRelevance() < 0) {
-                visitor.visit(tuple.getValue());
-            }
-        });
-    }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    public void visitedLinks(Visitor<LinkRelevance> visitor) throws Exception {
+        urlRelevance.visitTuples((Tuple<LinkRelevance> tuple) -> {
+            if (tuple.getValue().getRelevance() < 0) {
+                visitor.visit(tuple.getValue());
+            }
+        });
+    }
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> unvisitedAuths() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -70,11 +70,11 @@ public HashSet<String> unvisitedAuths() throws Exception {
         }
         return result;
     }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> visitedHubs() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -87,11 +87,11 @@ public HashSet<String> visitedHubs() throws Exception {
         }
         return result;
     }
-
-    /**
-     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
-     * iterator, and/or load just a sample of the data.
-     */
+
+    /**
+     * DEPRECATED: may cause OutOfMemoryError on large crawls. TODO: Provide an method that uses an
+     * iterator, and/or load just a sample of the data.
+     */
     @Deprecated
     public HashSet<String> unvisitedHubs() throws Exception {
         HashSet<String> result = new HashSet<String>();
@@ -114,7 +114,7 @@ public void update(LinkRelevance linkRelevance) {
             }
         }
     }
-
+
     /**
     * This method inserts a new link into the frontier
     *
@@ -123,9 +123,9 @@ public void update(LinkRelevance linkRelevance) {
     * @throws FrontierPersistentException
     */
    public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
-        if (linkRelev == null) {
-            return false;
-        }
+        if (linkRelev == null) {
+            return false;
+        }
        boolean inserted = false;
        String url = linkRelev.getURL().toString();
        Double relevance = exist(linkRelev);
@@ -135,7 +135,7 @@ public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentExceptio
        }
 
        return inserted;
-    }
+    }
 
    /**
     * It verifies whether a given URL was already visited or does not belong to
@@ -148,10 +148,10 @@ public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentExceptio
    public Double exist(LinkRelevance linkRelev) throws FrontierPersistentException {
        LinkRelevance link = urlRelevance.get(linkRelev.getURL().toString());
        return link == null ? null : link.getRelevance();
-    }
-
-    public LinkRelevance get(String url) throws FrontierPersistentException {
-        return urlRelevance.get(url);
+    }
+
+    public LinkRelevance get(String url) throws FrontierPersistentException {
+        return urlRelevance.get(url);
    }
 
    /**
@@ -166,40 +166,40 @@ public void delete(LinkRelevance linkRelevance) throws FrontierPersistentExcepti
        if (exist(linkRelevance) != null) {
            // we don't want to delete the URL file, it is useful to avoid visiting an old url
            double relevance = linkRelevance.getRelevance();
-            double negativeRelevance = relevance > 0 ? -1*relevance : relevance;
-            urlRelevance.put(url, new LinkRelevance(linkRelevance.getURL(), negativeRelevance, linkRelevance.getType()));
+            double negativeRelevance = relevance > 0 ? -1*relevance : relevance;
+            urlRelevance.put(url, new LinkRelevance(linkRelevance.getURL(), negativeRelevance, linkRelevance.getType()));
        }
    }
 
    public void close() {
-        urlRelevance.close();
+        urlRelevance.close();
        robotRulesMap.close();
    }
 
    public TupleIterator<LinkRelevance> iterator() {
        return urlRelevance.iterator();
    }
 
-    /**
-     * Inserts the robot rules object into the HashMap
-     *
-     * @param link
-     * @param robotRules
-     * @throws NullPointerException
-     *             when either of the argument is null
-     */
-    public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
-        if (link == null || robotRules == null) {
-            throw new NullPointerException("Link argument or robot rules argument cannot be null");
-        }
-        String hostname = link.getURL().getHost();
-        robotRulesMap.put(hostname, robotRules);
-    }
-
-    public boolean isDisallowedByRobots(LinkRelevance link) {
-        String hostname = link.getURL().getHost();
-        BaseRobotRules rules = robotRulesMap.get(hostname);
-        return rules != null && !rules.isAllowed(link.getURL().toString());
-    }
-
-}
+    /**
+     * Inserts the robot rules object into the HashMap
+     *
+     * @param link
+     * @param robotRules
+     * @throws NullPointerException
+     *             when either of the argument is null
+     */
+    public void insertRobotRules(LinkRelevance link, SimpleRobotRules robotRules) {
+        if (link == null || robotRules == null) {
+            throw new NullPointerException("Link argument or robot rules argument cannot be null");
+        }
+        String hostname = link.getURL().getHost();
+        robotRulesMap.put(hostname, robotRules);
+    }
+
+    public boolean isDisallowedByRobots(LinkRelevance link) {
+        String hostname = link.getURL().getHost();
+        SimpleRobotRules rules = robotRulesMap.get(hostname);
+        return rules != null && !rules.isAllowed(link.getURL().toString());
+    }
+
+}
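
For orientation, a hypothetical usage sketch of the two methods above, not part of the commit: the class and helper names are made up, and the imports assume the package layout implied by the diff (LinkRelevance and FrontierPersistentException live alongside Frontier). Rules are cached under the link's hostname and consulted before the URL is inserted into the frontier.

import crawlercommons.robots.SimpleRobotRules;
import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.link.frontier.FrontierPersistentException;
import focusedCrawler.link.frontier.LinkRelevance;

class RobotsCacheUsageSketch {
    static void scheduleIfAllowed(Frontier frontier, LinkRelevance link, SimpleRobotRules rules)
            throws FrontierPersistentException {
        frontier.insertRobotRules(link, rules);      // cached under link.getURL().getHost()
        if (!frontier.isDisallowedByRobots(link)) {
            frontier.insert(link);                   // insert() reports whether the link was added
        }
    }
}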
30 changes: 30 additions & 0 deletions src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java
@@ -1,16 +1,26 @@
 package focusedCrawler.integration;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static java.util.Arrays.asList;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.CoreMatchers.nullValue;
 import static org.hamcrest.Matchers.lessThan;
 import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.file.Paths;
 import java.util.List;
 
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRulesParser;
+import focusedCrawler.util.persistence.PersistentHashtable;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Rule;
@@ -164,6 +174,26 @@ public void test2ToNotToDownloadSitesDisallowedOnRobotsWithSitemapsFalse() throw
        assertWasNotCrawled("http://127.0.0.1:1234/disallowed-link-2.html", frontier);
    }
 
+    @Test
+    public void testKryoSerializationAndDeserialization() throws IOException {
+        final String simpleRobotsTxt = "User-agent: *" + "\r\n" + "Disallow:";
+
+        SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+        SimpleRobotRules rules = (SimpleRobotRules) robotParser.parseContent("http://domain.com",
+                simpleRobotsTxt.getBytes(UTF_8), "text/plain", "Any-darn-crawler");
+
+        String outputPath = tempFolder.newFolder().toString();
+
+        PersistentHashtable<SimpleRobotRules> robotRulesMap = new PersistentHashtable<>(outputPath, 0,
+                SimpleRobotRules.class);
+        robotRulesMap.put("robots", rules);
+        robotRulesMap.commit();
+        rules = robotRulesMap.get("robots");
+
+        assertNotNull(rules);
+        assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+    }
+
    private void assertWasCrawled(String url, Frontier frontier) throws Exception {
        LinkRelevance link = LinkRelevance.create("http://127.0.0.1:1234/" + url);
        assertThat("URL=" + url, frontier.exist(link), is(lessThan(0d)));
