From 7ba10357d8fd2b34b8eb6b9742f06ed6d566e1c0 Mon Sep 17 00:00:00 2001
From: Aecio Santos
Date: Tue, 11 Jul 2017 19:10:25 -0400
Subject: [PATCH] Validate robot/sitemap urls before insert into the frontier

---
 .../java/focusedCrawler/link/LinkStorage.java | 14 +++---
 .../link/frontier/CrawlScheduler.java         | 38 ++++++++-------
 .../link/frontier/Frontier.java               |  7 ++-
 .../link/frontier/FrontierManager.java        |  7 ++-
 .../link/frontier/LinkRelevance.java          | 48 +++++++++++++++++--
 .../integration/RobotsAndSitemapTest.java     |  4 +-
 .../robots_and_sitemap_test/static/robots.txt |  1 +
 .../static/sitemap1.xml                       |  3 ++
 8 files changed, 88 insertions(+), 34 deletions(-)

diff --git a/src/main/java/focusedCrawler/link/LinkStorage.java b/src/main/java/focusedCrawler/link/LinkStorage.java
index 67b358df8..94cd5255c 100644
--- a/src/main/java/focusedCrawler/link/LinkStorage.java
+++ b/src/main/java/focusedCrawler/link/LinkStorage.java
@@ -1,8 +1,6 @@
 package focusedCrawler.link;
 
 import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -95,8 +93,8 @@ else if(obj instanceof SitemapXmlHandler.SitemapData) {
     public void insert(RobotsTxtHandler.RobotsData robotsData) {
         for (String sitemap : robotsData.sitemapUrls) {
             try {
-                frontierManager.insert(new LinkRelevance(sitemap, 299, LinkRelevance.Type.SITEMAP));
-            } catch (MalformedURLException | FrontierPersistentException e) {
+                frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
+            } catch (Exception e) {
                 logger.error("Failed to insert sitemap from robot: "+sitemap);
             }
         }
@@ -105,8 +103,8 @@ public void insert(RobotsTxtHandler.RobotsData robotsData) {
     public void insert(SitemapXmlHandler.SitemapData sitemapData) {
         for (String link : sitemapData.links) {
             try {
-                frontierManager.insert(new LinkRelevance(link, 1.0d, LinkRelevance.Type.FORWARD));
-            } catch (MalformedURLException | FrontierPersistentException e) {
+                frontierManager.insert(LinkRelevance.createForward(link, 1.0d));
+            } catch (Exception e) {
                 logger.error("Failed to insert link into the frontier: "+link);
             }
         }
@@ -114,8 +112,8 @@ public void insert(SitemapXmlHandler.SitemapData sitemapData) {
 
         for (String sitemap : sitemapData.sitemaps) {
             try {
-                frontierManager.insert(new LinkRelevance(new URL(sitemap), 299, LinkRelevance.Type.SITEMAP));
-            } catch (MalformedURLException | FrontierPersistentException e) {
+                frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
+            } catch (Exception e) {
                 logger.error("Failed to insert sitemap into the frontier: "+sitemap);
             }
         }
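
Note (illustration, not part of the patch): the LinkStorage changes above rely on a null-returning contract. The new factory methods return null for URLs that fail validation, and, as the Frontier and FrontierManager hunks below show, inserting null is treated as "not inserted". A minimal standalone sketch of that contract, assuming only the factory signatures visible in this diff; the class name is made up and the relevance value 299 simply mirrors the diff.

    import focusedCrawler.link.frontier.LinkRelevance;

    // Illustration only: shows how callers can treat a null factory result as "rejected".
    public class SitemapInsertSketch {

        static void describe(String sitemapUrl) {
            // createSitemap() returns null when the URL fails validation, so callers
            // no longer need to handle MalformedURLException themselves.
            LinkRelevance link = LinkRelevance.createSitemap(sitemapUrl, 299);
            if (link == null) {
                System.out.println("Rejected by validation: " + sitemapUrl);
            } else {
                System.out.println("Would be inserted: " + link.getURL());
            }
        }

        public static void main(String[] args) {
            describe("http://example.com/sitemap.xml");      // well-formed, would be inserted
            describe("http://.invalid-url.com/sitemap.xml"); // invalid host, factory returns null
        }
    }
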
diff --git a/src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java b/src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java
index 4029d0794..f0627689a 100644
--- a/src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java
+++ b/src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java
@@ -82,28 +82,30 @@ private synchronized void loadQueue(int numberOfLinks) {
         this.startSelection(numberOfLinks);
 
         while (it.hasNext()) {
-
-            LinkRelevance link = it.next().getValue();
-
-            // Links already downloaded or not relevant
-            if (link.getRelevance() <= 0) {
-                if (recrawlSelector != null) {
-                    recrawlSelector.evaluateLink(link);
+            try {
+                LinkRelevance link = it.next().getValue();
+
+                // Links already downloaded or not relevant
+                if (link.getRelevance() <= 0) {
+                    if (recrawlSelector != null) {
+                        recrawlSelector.evaluateLink(link);
+                    }
+                    continue;
                 }
-                continue;
-            }
 
-            uncrawledLinks++;
+                uncrawledLinks++;
 
-            // check whether link can be download now according to politeness constraints
-            if (scheduler.canDownloadNow(link)) {
-                // consider link to be downloaded
-                linkSelector.evaluateLink(link);
-                linksAvailable++;
-            } else {
-                rejectedLinks++;
+                // check whether link can be download now according to politeness constraints
+                if (scheduler.canDownloadNow(link)) {
+                    // consider link to be downloaded
+                    linkSelector.evaluateLink(link);
+                    linksAvailable++;
+                } else {
+                    rejectedLinks++;
+                }
+            } catch (Exception e) {
+                // ignore exception and continue to load links even when some links fail
             }
-        }
+        }
 
         this.addSelectedLinksToScheduler(recrawlSelector);
diff --git a/src/main/java/focusedCrawler/link/frontier/Frontier.java b/src/main/java/focusedCrawler/link/frontier/Frontier.java
index 16255e324..48ea420cd 100644
--- a/src/main/java/focusedCrawler/link/frontier/Frontier.java
+++ b/src/main/java/focusedCrawler/link/frontier/Frontier.java
@@ -132,10 +132,13 @@ public void update(LinkRelevance linkRelevance) {
      * @throws FrontierPersistentException
      */
     public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
+        if (linkRelev == null) {
+            return false;
+        }
         boolean inserted = false;
         String url = linkRelev.getURL().toString();
-        Integer rel = exist(linkRelev);
-        if (rel == null && url.toString().length() < 210) {
+        Integer relevance = exist(linkRelev);
+        if (relevance == null) {
             urlRelevance.put(url, linkRelev);
             inserted = true;
         }
diff --git a/src/main/java/focusedCrawler/link/frontier/FrontierManager.java b/src/main/java/focusedCrawler/link/frontier/FrontierManager.java
index 8e7e88520..5adff697d 100644
--- a/src/main/java/focusedCrawler/link/frontier/FrontierManager.java
+++ b/src/main/java/focusedCrawler/link/frontier/FrontierManager.java
@@ -124,6 +124,9 @@ public void insert(LinkRelevance[] linkRelevance) throws FrontierPersistentExcep
     public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentException {
         Context timerContext = insertTimer.time();
         try {
+            if (linkRelevance == null) {
+                return false;
+            }
             boolean insert = isRelevant(linkRelevance);
             if (insert) {
                 if (downloadRobots) {
@@ -132,8 +135,8 @@ public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentExce
                     if (!hostsManager.isKnown(hostName)) {
                         hostsManager.insert(hostName);
                         try {
-                            URL robotUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
-                            LinkRelevance sitemap = new LinkRelevance(robotUrl, 299, LinkRelevance.Type.ROBOTS);
+                            URL robotsUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
+                            LinkRelevance sitemap = LinkRelevance.createRobots(robotsUrl.toString(), 299);
                             frontier.insert(sitemap);
                         } catch (Exception e) {
                             logger.warn("Failed to insert robots.txt for host: " + hostName, e);
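
Note (illustration, not part of the patch): the FrontierManager hunk composes the robots.txt location with java.net.URL's four-argument constructor before handing it to LinkRelevance.createRobots. A small standalone sketch of that composition, using a made-up host and port:

    import java.net.MalformedURLException;
    import java.net.URL;

    public class RobotsUrlSketch {
        public static void main(String[] args) throws MalformedURLException {
            // Build the robots.txt URL from an already-known page URL,
            // mirroring the FrontierManager change above (example URL is made up).
            URL pageUrl = new URL("http://www.example.com:8080/some/page.html");
            URL robotsUrl = new URL(pageUrl.getProtocol(), pageUrl.getHost(), pageUrl.getPort(), "/robots.txt");
            System.out.println(robotsUrl); // prints http://www.example.com:8080/robots.txt
        }
    }
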
diff --git a/src/main/java/focusedCrawler/link/frontier/LinkRelevance.java b/src/main/java/focusedCrawler/link/frontier/LinkRelevance.java
index ef18b3cab..cf9dd01e9 100644
--- a/src/main/java/focusedCrawler/link/frontier/LinkRelevance.java
+++ b/src/main/java/focusedCrawler/link/frontier/LinkRelevance.java
@@ -28,6 +28,9 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Comparator;
+import java.util.regex.Pattern;
+
+import org.apache.commons.validator.routines.UrlValidator;
 
 import com.fasterxml.jackson.annotation.JsonIgnore;
 import com.fasterxml.jackson.core.JsonParser;
@@ -46,6 +49,11 @@ public class LinkRelevance implements Serializable {
     public static double DEFAULT_HUB_RELEVANCE = 100;
     public static double DEFAULT_AUTH_RELEVANCE = 200;
 
+    private static final UrlValidator validator = new UrlValidator(new String[] {"http","https"});
+    // .onion links aren't accepted by the validator
+    // Regex ".[^.]+" --> any string of at least 1 char without dot
+    private static final Pattern onionPattern = Pattern.compile("https?://.[^.]+\\.onion.*");
+
     public enum Type {
         FORWARD, ROBOTS, SITEMAP
     }
@@ -73,15 +81,15 @@ public LinkRelevance() { // required for JSON serialization
     }
 
-    public LinkRelevance(String string, double relevance) throws MalformedURLException {
-        this(new URL(string), relevance);
+    public LinkRelevance(String url, double relevance) throws MalformedURLException {
+        this(new URL(url), relevance);
     }
 
     public LinkRelevance(URL url, double relevance) {
         this(url, relevance, Type.FORWARD);
     }
 
-    public LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
+    private LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
         this(new URL(url), relevance, type);
     }
 
@@ -154,4 +162,38 @@ public URL deserialize(JsonParser parser, DeserializationContext ctxt) throws IO
         }
     }
 
+    public static LinkRelevance createForward(String url, double relevance) {
+        try {
+            if (isValid(url)) {
+                return new LinkRelevance(url, relevance, Type.FORWARD);
+            }
+        } catch (MalformedURLException e) {
+        }
+        return null;
+    }
+
+    public static LinkRelevance createSitemap(String url, double relevance) {
+        try {
+            if (isValid(url)) {
+                return new LinkRelevance(url, relevance, Type.SITEMAP);
+            }
+        } catch (MalformedURLException e) {
+        }
+        return null;
+    }
+
+    public static LinkRelevance createRobots(String url, double relevance) {
+        try {
+            if (isValid(url)) {
+                return new LinkRelevance(url, relevance, Type.ROBOTS);
+            }
+        } catch (MalformedURLException e) {
+        }
+        return null;
+    }
+
+    private static boolean isValid(String url) {
+        return validator.isValid(url) || onionPattern.matcher(url).matches();
+    }
+
 }
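
Note (illustration, not part of the patch): the validation added to LinkRelevance combines Apache Commons Validator with a regex fallback for .onion hosts, which UrlValidator rejects. The sketch below mirrors the isValid() logic from the hunk above; the example URLs are illustrative.

    import java.util.regex.Pattern;
    import org.apache.commons.validator.routines.UrlValidator;

    public class UrlCheckSketch {
        private static final UrlValidator validator = new UrlValidator(new String[] {"http", "https"});
        // .onion hosts fail UrlValidator, so they are accepted by a separate pattern, as in the patch.
        private static final Pattern onionPattern = Pattern.compile("https?://.[^.]+\\.onion.*");

        static boolean isValid(String url) {
            return validator.isValid(url) || onionPattern.matcher(url).matches();
        }

        public static void main(String[] args) {
            System.out.println(isValid("http://example.com/sitemap.xml"));      // true
            System.out.println(isValid("http://.invalid-url.com/sitemap.xml")); // false: leading dot in host
            System.out.println(isValid("http://exampleonionaddr.onion/"));      // true: matched by the onion pattern
        }
    }
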
diff --git a/src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java b/src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java
index 73c747262..c327dbd98 100644
--- a/src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java
+++ b/src/test/java/focusedCrawler/integration/RobotsAndSitemapTest.java
@@ -71,7 +71,9 @@ public void shouldDownloadLinksListedOnSitemapsXml() throws Exception {
         );
 
         List<String> shouldNOTBeDownloaded = asList(
-            "not-listed-on-sitemaps.html"
+            "not-listed-on-sitemaps.html",
+            "http://.invalid-url.com/sitemap.xml",
+            "http://.invalid-url.com/invalid-url.xml"
         );
 
         for (String url : shouldBeDownloaded) {
diff --git a/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/robots.txt b/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/robots.txt
index ab1fb2fdb..233e939ba 100644
--- a/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/robots.txt
+++ b/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/robots.txt
@@ -1,3 +1,4 @@
 User-Agent: *
 Disallow: /admin/
 Sitemap: ./sitemap-index.xml
+Sitemap: http://.invalid-url.com/sitemap.xml
diff --git a/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/sitemap1.xml b/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/sitemap1.xml
index 2f08d2a4c..e31c99e6e 100644
--- a/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/sitemap1.xml
+++ b/src/test/resources/focusedCrawler/integration/robots_and_sitemap_test/static/sitemap1.xml
@@ -4,5 +4,8 @@
    <url>
       <loc>http://127.0.0.1:1234/page-listed-on-sitemap-1.html</loc>
    </url>
+   <url>
+      <loc>http://.invalid-url.com/invalid-url.xml</loc>
+   </url>
 
 </urlset>
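
Note (illustration, not part of the patch): the fixtures above plant deliberately malformed URLs (host starting with a dot) in robots.txt and sitemap1.xml, and the integration test asserts they are never downloaded. A compact unit-level sketch of the same expectation, assuming JUnit 4 as used elsewhere in the test suite; the test class name is made up and the URLs come from the fixtures and the existing sitemap entry.

    import static org.junit.Assert.assertNotNull;
    import static org.junit.Assert.assertNull;

    import org.junit.Test;

    import focusedCrawler.link.frontier.LinkRelevance;

    public class LinkRelevanceValidationSketch {

        @Test
        public void shouldRejectInvalidSitemapAndRobotsUrls() {
            // URLs taken from the test fixtures above; a leading dot makes the host invalid.
            assertNull(LinkRelevance.createSitemap("http://.invalid-url.com/sitemap.xml", 299));
            assertNull(LinkRelevance.createForward("http://.invalid-url.com/invalid-url.xml", 1.0d));
        }

        @Test
        public void shouldAcceptWellFormedUrls() {
            // These mirror the URLs the integration test expects to be crawled.
            assertNotNull(LinkRelevance.createRobots("http://127.0.0.1:1234/robots.txt", 299));
            assertNotNull(LinkRelevance.createForward("http://127.0.0.1:1234/page-listed-on-sitemap-1.html", 1.0d));
        }
    }
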