Skip to content

Commit

Permalink
Validate robot/sitemap urls before insert into the frontier
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jul 11, 2017
1 parent 025462a commit 7ba1035
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 34 deletions.
14 changes: 6 additions & 8 deletions src/main/java/focusedCrawler/link/LinkStorage.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package focusedCrawler.link;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand Down Expand Up @@ -95,8 +93,8 @@ else if(obj instanceof SitemapXmlHandler.SitemapData) {
public void insert(RobotsTxtHandler.RobotsData robotsData) {
for (String sitemap : robotsData.sitemapUrls) {
try {
frontierManager.insert(new LinkRelevance(sitemap, 299, LinkRelevance.Type.SITEMAP));
} catch (MalformedURLException | FrontierPersistentException e) {
frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
} catch (Exception e) {
logger.error("Failed to insert sitemap from robot: "+sitemap);
}
}
Expand All @@ -105,17 +103,17 @@ public void insert(RobotsTxtHandler.RobotsData robotsData) {
public void insert(SitemapXmlHandler.SitemapData sitemapData) {
for (String link : sitemapData.links) {
try {
frontierManager.insert(new LinkRelevance(link, 1.0d, LinkRelevance.Type.FORWARD));
} catch (MalformedURLException | FrontierPersistentException e) {
frontierManager.insert(LinkRelevance.createForward(link, 1.0d));
} catch (Exception e) {
logger.error("Failed to insert link into the frontier: "+link);
}
}
logger.info("Added {} URLs from sitemap.", sitemapData.links.size());

for (String sitemap : sitemapData.sitemaps) {
try {
frontierManager.insert(new LinkRelevance(new URL(sitemap), 299, LinkRelevance.Type.SITEMAP));
} catch (MalformedURLException | FrontierPersistentException e) {
frontierManager.insert(LinkRelevance.createSitemap(sitemap, 299));
} catch (Exception e) {
logger.error("Failed to insert sitemap into the frontier: "+sitemap);
}
}
Expand Down
38 changes: 20 additions & 18 deletions src/main/java/focusedCrawler/link/frontier/CrawlScheduler.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,28 +82,30 @@ private synchronized void loadQueue(int numberOfLinks) {
this.startSelection(numberOfLinks);

while (it.hasNext()) {

LinkRelevance link = it.next().getValue();

// Links already downloaded or not relevant
if (link.getRelevance() <= 0) {
if (recrawlSelector != null) {
recrawlSelector.evaluateLink(link);
try {
LinkRelevance link = it.next().getValue();

// Links already downloaded or not relevant
if (link.getRelevance() <= 0) {
if (recrawlSelector != null) {
recrawlSelector.evaluateLink(link);
}
continue;
}
continue;
}

uncrawledLinks++;
uncrawledLinks++;

// check whether link can be downloaded now according to politeness constraints
if (scheduler.canDownloadNow(link)) {
// consider link to be downloaded
linkSelector.evaluateLink(link);
linksAvailable++;
} else {
rejectedLinks++;
// check whether link can be downloaded now according to politeness constraints
if (scheduler.canDownloadNow(link)) {
// consider link to be downloaded
linkSelector.evaluateLink(link);
linksAvailable++;
} else {
rejectedLinks++;
}
} catch (Exception e) {
// ignore exception and continue to load links even when some links fail
}

}

this.addSelectedLinksToScheduler(recrawlSelector);
Expand Down
7 changes: 5 additions & 2 deletions src/main/java/focusedCrawler/link/frontier/Frontier.java
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,13 @@ public void update(LinkRelevance linkRelevance) {
* @throws FrontierPersistentException
*/
public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
if (linkRelev == null) {
return false;
}
boolean inserted = false;
String url = linkRelev.getURL().toString();
Integer rel = exist(linkRelev);
if (rel == null && url.toString().length() < 210) {
Integer relevance = exist(linkRelev);
if (relevance == null) {
urlRelevance.put(url, linkRelev);
inserted = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ public void insert(LinkRelevance[] linkRelevance) throws FrontierPersistentExcep
public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentException {
Context timerContext = insertTimer.time();
try {
if (linkRelevance == null) {
return false;
}
boolean insert = isRelevant(linkRelevance);
if (insert) {
if (downloadRobots) {
Expand All @@ -132,8 +135,8 @@ public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentExce
if (!hostsManager.isKnown(hostName)) {
hostsManager.insert(hostName);
try {
URL robotUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
LinkRelevance sitemap = new LinkRelevance(robotUrl, 299, LinkRelevance.Type.ROBOTS);
URL robotsUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
LinkRelevance sitemap = LinkRelevance.createRobots(robotsUrl.toString(), 299);
frontier.insert(sitemap);
} catch (Exception e) {
logger.warn("Failed to insert robots.txt for host: " + hostName, e);
Expand Down
48 changes: 45 additions & 3 deletions src/main/java/focusedCrawler/link/frontier/LinkRelevance.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Comparator;
import java.util.regex.Pattern;

import org.apache.commons.validator.routines.UrlValidator;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.core.JsonParser;
Expand All @@ -46,6 +49,11 @@ public class LinkRelevance implements Serializable {
public static double DEFAULT_HUB_RELEVANCE = 100;
public static double DEFAULT_AUTH_RELEVANCE = 200;

private static final UrlValidator validator = new UrlValidator(new String[] {"http","https"});
// .onion links aren't accepted by the validator, so fall back to a manual pattern.
// Regex "[^.]+" --> at least 1 char without dot before ".onion"
// (the previous ".[^.]+" required 2+ chars and allowed a leading dot, so it
// rejected hosts like "a.onion" and accepted invalid ones like "..x.onion")
private static final Pattern onionPattern = Pattern.compile("https?://[^.]+\\.onion.*");

/**
 * The kind of resource a link points at: FORWARD is a regular page link,
 * ROBOTS points at a /robots.txt file, and SITEMAP points at a sitemap XML.
 */
public enum Type {
    FORWARD, ROBOTS, SITEMAP
}
Expand Down Expand Up @@ -73,15 +81,15 @@ public LinkRelevance() {
// required for JSON serialization
}

public LinkRelevance(String string, double relevance) throws MalformedURLException {
this(new URL(string), relevance);
/**
 * Creates a FORWARD link from a URL string.
 *
 * @throws MalformedURLException if {@code url} cannot be parsed as a URL
 */
public LinkRelevance(String url, double relevance) throws MalformedURLException {
    this(new URL(url), relevance);
}

/** Creates a link of the default type (FORWARD) with the given relevance. */
public LinkRelevance(URL url, double relevance) {
    this(url, relevance, Type.FORWARD);
}

public LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
// Private: external callers should use the create*() factories, which
// validate the URL first and return null instead of throwing.
private LinkRelevance(String url, double relevance, Type type) throws MalformedURLException {
    this(new URL(url), relevance, type);
}

Expand Down Expand Up @@ -154,4 +162,38 @@ public URL deserialize(JsonParser parser, DeserializationContext ctxt) throws IO
}
}

/**
 * Creates a FORWARD link (a regular page link) with the given relevance.
 *
 * @return the new link, or {@code null} when {@code url} is not a valid URL
 */
public static LinkRelevance createForward(String url, double relevance) {
    try {
        if (isValid(url)) {
            return new LinkRelevance(url, relevance, Type.FORWARD);
        }
    } catch (MalformedURLException ignored) {
        // isValid() already filters most bad URLs; anything that still fails
        // to parse gets the same treatment as an invalid URL: return null.
    }
    return null;
}

/**
 * Creates a SITEMAP link (a sitemap XML URL) with the given relevance.
 *
 * @return the new link, or {@code null} when {@code url} is not a valid URL
 */
public static LinkRelevance createSitemap(String url, double relevance) {
    try {
        if (isValid(url)) {
            return new LinkRelevance(url, relevance, Type.SITEMAP);
        }
    } catch (MalformedURLException ignored) {
        // isValid() already filters most bad URLs; anything that still fails
        // to parse gets the same treatment as an invalid URL: return null.
    }
    return null;
}

/**
 * Creates a ROBOTS link (a robots.txt URL) with the given relevance.
 *
 * @return the new link, or {@code null} when {@code url} is not a valid URL
 */
public static LinkRelevance createRobots(String url, double relevance) {
    try {
        if (isValid(url)) {
            return new LinkRelevance(url, relevance, Type.ROBOTS);
        }
    } catch (MalformedURLException ignored) {
        // isValid() already filters most bad URLs; anything that still fails
        // to parse gets the same treatment as an invalid URL: return null.
    }
    return null;
}

/**
 * A URL is considered valid when the standard validator accepts it, or when
 * it matches the .onion pattern (UrlValidator rejects .onion hosts).
 */
private static boolean isValid(String url) {
    if (validator.isValid(url)) {
        return true;
    }
    return onionPattern.matcher(url).matches();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ public void shouldDownloadLinksListedOnSitemapsXml() throws Exception {
);

List<String> shouldNOTBeDownloaded = asList(
"not-listed-on-sitemaps.html"
"not-listed-on-sitemaps.html",
"http://.invalid-url.com/sitemap.xml",
"http://.invalid-url.com/invalid-url.xml"
);

for (String url : shouldBeDownloaded) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
User-Agent: *
Disallow: /admin/
Sitemap: ./sitemap-index.xml
Sitemap: http://.invalid-url.com/sitemap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@
<url>
<loc>http://127.0.0.1:1234/page-listed-on-sitemap-1.html</loc>
</url>
<url>
<loc>http://.invalid-url.com/invalid-url.xml</loc>
</url>

</urlset>

0 comments on commit 7ba1035

Please sign in to comment.