Commit

Merge branch 'anudeepti2004-master' into issue46 (issue #46)
aecio committed Jul 4, 2017
2 parents d9eb69d + b501f1d commit cad5b0f
Showing 18 changed files with 265 additions and 45 deletions.
19 changes: 10 additions & 9 deletions src/main/java/focusedCrawler/crawler/async/RobotsTxtHandler.java
@@ -1,8 +1,6 @@
package focusedCrawler.crawler.async;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
@@ -19,13 +17,16 @@
import focusedCrawler.util.storage.StorageException;

public class RobotsTxtHandler implements HttpDownloader.Callback {

@SuppressWarnings("serial")
public static class RobotsData implements Serializable {
public List<String> sitemapUrls = new ArrayList<>();
public String content;
public RobotsData(List<String> sitemapsUrls) {
this.sitemapUrls = sitemapsUrls;

public BaseRobotRules robotRules;
public LinkRelevance link;

public RobotsData(LinkRelevance link, BaseRobotRules robotRules) {
this.link = link;
this.robotRules = robotRules;
}
}

@@ -86,10 +87,10 @@ private void processRobot(LinkRelevance link, FetchedResult response, boolean fe
}

try {
RobotsData robotsData = new RobotsData(robotRules.getSitemaps());
RobotsData robotsData = new RobotsData(link, robotRules);
linkStorage.insert(robotsData);
} catch (StorageException | CommunicationException e) {
logger.error("Failed to insert robot.txt data into link storage.", e);
logger.error("Failed to insert robots.txt data into link storage.", e);
}

}
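RobotsData now carries the link and the full crawler-commons BaseRobotRules object instead of only the sitemap URLs. Below is a minimal sketch of how such an object can be produced and wrapped, assuming the robots.txt content is parsed with crawler-commons' SimpleRobotRulesParser; the helper class, method name, and the "ACHE" user-agent string are illustrative assumptions, not code from this commit.

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import focusedCrawler.crawler.async.RobotsTxtHandler;
import focusedCrawler.link.frontier.LinkRelevance;

// Sketch only: class name, method name, and user-agent string are assumptions.
class RobotsDataSketch {

    static RobotsTxtHandler.RobotsData wrap(LinkRelevance link, String robotsTxtContent) {
        // Parse the raw robots.txt into a crawler-commons rules object.
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                link.getURL().toString(),                          // URL the robots.txt was fetched from
                robotsTxtContent.getBytes(StandardCharsets.UTF_8), // raw robots.txt bytes
                "text/plain",                                      // content type
                "ACHE");                                           // robot (user-agent) name

        // The new constructor keeps both the link and the parsed rules, so LinkStorage
        // can later register the rules and read the sitemap URLs from them.
        return new RobotsTxtHandler.RobotsData(link, rules);
    }
}
```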
36 changes: 31 additions & 5 deletions src/main/java/focusedCrawler/link/LinkStorage.java
@@ -10,6 +10,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.robots.BaseRobotRules;
import focusedCrawler.crawler.async.RobotsTxtHandler;
import focusedCrawler.crawler.async.SitemapXmlHandler;
import focusedCrawler.link.classifier.LinkClassifierFactory;
@@ -51,6 +52,9 @@ public class LinkStorage extends StorageDefault {
private final FrontierManager frontierManager;
private final OnlineLearning onlineLearning;

private final boolean insertSiteMaps;
private final boolean disallowSitesInRobotsTxt;

public LinkStorage(LinkStorageConfig config,
FrontierManager frontierManager) throws IOException {
this(config, frontierManager, null);
@@ -63,6 +67,8 @@ public LinkStorage(LinkStorageConfig config,
this.onlineLearning = onlineLearning;
this.getBacklinks = config.getBacklinks();
this.getOutlinks = config.getOutlinks();
this.disallowSitesInRobotsTxt = config.getDisallowSitesInRobotsFile();
this.insertSiteMaps = config.getDownloadSitemapXml();
}

public void close(){
@@ -93,11 +99,16 @@ else if(obj instanceof SitemapXmlHandler.SitemapData) {
}

public void insert(RobotsTxtHandler.RobotsData robotsData) {
for (String sitemap : robotsData.sitemapUrls) {
try {
frontierManager.insert(new LinkRelevance(sitemap, 299, LinkRelevance.Type.SITEMAP));
} catch (MalformedURLException | FrontierPersistentException e) {
logger.error("Failed to insert sitemap from robot: "+sitemap);
if (disallowSitesInRobotsTxt) {
this.insertRobotRules(robotsData.link, robotsData.robotRules);
}
if (insertSiteMaps) {
for (String sitemap : robotsData.robotRules.getSitemaps()) {
try {
frontierManager.insert(new LinkRelevance(sitemap, 299, LinkRelevance.Type.SITEMAP));
} catch (MalformedURLException | FrontierPersistentException e) {
logger.error("Failed to insert sitemap from robot: " + sitemap);
}
}
}
}
@@ -227,4 +238,19 @@ private static OnlineLearning createOnlineLearning(String dataPath, LinkStorageC
}
}

/**
* Registers the robot rules for the host of the given link with the frontier.
*
* @param link
* @param robotRules
* @throws NullPointerException
* when either argument is null
*/
public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
if (link == null || robotRules == null) {
throw new NullPointerException("Link argument or robot rules argument cannot be null");
}
frontierManager.getFrontier().insertRobotRules(link, robotRules);
}

}
12 changes: 12 additions & 0 deletions src/main/java/focusedCrawler/link/LinkStorageConfig.java
@@ -77,6 +77,9 @@ public String getMozKey() {
@JsonProperty("link_storage.download_sitemap_xml")
private boolean downloadSitemapXml = false;

@JsonProperty("link_storage.disallow_sites_in_robots_file")
private boolean disallowSitesInRobotsFile = false;

@JsonProperty("link_storage.recrawl_selector")
private String recrawlSelector = null;

@@ -183,6 +186,15 @@ public boolean getDownloadSitemapXml() {
return downloadSitemapXml;
}

/**
* Returns true if URLs that are disallowed by a site's robots.txt file should be skipped.
*
* @return true if disallowed URLs should be skipped, false otherwise
*/
public boolean getDisallowSitesInRobotsFile() {
return disallowSitesInRobotsFile;
}

public int getSchedulerHostMinAccessInterval() {
return schedulerHostMinAccessInterval;
}
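The new flag lives next to the existing sitemap option in the link storage configuration. A possible configuration snippet, assuming the usual YAML configuration file (typically named ache.yml); only the two property keys, taken from the @JsonProperty annotations above, are defined by this diff.

```yaml
# Hedged example: the keys come from the @JsonProperty annotations in this commit;
# the file name ache.yml is an assumption about the deployment.
link_storage.download_sitemap_xml: true            # also enqueue sitemap URLs found in robots.txt
link_storage.disallow_sites_in_robots_file: true   # skip URLs disallowed by robots.txt
```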
@@ -85,6 +85,10 @@ private synchronized void loadQueue(int numberOfLinks) {

LinkRelevance link = it.next().getValue();

if (frontier.isDisallowedByRobots(link)) {
continue;
}

// Links already downloaded or not relevant
if (link.getRelevance() <= 0) {
if (recrawlSelector != null) {
37 changes: 32 additions & 5 deletions src/main/java/focusedCrawler/link/frontier/Frontier.java
@@ -4,8 +4,9 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import java.util.Map;

import crawlercommons.robots.BaseRobotRules;
import focusedCrawler.util.persistence.PersistentHashtable;
import focusedCrawler.util.persistence.PersistentHashtable.DB;
import focusedCrawler.util.persistence.Tuple;
@@ -16,10 +17,12 @@ public class Frontier {

protected PersistentHashtable<LinkRelevance> urlRelevance;
protected Map<String, Integer> scope = null;
private boolean useScope = false;
private boolean useScope = false;
private final PersistentHashtable<BaseRobotRules> robotRulesMap;

public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend, Map<String, Integer> scope) {
this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize, LinkRelevance.class, persistentHashtableBackend);
this.robotRulesMap = new PersistentHashtable<>(directory+"_robots", maxCacheUrlsSize, BaseRobotRules.class, persistentHashtableBackend);

if (scope == null) {
this.useScope = false;
@@ -35,7 +38,8 @@ }
}

public void commit() {
urlRelevance.commit();
urlRelevance.commit();
robotRulesMap.commit();
}

/**
@@ -188,11 +192,34 @@ public void delete(LinkRelevance linkRelevance) throws FrontierPersistentExcepti
}

public void close() {
urlRelevance.close();
urlRelevance.close();
robotRulesMap.close();
}

public TupleIterator<LinkRelevance> iterator() {
return urlRelevance.iterator();
}

/**
* Stores the robot rules for the host of the given link in the persistent robot rules map.
*
* @param link
* @param robotRules
* @throws NullPointerException
* when either argument is null
*/
public void insertRobotRules(LinkRelevance link, BaseRobotRules robotRules) {
if (link == null || robotRules == null) {
throw new NullPointerException("Link argument or robot rules argument cannot be null");
}
String hostname = link.getURL().getHost();
robotRulesMap.put(hostname, robotRules);
}

public boolean isDisallowedByRobots(LinkRelevance link) {
String hostname = link.getURL().getHost();
BaseRobotRules rules = robotRulesMap.get(hostname);
return rules != null && !rules.isAllowed(link.getURL().toString());
}

}
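Because the rules are keyed by hostname, one stored robots.txt governs every later link on that host. A rough usage sketch of the new API, assuming an existing Frontier instance; the example URLs, the relevance value 1, and LinkRelevance.Type.FORWARD are assumptions, and the rules object is built with crawler-commons' SimpleRobotRulesParser as in the earlier sketch.

```java
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.link.frontier.LinkRelevance;

// Rough usage sketch (not part of this commit); URLs, relevance value,
// and LinkRelevance.Type.FORWARD are assumptions.
class RobotRulesUsageSketch {

    static void demo(Frontier frontier) throws MalformedURLException {
        String robotsTxt = "User-agent: *\nDisallow: /private/\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://www.example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain",
                "ACHE");

        // Rules are stored once, keyed by the link's host...
        frontier.insertRobotRules(
                new LinkRelevance("http://www.example.com/", 1, LinkRelevance.Type.FORWARD),
                rules);

        // ...and any later link on that host is checked against them.
        LinkRelevance blocked = new LinkRelevance(
                "http://www.example.com/private/page.html", 1, LinkRelevance.Type.FORWARD);
        boolean skip = frontier.isDisallowedByRobots(blocked); // true: /private/ is disallowed
    }
}
```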
28 changes: 23 additions & 5 deletions src/main/java/focusedCrawler/link/frontier/FrontierManager.java
@@ -56,6 +58,8 @@ public class FrontierManager {
private final int linksToLoad;
private final HostManager hostsManager;
private final boolean downloadRobots;
private final boolean insertSitemaps;
private final boolean disallowSitesInRobotsFile;
private final LogFile schedulerLog;
private final MetricsManager metricsManager;

@@ -70,7 +72,9 @@ public FrontierManager(Frontier frontier, String dataPath, String modelPath,
this.frontier = frontier;
this.linkFilter = linkFilter;
this.metricsManager = metricsManager;
this.downloadRobots = config.getDownloadSitemapXml();
this.insertSitemaps = config.getDownloadSitemapXml();
this.disallowSitesInRobotsFile = config.getDisallowSitesInRobotsFile();
this.downloadRobots = getDownloadRobots();
this.linksToLoad = config.getSchedulerMaxLinks();
this.maxPagesPerDomain = config.getMaxPagesPerDomain();
this.domainCounter = new HashMap<String, Integer>();
@@ -96,17 +100,21 @@ public void forceReload() {
scheduler.reload();
}

public boolean isRelevant(LinkRelevance elem) throws FrontierPersistentException {
if (elem.getRelevance() <= 0) {
public boolean isRelevant(LinkRelevance link) throws FrontierPersistentException {
if (link.getRelevance() <= 0) {
return false;
}

if(disallowSitesInRobotsFile && frontier.isDisallowedByRobots(link)) {
return false;
}

Integer value = frontier.exist(elem);
Integer value = frontier.exist(link);
if (value != null) {
return false;
}

String url = elem.getURL().toString();
String url = link.getURL().toString();
if (linkFilter.accept(url) == false) {
return false;
}
@@ -332,4 +340,14 @@ public BipartiteGraphRepository getGraphRepository() {
return this.graphRepository;
}

/**
* Returns true if robots.txt files need to be downloaded, i.e., when either sitemap
* downloading or enforcement of robots.txt disallow rules is enabled.
*
* @return true if robots.txt files should be downloaded, false otherwise
*/
private boolean getDownloadRobots() {
return insertSitemaps || disallowSitesInRobotsFile;
}

}
@@ -50,9 +50,9 @@ public void shouldParseLinksFromSitemapXml() throws Exception {

// then
assertThat(linkStorageMock.robotsData, is(notNullValue()));
assertThat(linkStorageMock.robotsData.sitemapUrls.size(), is(2));
assertThat(linkStorageMock.robotsData.sitemapUrls.get(0), is("http://www.example.com/example-sitemap/sitemap.xml"));
assertThat(linkStorageMock.robotsData.sitemapUrls.get(1), is("http://www.example.com/example-sitemap/sitemap-news.xml"));
assertThat(linkStorageMock.robotsData.robotRules.getSitemaps().size(), is(2));
assertThat(linkStorageMock.robotsData.robotRules.getSitemaps().get(0), is("http://www.example.com/example-sitemap/sitemap.xml"));
assertThat(linkStorageMock.robotsData.robotRules.getSitemaps().get(1), is("http://www.example.com/example-sitemap/sitemap-news.xml"));
}

}