Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#58: UnknownHostExceptions handled #91

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ config/escort
docs/_build/
src/main/resources/public/
config/config_docker/data-*
config/config_docker_tor/data-*
config/config_docker_tor/data-*
.idea/
buildAndRun.sh
classes/
*.iml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public AsyncCrawler(Storage targetStorage, Storage linkStorage,
this.linkStorage = linkStorage;
this.downloader = new HttpDownloader(crawlerConfig.getDownloaderConfig(), dataPath, metricsManager);

this.handlers.put(LinkRelevance.Type.FORWARD, new FetchedResultHandler(targetStorage));
this.handlers.put(LinkRelevance.Type.FORWARD, new FetchedResultHandler(linkStorage, targetStorage));
this.handlers.put(LinkRelevance.Type.SITEMAP, new SitemapXmlHandler(linkStorage));
this.handlers.put(LinkRelevance.Type.ROBOTS, new RobotsTxtHandler(linkStorage, crawlerConfig.getDownloaderConfig().getUserAgentName()));

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package focusedCrawler.crawler.async;

import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.crawler.crawlercommons.fetcher.IOFetchException;

import java.net.UnknownHostException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
Expand All @@ -16,8 +20,10 @@ public class FetchedResultHandler implements HttpDownloader.Callback {
private static final Logger logger = LoggerFactory.getLogger(FetchedResultHandler.class);

private Storage targetStorage;
private Storage linkStorage;

public FetchedResultHandler(Storage targetStorage) {
/**
 * Creates a handler for fetch results of FORWARD links.
 *
 * @param linkStorage   storage notified when a host is unreachable, so the
 *                      domain can be blacklisted (see failed())
 * @param targetStorage storage that receives successfully fetched pages
 */
public FetchedResultHandler(Storage linkStorage, Storage targetStorage) {
this.linkStorage = linkStorage;
this.targetStorage = targetStorage;
}

Expand All @@ -38,7 +44,12 @@ public void failed(LinkRelevance link, Exception e) {
if(e instanceof AbortedFetchException) {
AbortedFetchException afe = (AbortedFetchException) e;
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(), afe.getAbortReason());
} else {
}else if (e.getCause() instanceof UnknownHostException){
IOFetchException iofe = (IOFetchException) e;
((LinkStorage)linkStorage).addToBlackList(iofe.getUrl());
logger.info("UnknownHostException - Domain added to Blacklist. URL: "+iofe.getUrl());
}
else {
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(), e.getMessage());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,11 @@ private FetchedResult doRequest(HttpRequestBase request, String url, Payload pay
throw new IOFetchException(url, e);
}
} catch (IOException e) {
//save redirected urls in case of IOExceptions (to add to blacklist)
Integer redirects = (Integer) localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
if (redirects != null){
url = extractRedirectedUrl(url,localContext);
}
// Oleg guarantees that no abort is needed in the case of an
// IOException
needAbort = false;
Expand Down
84 changes: 57 additions & 27 deletions src/main/java/focusedCrawler/link/LinkStorage.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import java.nio.file.Path;
import java.nio.file.Paths;

import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -30,17 +33,17 @@
import focusedCrawler.util.storage.distribution.StorageBinder;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
/**
*
* <p>Description: This class receives links to be inserted
* in frontier, sends links to crawler and starts the link storage server.</p>
*
* <p>Copyright: Copyright (c) 2004</p>
*
* @author Luciano Barbosa
* @version 1.0
*/

/**
*
* <p>Description: This class receives links to be inserted
* in frontier, sends links to crawler and starts the link storage server.</p>
*
* <p>Copyright: Copyright (c) 2004</p>
*
* @author Luciano Barbosa
* @version 1.0
*/
public class LinkStorage extends StorageDefault {

public static final Logger logger = LoggerFactory.getLogger(LinkStorage.class);
Expand All @@ -50,12 +53,13 @@ public class LinkStorage extends StorageDefault {

private final FrontierManager frontierManager;
private final OnlineLearning onlineLearning;
private final Set<String> blackList = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

/**
 * Creates a LinkStorage without online learning (delegates with a null
 * OnlineLearning instance).
 *
 * @param config          link storage configuration
 * @param frontierManager manager of the crawl frontier backing this storage
 * @throws IOException if the underlying storage cannot be initialized
 */
public LinkStorage(LinkStorageConfig config,
FrontierManager frontierManager) throws IOException {
this(config, frontierManager, null);
}

public LinkStorage(LinkStorageConfig config,
FrontierManager frontierManager,
OnlineLearning onlineLearning) throws IOException {
Expand All @@ -66,23 +70,23 @@ public LinkStorage(LinkStorageConfig config,
}

/**
 * Shuts down the underlying FrontierManager, releasing its resources.
 * Intended to be called once during application shutdown.
 */
public void close(){

logger.info("Shutting down FrontierManager...");
this.frontierManager.close();
logger.info("done.");
}

/**
* This method inserts links from a given page into the frontier
*
*
* @param obj
* Object - page containing links
* @return Object
*/
public Object insert(Object obj) throws StorageException {
if(obj instanceof Page) {
return insert((Page) obj);
}
}
else if(obj instanceof RobotsTxtHandler.RobotsData) {
insert((RobotsTxtHandler.RobotsData) obj);
}
Expand All @@ -91,7 +95,7 @@ else if(obj instanceof SitemapXmlHandler.SitemapData) {
}
return null;
}

public void insert(RobotsTxtHandler.RobotsData robotsData) {
for (String sitemap : robotsData.sitemapUrls) {
try {
Expand All @@ -101,7 +105,7 @@ public void insert(RobotsTxtHandler.RobotsData robotsData) {
}
}
}

public void insert(SitemapXmlHandler.SitemapData sitemapData) {
for (String link : sitemapData.links) {
try {
Expand All @@ -111,7 +115,7 @@ public void insert(SitemapXmlHandler.SitemapData sitemapData) {
}
}
logger.info("Added {} URLs from sitemap.", sitemapData.links.size());

for (String sitemap : sitemapData.sitemaps) {
try {
frontierManager.insert(new LinkRelevance(new URL(sitemap), 299, LinkRelevance.Type.SITEMAP));
Expand All @@ -121,8 +125,8 @@ public void insert(SitemapXmlHandler.SitemapData sitemapData) {
}
logger.info("Added {} child sitemaps.", sitemapData.sitemaps.size());
}


public Object insert(Page page) throws StorageException {
try {
if (getBacklinks && page.isAuth()) {
Expand Down Expand Up @@ -151,16 +155,42 @@ public Object insert(Page page) throws StorageException {

/**
* This method sends a link to crawler
* @throws DataNotFoundException
* @throws DataNotFoundException
*/
/**
 * This method sends a link to crawler, skipping links whose top-level
 * domain is on the blacklist (dead hosts).
 *
 * @param obj ignored (kept for the Storage interface contract)
 * @return the next non-blacklisted LinkRelevance from the frontier
 * @throws DataNotFoundException when the frontier runs out of links
 * @throws StorageException on frontier persistence failures
 */
public synchronized Object select(Object obj) throws StorageException, DataNotFoundException {
    try {
        // Iterate instead of recursing: a long run of blacklisted links
        // would otherwise grow the call stack unboundedly.
        while (true) {
            LinkRelevance link = frontierManager.nextURL();
            if (!blackList.contains(link.getTopLevelDomainName())) {
                return link;
            }
            logger.info("Dead Domain ignored: {}", link.getTopLevelDomainName());
        }
    } catch (FrontierPersistentException e) {
        throw new StorageException(e.getMessage(), e);
    }
}

/**
 * Adds the top-level domain of the given URL to the blacklist so that
 * select() skips future links from that (dead) domain.
 *
 * @param url the URL whose top-level domain should be blacklisted
 */
public synchronized void addToBlackList(String url){
    try {
        blackList.add(new LinkRelevance(url, 0d).getTopLevelDomainName());
    } catch (MalformedURLException mue) {
        // Best-effort: an unparsable URL cannot be blacklisted. Keep the
        // exception as logging context instead of dropping it.
        logger.warn("Could not blacklist malformed URL: {}", url, mue);
    }
}

/**
 * Removes the top-level domain of the given URL from the blacklist,
 * allowing its links to be selected again.
 *
 * @param url the URL whose top-level domain should be un-blacklisted
 */
public synchronized void removeFromBlackList(String url){
    try {
        blackList.remove(new LinkRelevance(url, 0d).getTopLevelDomainName());
    } catch (MalformedURLException mue) {
        // Best-effort: mirror addToBlackList — log with full context and move on.
        logger.warn("Could not remove malformed URL from blacklist: {}", url, mue);
    }
}

/**
 * Returns a read-only view of the blacklisted top-level domains.
 *
 * Exposing the raw internal set would let callers mutate it and bypass the
 * synchronized add/remove methods; mutations must go through
 * addToBlackList/removeFromBlackList instead.
 *
 * @return an unmodifiable view of the blacklist
 */
public Set<String> getBlackList(){
    return Collections.unmodifiableSet(blackList);
}

public static void runServer(String configPath, String seedFilePath,
String dataOutputPath, String modelPath,
LinkStorageConfig config)
Expand All @@ -177,22 +207,22 @@ public static void runServer(String configPath, String seedFilePath,
logger.error("Problem while starting LinkStorage.", e);
}
}
public static Storage createLinkStorage(String configPath, String seedFile,

public static Storage createLinkStorage(String configPath, String seedFile,
String dataPath, String modelPath,
LinkStorageConfig config,
MetricsManager metricsManager)
throws FrontierPersistentException,
IOException {

Path stoplistPath = Paths.get(configPath, "/stoplist.txt");
StopList stoplist;
if(Files.exists(stoplistPath)) {
stoplist = new StopListFile(stoplistPath.toFile().getCanonicalPath());
} else {
stoplist = StopListFile.DEFAULT;
}

LinkClassifierFactory.setDefaultStoplist(stoplist);

FrontierManager frontierManager = FrontierManagerFactory.create(config, configPath, dataPath, modelPath, seedFile, metricsManager);
Expand All @@ -201,7 +231,7 @@ public static Storage createLinkStorage(String configPath, String seedFile,
if (config.isUseOnlineLearning()) {
onlineLearning = createOnlineLearning(dataPath, config, stoplist, frontierManager);
}

return new LinkStorage(config, frontierManager, onlineLearning);
}

Expand Down
105 changes: 105 additions & 0 deletions src/test/java/focusedCrawler/link/LinkStorageTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package focusedCrawler.link;

import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertThat;

import com.google.common.collect.ImmutableMap;
import focusedCrawler.config.ConfigService;
import focusedCrawler.crawler.async.HttpDownloader;
import focusedCrawler.crawler.async.HttpDownloaderConfig;
import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.link.frontier.FrontierManager;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.link.frontier.selector.LinkSelector;
import focusedCrawler.link.frontier.selector.RandomLinkSelector;
import focusedCrawler.util.DataNotFoundException;
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.persistence.PersistentHashtable.DB;
import focusedCrawler.util.storage.Storage;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Map;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

/**
 * Tests for {@link LinkStorage}, currently covering the dead-domain
 * blacklist behavior used when hosts fail with UnknownHostException.
 */
public class LinkStorageTest {

    @Rule
    // a new temp folder is created for each test case
    public TemporaryFolder tempFolder = new TemporaryFolder();
    private LinkFilter emptyLinkFilter = new LinkFilter(new ArrayList<String>());
    private MetricsManager metricsManager = new MetricsManager();
    private LinkStorageConfig config = new LinkStorageConfig();
    private LinkSelector linkSelector = new RandomLinkSelector();

    private Frontier frontier;
    private String dataPath;
    private String modelPath;
    private LinkStorage linkStorage;
    private Storage targetStorage;
    private FrontierManager frontierManager;
    private HttpDownloader downloader;

    private int minimumAccessTimeInterval = 0;
    private int schedulerMaxLinks = 2;
    private boolean downloadSitemapXml = false;

    @Before
    public void setUp() throws Exception {
        frontier = new Frontier(tempFolder.newFolder().toString(), 1000, DB.ROCKSDB);
        dataPath = tempFolder.newFolder().toString();
        modelPath = tempFolder.newFolder().toString();
        Map<?, ?> props = ImmutableMap.of(
            "link_storage.scheduler.max_links", schedulerMaxLinks,
            "link_storage.scheduler.host_min_access_interval", minimumAccessTimeInterval,
            "link_storage.download_sitemap_xml", downloadSitemapXml
        );
        config = new ConfigService(props).getLinkStorageConfig();
        frontierManager = new FrontierManager(frontier, dataPath, modelPath, config,
                linkSelector, null, emptyLinkFilter, metricsManager);
        linkStorage = new LinkStorage(config, frontierManager);
        targetStorage = null;
        downloader = new HttpDownloader(new HttpDownloaderConfig(), dataPath, metricsManager);
    }

    @After
    public void tearDown() throws IOException {
    }

    @Test
    public void testingBlackList() throws Exception {
        // given: a domain added to the blacklist
        String url = "http://www.deaddomain123213123.com/";
        linkStorage.addToBlackList(url);

        // A MalformedURLException here is a test-setup error, so let it
        // propagate and fail the test instead of being silently swallowed.
        LinkRelevance link = new LinkRelevance(new URL(url), 1d);

        assertThat(linkStorage.getBlackList(), is(notNullValue()));
        // contains() returns a boolean, so assert the value itself rather
        // than the always-true "is not null".
        assertThat(linkStorage.getBlackList().contains(link.getTopLevelDomainName()), is(true));

        // when: the only link in the frontier belongs to the blacklisted domain
        DataNotFoundException dnfe = null;
        try {
            frontierManager.insert(link);
            linkStorage.select(null);
        } catch (DataNotFoundException e) {
            // Catch only the expected exception; anything else should fail
            // the test with its real stack trace.
            dnfe = e;
        }

        // then: select() skipped the dead domain and reported the frontier
        // as exhausted
        assertThat(dnfe, is(notNullValue()));
        assertThat(dnfe.ranOutOfLinks(), is(true));
    }

}