Skip to content

Commit

Permalink
Merge ff3efe6 into 44fd61b
Browse files (browse the repository at this point in the history)
  • Loading branch information
aecio committed Apr 8, 2016
2 parents 44fd61b + ff3efe6 commit f4378c1
Show file tree
Hide file tree
Showing 25 changed files with 1,100 additions and 1,455 deletions.
3 changes: 2 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ dependencies {
compile 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.5.4'
compile 'com.syncthemall:boilerpipe:1.2.2'
compile 'com.sleepycat:je:3.3.75' // BerkeleyDB
compile 'net.sourceforge.htmlunit:htmlunit:2.8'
compile 'org.apache.commons:commons-lang3:3.4'
compile 'org.apache.commons:commons-compress:1.9'
compile 'org.apache.httpcomponents:httpclient:4.4.1'
compile 'org.apache.tika:tika-parsers:1.9'
compile 'com.github.crawler-commons:crawler-commons:0.6'

testCompile 'junit:junit:4.12'
testCompile 'org.hamcrest:hamcrest-all:1.3'
Expand Down
25 changes: 11 additions & 14 deletions config/sample_config/ache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ target_storage.english_language_detection_enabled: true

# Configurations for target storage's server
target_storage.server.host: localhost
target_storage.server.port : 1987
target_storage.server.port: 1987

#
# Configurations for Link Storage
Expand All @@ -80,7 +80,7 @@ link_storage.link_strategy.backlinks: false
# - LinkClassifierAuthority: link strategy for the bipartite crawling
link_storage.link_classifier.type: LinkClassifierBaseline
#link_storage.link_classifier.type: LinkClassifierImpl
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"] #CLASS_VALUES 0 1 2
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"]

# Retrain link classifiers on-the-fly
link_storage.online_learning.enabled: false
Expand Down Expand Up @@ -124,15 +124,12 @@ link_storage.backsurfer.pattern_end_title: "\",\"uu\":"
#
# Configurations for Crawler Manager
#
crawler_manager.robot_mananger.thread_group: crawler_group
crawler_manager.robot_mananger.resting_time: 10
crawler_manager.robot_mananger.check_time: 10000
crawler_manager.robot_mananger.max_time: 10000
crawler_manager.robot_mananger.robot_error_sleep_time: 5000
crawler_manager.robot_mananger.thread_factor: 10
crawler_manager.robot_mananger.quantity: 5

crawler_manager.downloader.max_blocked_threads: 200000

#crawler_manager.downloader.user_agent: "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0)"
#crawler_manager.downloader.request_accept: "Accept | */*"
crawler_manager.scheduler.host_min_access_interval: 5000
crawler_manager.scheduler.max_links: 10000
crawler_manager.downloader.download_thread_pool_size: 100
crawler_manager.downloader.max_retry_count: 2
crawler_manager.downloader.user_agent.name: ACHE
crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache
crawler_manager.downloader.valid_mime_types:
- text/html
- text/plain
22 changes: 10 additions & 12 deletions src/main/java/focusedCrawler/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
import org.slf4j.LoggerFactory;

import focusedCrawler.config.ConfigService;
import focusedCrawler.crawler.CrawlerManager;
import focusedCrawler.crawler.CrawlerManagerException;
import focusedCrawler.crawler.async.AsyncCrawler;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.classifier.LinkClassifierFactoryException;
import focusedCrawler.link.frontier.AddSeeds;
Expand Down Expand Up @@ -223,7 +222,8 @@ private static void startTargetStorage(CommandLine cmd) throws MissingArgumentEx
private static void startCrawlManager(final String configPath) {
try {
ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
CrawlerManager.run(config);
AsyncCrawler.run(config);

} catch (Throwable t) {
logger.error("Something bad happened to CrawlManager :(", t);
}
Expand All @@ -246,19 +246,17 @@ private static void startCrawl(CommandLine cmd) throws MissingArgumentException
dataOutputPath, modelPath, config.getLinkStorageConfig());

// start target storage
Storage targetStorage = TargetStorage.createTargetStorage(configPath,
modelPath, dataOutputPath, elasticIndexName,
Storage targetStorage = TargetStorage.createTargetStorage(
configPath, modelPath, dataOutputPath, elasticIndexName,
config.getTargetStorageConfig(), linkStorage);


AsyncCrawler.Config crawlerConfig = config.getCrawlerConfig();

// start crawl manager
CrawlerManager manager = CrawlerManager.createCrawlerManager(
config.getCrawlerManagerConfig(), linkStorage, targetStorage);
manager.start();
AsyncCrawler crawler = new AsyncCrawler(targetStorage, (LinkStorage) linkStorage, crawlerConfig);
crawler.run();

}
catch (CrawlerManagerException e) {
logger.error("Problem while creating CrawlerManager", e);
}
catch (LinkClassifierFactoryException | FrontierPersistentException e) {
logger.error("Problem while creating LinkStorage", e);
}
Expand Down
11 changes: 5 additions & 6 deletions src/main/java/focusedCrawler/config/ConfigService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;

import focusedCrawler.crawler.CrawlerManagerConfig;
import focusedCrawler.crawler.async.AsyncCrawler;
import focusedCrawler.link.LinkStorageConfig;
import focusedCrawler.target.TargetStorageConfig;

Expand All @@ -22,8 +22,7 @@ public class ConfigService {

private TargetStorageConfig targetStorageConfig;
private LinkStorageConfig linkStorageConfig;
private CrawlerManagerConfig crawlerManagerConfig;

private AsyncCrawler.Config crawlerConfig;

public ConfigService(String configFilePath) {
this(Paths.get(configFilePath));
Expand All @@ -34,7 +33,7 @@ public ConfigService(Path configFilePath) {
JsonNode config = yamlMapper.readTree(configFilePath.toFile());
this.targetStorageConfig = new TargetStorageConfig(config, yamlMapper);
this.linkStorageConfig = new LinkStorageConfig(config, yamlMapper);
this.crawlerManagerConfig = new CrawlerManagerConfig(config, yamlMapper);
this.crawlerConfig = new AsyncCrawler.Config(config, yamlMapper);
} catch (IOException e) {
throw new IllegalArgumentException("Could not read settings from file: "+configFilePath, e);
}
Expand All @@ -48,8 +47,8 @@ public LinkStorageConfig getLinkStorageConfig() {
return linkStorageConfig;
}

public CrawlerManagerConfig getCrawlerManagerConfig() {
return crawlerManagerConfig;
public AsyncCrawler.Config getCrawlerConfig() {
return crawlerConfig;
}

}
Loading

0 comments on commit f4378c1

Please sign in to comment.