Skip to content

Commit

Permalink
Merge 7b51f57 into 3d7ea2a
Browse files (browse the repository at this point in the history)
  • Loading branch information
aecio committed Mar 28, 2016
2 parents 3d7ea2a + 7b51f57 commit 84bab18
Show file tree
Hide file tree
Showing 23 changed files with 1,094 additions and 1,449 deletions.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies {
compile 'org.apache.commons:commons-compress:1.9'
compile 'org.apache.httpcomponents:httpclient:4.4.1'
compile 'org.apache.tika:tika-parsers:1.9'
compile 'com.github.crawler-commons:crawler-commons:0.6'

testCompile 'junit:junit:4.12'
testCompile 'org.hamcrest:hamcrest-all:1.3'
Expand Down
25 changes: 11 additions & 14 deletions config/sample_config/ache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ target_storage.english_language_detection_enabled: true

# Configurations for target storage's server
target_storage.server.host: localhost
target_storage.server.port : 1987
target_storage.server.port: 1987

#
# Configurations for Link Storage
Expand All @@ -80,7 +80,7 @@ link_storage.link_strategy.backlinks: false
# - LinkClassifierAuthority: link strategy for the bipartite crawling
link_storage.link_classifier.type: LinkClassifierBaseline
#link_storage.link_classifier.type: LinkClassifierImpl
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"] #CLASS_VALUES 0 1 2
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"]

# Retrain link classifiers on-the-fly
link_storage.online_learning.enabled: false
Expand Down Expand Up @@ -124,15 +124,12 @@ link_storage.backsurfer.pattern_end_title: "\",\"uu\":"
#
# Configurations for Crawler Manager
#
crawler_manager.robot_mananger.thread_group: crawler_group
crawler_manager.robot_mananger.resting_time: 10
crawler_manager.robot_mananger.check_time: 10000
crawler_manager.robot_mananger.max_time: 10000
crawler_manager.robot_mananger.robot_error_sleep_time: 5000
crawler_manager.robot_mananger.thread_factor: 10
crawler_manager.robot_mananger.quantity: 5

crawler_manager.downloader.max_blocked_threads: 200000

#crawler_manager.downloader.user_agent: "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0)"
#crawler_manager.downloader.request_accept: "Accept | */*"
crawler_manager.scheduler.host_min_access_interval: 5000
crawler_manager.scheduler.max_links: 10000
crawler_manager.downloader.download_thread_pool_size: 100
crawler_manager.downloader.max_retry_count: 2
crawler_manager.downloader.user_agent.name: ACHE
crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache
crawler_manager.downloader.valid_mime_types:
- text/html
- text/plain
22 changes: 10 additions & 12 deletions src/main/java/focusedCrawler/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
import org.slf4j.LoggerFactory;

import focusedCrawler.config.ConfigService;
import focusedCrawler.crawler.CrawlerManager;
import focusedCrawler.crawler.CrawlerManagerException;
import focusedCrawler.crawler.async.AsyncCrawler;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.classifier.LinkClassifierFactoryException;
import focusedCrawler.link.frontier.AddSeeds;
Expand Down Expand Up @@ -265,7 +264,8 @@ private static void startTargetStorage(CommandLine cmd) throws MissingArgumentEx
private static void startCrawlManager(final String configPath) {
try {
ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
CrawlerManager.run(config);
AsyncCrawler.run(config);

} catch (Throwable t) {
logger.error("Something bad happened to CrawlManager :(", t);
}
Expand All @@ -288,19 +288,17 @@ private static void startCrawl(CommandLine cmd) throws MissingArgumentException
dataOutputPath, modelPath, config.getLinkStorageConfig());

// start target storage
Storage targetStorage = TargetStorage.createTargetStorage(configPath,
modelPath, dataOutputPath, elasticIndexName,
Storage targetStorage = TargetStorage.createTargetStorage(
configPath, modelPath, dataOutputPath, elasticIndexName,
config.getTargetStorageConfig(), linkStorage);


AsyncCrawler.Config crawlerConfig = config.getCrawlerConfig();

// start crawl manager
CrawlerManager manager = CrawlerManager.createCrawlerManager(
config.getCrawlerManagerConfig(), linkStorage, targetStorage);
manager.start();
AsyncCrawler crawler = new AsyncCrawler(targetStorage, (LinkStorage) linkStorage, crawlerConfig);
crawler.run();

}
catch (CrawlerManagerException e) {
logger.error("Problem while creating CrawlerManager", e);
}
catch (LinkClassifierFactoryException | FrontierPersistentException e) {
logger.error("Problem while creating LinkStorage", e);
}
Expand Down
11 changes: 5 additions & 6 deletions src/main/java/focusedCrawler/config/ConfigService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;

import focusedCrawler.crawler.CrawlerManagerConfig;
import focusedCrawler.crawler.async.AsyncCrawler;
import focusedCrawler.link.LinkStorageConfig;

public class ConfigService {
Expand All @@ -21,8 +21,7 @@ public class ConfigService {

private TargetStorageConfig targetStorageConfig;
private LinkStorageConfig linkStorageConfig;
private CrawlerManagerConfig crawlerManagerConfig;

private AsyncCrawler.Config crawlerConfig;

public ConfigService(String configFilePath) {
this(Paths.get(configFilePath));
Expand All @@ -33,7 +32,7 @@ public ConfigService(Path configFilePath) {
JsonNode config = yamlMapper.readTree(configFilePath.toFile());
this.targetStorageConfig = new TargetStorageConfig(config, yamlMapper);
this.linkStorageConfig = new LinkStorageConfig(config, yamlMapper);
this.crawlerManagerConfig = new CrawlerManagerConfig(config, yamlMapper);
this.crawlerConfig = new AsyncCrawler.Config(config, yamlMapper);
} catch (IOException e) {
throw new IllegalArgumentException("Could not read settings from file: "+configFilePath, e);
}
Expand All @@ -47,8 +46,8 @@ public LinkStorageConfig getLinkStorageConfig() {
return linkStorageConfig;
}

public CrawlerManagerConfig getCrawlerManagerConfig() {
return crawlerManagerConfig;
public AsyncCrawler.Config getCrawlerConfig() {
return crawlerConfig;
}

}
36 changes: 0 additions & 36 deletions src/main/java/focusedCrawler/config/TargetStorageConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import com.fasterxml.jackson.databind.ObjectMapper;

import focusedCrawler.target.elasticsearch.ElasticSearchConfig;
import focusedCrawler.util.ParameterFile;
import focusedCrawler.util.storage.StorageConfig;

public class TargetStorageConfig {
Expand Down Expand Up @@ -63,41 +62,6 @@ public static class MonitorConfig {

private final StorageConfig serverConfig;

/**
 * Creates a target storage configuration from the parameter file named by
 * {@code filename}.
 *
 * <p>This is a convenience overload: it wraps {@code filename} in a new
 * {@code ParameterFile} and delegates to the {@code ParameterFile}-based
 * constructor, which performs all of the actual field initialization.
 *
 * @param filename name of the parameter file, passed unchanged to the
 *                 {@code ParameterFile} constructor
 * @deprecated marked {@code @Deprecated} in the source.
 *             NOTE(review): the {@code (JsonNode, ObjectMapper)} constructor of
 *             this class looks like the intended replacement - confirm.
 */
@Deprecated
public TargetStorageConfig(String filename) {
this(new ParameterFile(filename));
}

/**
 * Creates a target storage configuration by reading upper-case keys from a
 * {@code ParameterFile}.
 *
 * <p>Keys read through the plain accessors ({@code getParam},
 * {@code getParamBoolean}, {@code getParamInt}, {@code getParamFloat}) are
 * given no fallback here. Keys read through the {@code *OrDefault} accessors
 * pass a fallback as the second argument: {@code DATA_FORMAT} ("FILE"),
 * {@code HASH_FILE_NAME} (false), {@code COMPRESS_DATA} (false),
 * {@code ENGLISH_LANGUAGE_DETECTION_ENABLED} (true),
 * {@code ELASTICSEARCH_HOST} ("localhost"), {@code ELASTICSEARCH_PORT} (9300)
 * and {@code ELASTICSEARCH_CLUSTERNAME} ("elasticsearch").
 * NOTE(review): how the plain accessors behave for a missing key is not
 * visible from this constructor - confirm against {@code ParameterFile}.
 *
 * @param params parameter file supplying the settings; it is also handed to
 *               the {@code StorageConfig} constructor to build the server
 *               configuration
 * @deprecated marked {@code @Deprecated} in the source.
 *             NOTE(review): the {@code (JsonNode, ObjectMapper)} constructor of
 *             this class looks like the intended replacement - confirm.
 */
@Deprecated
public TargetStorageConfig(ParameterFile params) {
// Classifier usage, storage directories and on-disk data format.
this.useClassifier = params.getParamBoolean("USE_CLASSIFIER");
this.targetStorageDirectory = params.getParam("TARGET_STORAGE_DIRECTORY");
this.negativeStorageDirectory = params.getParam("NEGATIVE_STORAGE_DIRECTORY");
this.dataFormat = params.getParamOrDefault("DATA_FORMAT", "FILE");

// Monitor settings: a sync flag plus four refresh-frequency values.
this.monitor = new MonitorConfig();
this.monitor.sync = params.getParamBoolean("REFRESH_SYNC");
this.monitor.frequency = params.getParamInt("SYNC_REFRESH_FREQUENCY");
this.monitor.frequencyCrawled = params.getParamInt("CRAWLED_REFRESH_FREQUENCY");
this.monitor.frequencyRelevant = params.getParamInt("RELEVANT_REFRESH_FREQUENCY");
this.monitor.frequencyHarvestInfo = params.getParamInt("HARVESTINFO_REFRESH_FREQUENCY");

// Remaining storage and crawl-focus flags; the first two and the last one carry explicit fallbacks.
this.hashFileName = params.getParamBooleanOrDefault("HASH_FILE_NAME", false);
this.compressData = params.getParamBooleanOrDefault("COMPRESS_DATA", false);
this.relevanceThreshold = params.getParamFloat("RELEVANCE_THRESHOLD");
this.visitedPageLimit = params.getParamInt("VISITED_PAGE_LIMIT");
this.hardFocus = params.getParamBoolean("HARD_FOCUS");
this.bipartite = params.getParamBoolean("BIPARTITE");
this.saveNegativePages = params.getParamBoolean("SAVE_NEGATIVE_PAGES");
this.englishLanguageDetectionEnabled = params.getParamBooleanOrDefault("ENGLISH_LANGUAGE_DETECTION_ENABLED", true);

// Elasticsearch connection settings, each with a fallback, bundled into one config object.
String elasticSearchHost = params.getParamOrDefault("ELASTICSEARCH_HOST", "localhost");
int elasticSearchPort = params.getParamIntOrDefault("ELASTICSEARCH_PORT", 9300);
String clusterName = params.getParamOrDefault("ELASTICSEARCH_CLUSTERNAME", "elasticsearch");
this.elasticSearchConfig = new ElasticSearchConfig(elasticSearchHost, elasticSearchPort, clusterName);
// Server configuration is built from the same parameter file.
this.serverConfig = new StorageConfig(params);
}

public TargetStorageConfig(JsonNode config, ObjectMapper objectMapper) throws IOException {
objectMapper.readerForUpdating(this).readValue(config);
this.serverConfig = StorageConfig.create(config, "target_storage.server.");
Expand Down
Loading

0 comments on commit 84bab18

Please sign in to comment.