##################### ACHE Configuration Example #####################
#
# Configurations for Target Storage
#
# Set to false if you don't want to store pages classified as irrelevant
target_storage.store_negative_pages: false
# Configuration for data format used to store crawled data
# Enable one of the following lines to use one of the FILESYSTEM_* data formats
#target_storage.data_format.type: FILESYSTEM_HTML
#target_storage.data_format.type: FILESYSTEM_CBOR
#target_storage.data_format.type: FILESYSTEM_JSON
#
# Enable this to name files using a fixed-length hash instead of the percent-encoded URL
#target_storage.data_format.filesystem.hash_file_name: true
#
# Enable this to compress the file content
#target_storage.data_format.filesystem.compress_data: true
# This example uses the FILES data format
target_storage.data_format.type: FILES
target_storage.data_format.files.max_file_size: 134217728 # 128 MB (128 * 1024 * 1024 bytes)
# Enable the following lines to use the WARC data format and change its default settings
#target_storage.data_format.type: WARC # enable the WARC file format
#target_storage.data_format.warc.compress: true # enable GZIP compression
#target_storage.data_format.warc.max_file_size: 262144000 # maximum file size in bytes (250 MB)
# Enable the following line to index pages in Elasticsearch
#target_storage.data_format.type: ELASTICSEARCH
#
# (Transport client ES 1.5)
#target_storage.data_format.elasticsearch.host: localhost
#target_storage.data_format.elasticsearch.port: 9300
#target_storage.data_format.elasticsearch.cluster_name: elasticsearch
#
# (REST client ES 1.x and 5.x)
#target_storage.data_format.elasticsearch.rest.hosts:
# - http://localhost:9200
#target_storage.data_format.elasticsearch.rest.connect_timeout: 30000
#target_storage.data_format.elasticsearch.rest.socket_timeout: 30000
#target_storage.data_format.elasticsearch.rest.max_retry_timeout_millis: 90000
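#
# The rest.hosts key accepts a list, so multiple nodes of the same cluster can
# be listed (the addresses below are hypothetical):
#target_storage.data_format.elasticsearch.rest.hosts:
# - http://es-node1:9200
# - http://es-node2:9200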
# Instead of configuring a single data format, you can also configure multiple
# data formats in a list as follows. The settings for each data format should
# be configured independently, as if you were configuring a single data format.
# In the following config, data will be pushed to both FILES and ELASTICSEARCH.
#
#target_storage.data_formats:
# - FILES
# - ELASTICSEARCH
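#
# For example, to store data in both formats with their own settings
# (a sketch reusing keys shown above):
#target_storage.data_formats:
# - FILES
# - ELASTICSEARCH
#target_storage.data_format.files.max_file_size: 134217728
#target_storage.data_format.elasticsearch.rest.hosts:
# - http://localhost:9200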
# Performs hard focus or soft focus. When hard focus is enabled,
# the crawler only follows links from pages classified as relevant
target_storage.hard_focus: true
# Run bipartite crawler
target_storage.bipartite: false
# Maximum number of pages to visit
target_storage.visited_page_limit: 10000000
# Store only pages that contain English text, using a language detector
target_storage.english_language_detection_enabled: true
#
# Configurations for Link Storage
#
# Max number of pages to be crawled from each web domain
link_storage.max_pages_per_domain: 100
# Restricts the crawler to the websites provided as seeds
link_storage.link_strategy.use_scope: false
# Allows the crawler to follow forward links
link_storage.link_strategy.outlinks: true
# Fetches backlinks of pages from a search engine (used by bipartite crawling)
link_storage.link_strategy.backlinks: false
# Type of link classifier used by link storage
# - LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided
# - LinkClassifierImpl: link strategy using a link classifier
# - LinkClassifierAuthority: link strategy for the bipartite crawling
link_storage.link_classifier.type: LinkClassifierBaseline
#link_storage.link_classifier.type: LinkClassifierImpl
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"]
# Restricts the crawler to follow links within a given number of "hops" from the seeds
#link_storage.link_classifier.type: MaxDepthLinkClassifier
#link_storage.link_classifier.max_depth: 1
# Retrain link classifiers on-the-fly
link_storage.online_learning.enabled: false
# Types of online learning available (FORWARD_CLASSIFIER_BINARY, FORWARD_CLASSIFIER_LEVELS)
# - FORWARD_CLASSIFIER_BINARY: pos/neg link classifier
# - FORWARD_CLASSIFIER_LEVELS: contextual graph with 3 levels
#link_storage.online_learning.type: FORWARD_CLASSIFIER_BINARY
# Learning iteration criterion (online learning runs every n pages)
#link_storage.online_learning.learning_limit: 500
# Types of LinkSelectors available:
# - TopkLinkSelector
# - RandomLinkSelector
# - NonRandomLinkSelector
# - MultiLevelLinkSelector
# - MaximizeWebsitesLinkSelector
link_storage.link_selector: TopkLinkSelector
# Enable recrawling of sitemaps at fixed time intervals (in minutes)
#link_storage.recrawl_selector: SitemapsRecrawlSelector
#link_storage.recrawl_selector.sitemaps.interval: 1
link_storage.max_size_cache_urls: 10000
# Directory to store link storage's frontier database
link_storage.directory: "data_url/dir"
# Backlink surfer parameters
#link_storage.backsurfer.moz.access_id: mozscape-xxxxxxxxxx
#link_storage.backsurfer.moz.secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Minimum interval between accesses to the same host, in milliseconds
link_storage.scheduler.host_min_access_interval: 5000
# Upper bound on the number of links held by the scheduler
link_storage.scheduler.max_links: 10000
#
# Configurations for Crawler Manager
#
crawler_manager.downloader.user_agent.name: ACHE
crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache
#crawler_manager.downloader.user_agent.email: someone@example.com
#crawler_manager.downloader.user_agent.string: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
crawler_manager.downloader.download_thread_pool_size: 100
crawler_manager.downloader.max_retry_count: 2
crawler_manager.downloader.valid_mime_types:
- text/xml
- text/html
- text/plain
- application/x-asp
- application/xhtml+xml
- application/vnd.wap.xhtml+xml
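#
# Other MIME types can be appended to the list above if the crawl should keep
# additional content types; for example (a hypothetical addition):
# - application/xml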
# Use OkHttpFetcher instead of SimpleHttpFetcher
crawler_manager.downloader.use_okhttp3_fetcher: true
# OkHttp3 proxy configuration
crawler_manager.downloader.okhttp3.proxy_host: null
crawler_manager.downloader.okhttp3.proxy_username: null
crawler_manager.downloader.okhttp3.proxy_password: null
crawler_manager.downloader.okhttp3.proxy_port: 8080
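#
# For example, to route all requests through an authenticated HTTP proxy
# (hypothetical host and credentials):
#crawler_manager.downloader.okhttp3.proxy_host: proxy.example.com
#crawler_manager.downloader.okhttp3.proxy_port: 3128
#crawler_manager.downloader.okhttp3.proxy_username: crawler
#crawler_manager.downloader.okhttp3.proxy_password: secret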
# Discovery of new links using the sitemap.xml protocol
link_storage.download_sitemap_xml: false
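#
# With this file saved as ache.yml inside a configuration directory, a crawl
# can typically be started with the ACHE command line (paths are placeholders):
#   ache startCrawl -c <config-dir> -s <seed-file> -o <output-dir> -m <model-dir>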