# Oryx configuration defaults, written in HOCON (Typesafe Config) syntax.
# Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
#
# Cloudera, Inc. licenses this file to you under the Apache License,
# Version 2.0 (the "License"). You may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the
# License.
oryx = {
# Optional string identifying the entire Oryx instance. Without it, Oryx has no
# longer-term identity with external systems like Kafka. For example, without this set,
# on startup the layers read from only the latest available input. With this set,
# Kafka can load the last offset read and let the layers resume reading where they left off.
# Default is null, i.e. no instance ID is set
id = null
# Configuration for the Kafka input topic
input-topic {
  # Kafka brokers as a comma-separated list: host1:port1(,host2:port2,...)
  broker = "localhost:9092"

  # Zookeeper masters as a comma-separated list: host1:port1(,host2:port2,...)
  # A chroot, if one is needed, is written once at the very end of the list
  # (host1:port1,host2:port2/chroot) and not after each host
  # (host1:port1/chroot,host2:port2/chroot). "/kafka" is a common chroot, e.g. on CDH clusters.
  lock.master = "localhost:2181"

  message {
    # Name of the input topic
    topic = "OryxInput"
    # Classes of the keys and messages, respectively, that the framework receives
    key-class = "java.lang.String"
    message-class = "java.lang.String"
    # Decoder classes used to read keys and messages, respectively
    key-decoder-class = "org.apache.kafka.common.serialization.StringDeserializer"
    message-decoder-class = "org.apache.kafka.common.serialization.StringDeserializer"
  }
}
# Configuration for the Kafka model update topic
update-topic = {
  # Comma-separated list of Kafka brokers, as host1:port1(,host2:port2,...)
  # Can be null to disable publishing to an update topic
  broker = "localhost:9092"
  lock = {
    # Comma-separated list of Zookeeper masters, as host1:port1(,host2:port2,...)
    # Note that if you need to specify a chroot, it should only appear
    # at the end: host1:port1,host2:port2/chroot, not host1:port1/chroot,host2:port2/chroot
    # The chroot is commonly "/kafka", like on CDH clusters.
    # Can be null to disable publishing to an update topic
    master = "localhost:2181"
  }
  message = {
    # Update topic
    # Can be null to disable publishing to an update topic
    topic = "OryxUpdate"
    # Decoder class used to read, and encoder class used to write, update messages.
    # The decoder must be a Kafka Deserializer and the encoder a Kafka Serializer.
    decoder-class = "org.apache.kafka.common.serialization.StringDeserializer"
    encoder-class = "org.apache.kafka.common.serialization.StringSerializer"
    # Max size in bytes of message to write to the update topic. Don't change this unless you
    # know what you're doing. PMML models larger than this will be passed a location on HDFS,
    # for example. This should match the max.message.bytes configured for the update topic.
    # This value can't be larger than about 64MB in any event.
    # 16777216 bytes = 16MB
    max-size = 16777216
  }
}
# Default Spark key-value pairs shared by the Batch and Speed layers' streaming config
default-streaming-config {
  # Spark properties, listed alphabetically; each key is a Spark configuration property name
  spark.io.compression.codec = "lzf"
  spark.logConf = true
  # Use Kryo as the Spark serializer
  spark.serializer = "org.apache.spark.serializer.KryoSerializer"
  spark.speculation = true
  spark.ui.showConsoleProgress = false
}
# Batch layer configuration
batch {
  # Settings for the streaming framework that runs the Batch layer
  streaming {
    # Spark Streaming master; when using local[n], n must be at least 2
    master = "yarn"
    deploy-mode = "client"
    # Seconds between runs of the computation layer. Default: 21600 (6 hours)
    generation-interval-sec = 21600
    # How many executors to start. On YARN-based deployments this is an upper
    # bound: with dynamic-allocation enabled, fewer may be used while the process is idle
    num-executors = 4
    # Cores given to each executor
    executor-cores = 4
    # Memory given to each executor
    executor-memory = "2g"
    # Heap size of the Batch driver process
    driver-memory = "1g"
    # Whether to enable dynamic allocation. YARN-only, and not always desirable for streaming
    dynamic-allocation = false
    # Spark config key-value pairs, taken from the shared defaults
    config = ${oryx.default-streaming-config}
  }

  # Name of a com.cloudera.oryx.api.batch.BatchLayerUpdate implementation, which
  # defines what is done with current and historical data to update a model
  update-class = null

  storage {
    # Where historical data is stored; may be a local directory, on HDFS, etc.
    data-dir = "file:/tmp/Oryx/data/"
    # Where models are written; may be a local directory, on HDFS, etc.
    model-dir = "file:/tmp/Oryx/model/"
    # Writable classes used to persist keys and messages, respectively
    key-writable-class = "org.apache.hadoop.io.Text"
    message-writable-class = "org.apache.hadoop.io.Text"
    # Data older than this number of hours may be deleted automatically; -1 means no maximum
    max-age-data-hours = -1
    # Models older than this number of hours may be deleted automatically; -1 means no maximum
    max-age-model-hours = -1
  }

  # Spark UI settings: the port the UI uses
  ui.port = 4040
}
# Speed layer configuration
speed {
  # Settings for the streaming framework that runs the Speed layer
  streaming {
    # Spark Streaming master; when using local[n], n must be at least 2
    master = "yarn"
    deploy-mode = "client"
    # NOTE(review): the original comment here pointed to a "tuning rule of thumb above",
    # but no such rule appears in this file -- confirm where it is documented
    # Seconds between runs of the computation layer. Default: 10 seconds
    generation-interval-sec = 10
    # How many executors to start. On YARN-based deployments this is an upper
    # bound: with dynamic-allocation enabled, fewer may be used while the process is idle
    num-executors = 2
    # Cores given to each executor
    executor-cores = 4
    # Memory given to each executor
    executor-memory = "1g"
    # Heap size of the Speed driver process
    driver-memory = "512m"
    # Whether to enable dynamic allocation. YARN-only, and not always desirable for streaming
    dynamic-allocation = false
    # Spark config key-value pairs, taken from the shared defaults
    config = ${oryx.default-streaming-config}
  }

  # Name of a com.cloudera.oryx.api.speed.SpeedModelManager implementation, which
  # produces updates from a SpeedModel and a stream of input
  model-manager-class = null

  # Same meaning as serving.min-model-load-fraction; see the documentation on that key
  min-model-load-fraction = 0.8

  # Spark UI settings: the port the UI uses
  ui.port = 4040
}
# Serving layer configuration
serving {
  # Memory allocated to each serving layer instance. For now this must be given in MB, ending with 'm'
  memory = "4000m"

  # Optional settings, used with YARN-based deployment
  yarn {
    # How many serving layers to run
    instances = 1
    # Cores allocated to each serving layer instance
    # NOTE(review): this value is a quoted string while instances is a bare number --
    # confirm whether the consumer relies on the string form
    cores = "4"
  }

  api {
    # Serving Layer HTTP port; defaults to the well-known HTTP port
    port = 80
    # Serving Layer HTTPS port; defaults to the well-known HTTPS port
    secure-port = 443
    # User name and password for connecting to the service, if required. They must be
    # set together; when set, HTTP DIGEST authentication is enabled in the API.
    user-name = null
    password = null
    # Keystore file holding the server's SSL keys. Only needed when accessing a
    # server with a temporary self-signed key, which the Java SSL implementation
    # does not trust.
    keystore-file = null
    # Password for the keystore file, if it has one
    keystore-password = null
    # Alias of the key to use, if the keystore holds several keys
    key-alias = null
    # When true, operations that set or modify data, like /ingest, are not available
    read-only = false
    # Optional prefix for the path under which the service is deployed. For example,
    # "/contextPath" exposes services at paths like "http://example.org/contextPath/..."
    context-path = "/"
  }

  # Where application JAX-RS resources are loaded from: one or more Java package
  # names, comma-separated
  application-resources = null

  # Name of a com.cloudera.oryx.api.serving.ServingModelManager implementation,
  # which produces a ServingModel from a stream of updates
  model-manager-class = null

  # A model is not considered loaded and ready to use until this fraction of it has
  # been loaded. Some models load incrementally (e.g. ALS); others load all at once,
  # in which case this setting has no effect.
  min-model-load-fraction = 0.8

  # Test-only option; don't set this in general
  no-init-topics = false
}
# ML tier configuration
ml {
  # Settings controlling how models are evaluated
  eval {
    # Share of the current data held out for testing; the rest is used for training
    test-fraction = 0.1
    # Candidate models built per run, from which the best is picked; increase to build more
    candidates = 1
    # Hyperparameter search strategy: "random" for random search, "grid" for grid search
    hyperparam-search = "random"
    # How many models are built in parallel
    parallelism = 1
    # Optional minimum evaluation score a model must reach to be accepted as valid.
    # Some metrics, like squared error for k-means clustering, are better when
    # smaller, and the framework ranks models by the negative of such values. So to
    # require, for example, a *maximum* squared error of 100 for k-means clustering,
    # set a threshold of -100.
    threshold = null
  }
}
}