From f9fc37b16a3a75a233a9653cb8752ee125a0f959 Mon Sep 17 00:00:00 2001 From: Juliet Hougland Date: Wed, 11 Mar 2015 21:45:44 -0700 Subject: [PATCH] Use reflow for Maven generated site. Migrates wiki pages to src/site/markdown/docs. --- .gitignore | 2 +- pom.xml | 28 +- src/site/markdown/contribute.md | 4 + src/site/markdown/docs/api-end-pt-ref.md | 45 ++ .../markdown/docs/faq-and-troubleshooting.md | 8 + src/site/markdown/docs/how-to-release.md | 72 +++ src/site/markdown/docs/index.md | 490 ++++++++++++++++++ src/site/markdown/docs/oryx-1-diff.md | 72 +++ .../markdown/docs/oryx-2-first-release.md | 24 + src/site/markdown/download.md | 4 + src/site/markdown/index.md | 40 ++ src/site/markdown/overview.md | 130 +++++ src/site/site.xml | 34 +- 13 files changed, 944 insertions(+), 9 deletions(-) create mode 100644 src/site/markdown/contribute.md create mode 100644 src/site/markdown/docs/api-end-pt-ref.md create mode 100644 src/site/markdown/docs/faq-and-troubleshooting.md create mode 100644 src/site/markdown/docs/how-to-release.md create mode 100644 src/site/markdown/docs/index.md create mode 100644 src/site/markdown/docs/oryx-1-diff.md create mode 100644 src/site/markdown/docs/oryx-2-first-release.md create mode 100644 src/site/markdown/download.md create mode 100644 src/site/markdown/index.md create mode 100644 src/site/markdown/overview.md diff --git a/.gitignore b/.gitignore index 509a2633e..734acf9d6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ pom.xml.next release.properties *.iml .idea/ -dependency-reduced-pom.xml \ No newline at end of file +dependency-reduced-pom.xml diff --git a/pom.xml b/pom.xml index 013b37f56..44b35155b 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,9 @@ the specific language governing permissions and limitations under the License. 
--> - + 4.0.0 com.cloudera.oryx @@ -517,7 +519,7 @@ enforce - + 3.2.1 @@ -728,6 +730,25 @@ org.apache.maven.plugins maven-site-plugin 3.4 + + + org.apache.maven.doxia + doxia-module-markdown + 1.6 + + + lt.velykis.maven.skins + reflow-velocity-tools + Must depend on snapshot version for markdown to render correctly. <--> + 1.1.2-SNAPSHOT + + + + org.apache.velocity + velocity + 1.7 + + org.apache.maven.plugins @@ -966,7 +987,8 @@ applications on top of Spark, Spark Streaming and Kafka. On this, it provides further support for real-time, large scale machine learning, and end-to-end applications of this support for common machine learning use cases, like recommendations, clustering, - classification and regression. + classification and regression. + https://github.com/OryxProject/oryx 2014 diff --git a/src/site/markdown/contribute.md b/src/site/markdown/contribute.md new file mode 100644 index 000000000..77e0f8a48 --- /dev/null +++ b/src/site/markdown/contribute.md @@ -0,0 +1,4 @@ +# Contribute + +* [Issues](https://github.com/OryxProject/oryx/issues/) +* [Fork on GitHub](https://github.com/OryxProject/oryx/) \ No newline at end of file diff --git a/src/site/markdown/docs/api-end-pt-ref.md b/src/site/markdown/docs/api-end-pt-ref.md new file mode 100644 index 000000000..c5f38fcc8 --- /dev/null +++ b/src/site/markdown/docs/api-end-pt-ref.md @@ -0,0 +1,45 @@ +Javadoc +======= + +See project [javadoc](http://oryxproject.github.io/oryx/apidocs/index.html). + +Bundled Serving Layer Apps +========================== + +Oryx bundles several end-to-end applications, including a Serving Layer with REST endpoints. 
+The bundled app endpoints are:
+
+Collaborative filtering / Recommendation
+----------------------------------------
+
+* [`/recommend`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Recommend.html)
+* [`/recommendToMany`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/RecommendToMany.html)
+* [`/recommendToAnonymous`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/RecommendToAnonymous.html)
+* [`/similarity`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Similarity.html)
+* [`/similarityToItem`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/SimilarityToItem.html)
+* [`/knownItems`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/KnownItems.html)
+* [`/estimate`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Estimate.html)
+* [`/estimateForAnonymous`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/EstimateForAnonymous.html)
+* [`/because`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Because.html)
+* [`/mostSurprising`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/MostSurprising.html)
+* [`/popularRepresentativeItems`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/PopularRepresentativeItems.html)
+* [`/mostPopularItems`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/MostPopularItems.html)
+* [`/mostActiveUsers`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/MostActiveUsers.html)
+* [`/item/allIDs`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/AllItemIDs.html)
+* [`/ready`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Ready.html)
+* 
[`/pref`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Preference.html) +* [`/ingest`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/als/Ingest.html) + +Classification / Regression +--------------------------- + +* [`/predict`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/rdf/Predict.html) +* [`/classificationDistribution`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/rdf/ClassificationDistribution.html) +* [`/train`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/rdf/Train.html) + +Clustering +---------- + +* [`/assign`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/kmeans/Assign.html) +* [`/distanceToNearest`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/kmeans/DistanceToNearest.html) +* [`/add`](http://oryxproject.github.io/oryx/apidocs/com/cloudera/oryx/app/serving/kmeans/Add.html) diff --git a/src/site/markdown/docs/faq-and-troubleshooting.md b/src/site/markdown/docs/faq-and-troubleshooting.md new file mode 100644 index 000000000..45960ea0f --- /dev/null +++ b/src/site/markdown/docs/faq-and-troubleshooting.md @@ -0,0 +1,8 @@ +## Tests show errors, but still pass? + +[This issue](https://github.com/OryxProject/oryx/issues/73) catalogs the problem. These seem +to be ignorable errors that are attributable to the fact that a Zookeeper and Kafka process +are started and stopped rapidly. Although this is done cleanly with some built-in waiting, it +does not seem sufficient. + +There is likely a better answer but exceptions shown in the issue above can be ignored. diff --git a/src/site/markdown/docs/how-to-release.md b/src/site/markdown/docs/how-to-release.md new file mode 100644 index 000000000..1b3e13d68 --- /dev/null +++ b/src/site/markdown/docs/how-to-release.md @@ -0,0 +1,72 @@ +This will only be of interest to the project's maintainers! 
+ +# Prerequisites + +This process requires that you have created a file called, for example, `private-settings.xml` which contains authentication credentials for pushing to the distribution repository. Example: + +``` + + + + + oryx.repo + snapshots + ... + + + oryx.snapshots.repo + snapshots + ... + + + +``` + +It also requires that you have the GPG key that is written into the project POM, and that you know its passphrase. + +# Releasing Maven Artifacts + +1. Clone `master` from the repo: +`git clone https://github.com/OryxProject/oryx.git` + +1. If this is a fresh checkout, optionally configure your user name and email for use with git commits, if not already set globally: +`git config user.name "Your Name"` +`git config user.email "Your Email"` + +1. Double-check that tests pass and packaging succeeds first: +`mvn clean package` + +1. Check for problems or errors first with `-DdryRun`. Consider skipping the (lengthy) tests in these steps with `-DskipTests` if they've been run already. To avoid answering the same question many times, the release and new development versions can be supplied on the command line: +`mvn -Darguments="-DskipTests" -DdryRun -DreleaseVersion=... -DdevelopmentVersion=... release:prepare` + +1. Repeat the above without `-DdryRun`. + +1. Now perform the release. This will require the `gpg` passphrase for the GPG signing key specified in `pom.xml`: +`mvn -s private-settings.xml -Darguments="-DskipTests -Dgpg.passphrase=..." release:perform` + +# Releasing Binaries + +1. To get the latest changes and tags post-build, `git pull --tags` +1. Checkout the build tag for this build with `git checkout tags/...` +1. `mvn -DskipTests clean package` +1. Assembled binaries appear at `oryx-serving/target/oryx-serving-....jar` and likewise for `speed` and `batch` +1. Navigate to the Github release that was just created, at `https://github.com/OryxProject/oryx/releases/tag/...` +1. Edit the title to something more meaningful like `Oryx x.y.z` +1. 
Paste brief release notes into the description, including a link to resolved issues for the associated milestone, usually of the form `https://github.com/OryxProject/oryx/issues?q=milestone%3A...+is%3Aclosed` +1. Attach the Batch, Speed, and Serving layer binaries and save the updated release. + +# Updating the Site + +1. Using the repo above, checkout the release tag: `git checkout -f tags/...` +1. `mvn clean site:site site:stage` +1. In another location, checkout the site branch into a new directory `oryx-gh-pages`: +`git clone -b gh-pages https://github.com/OryxProject/oryx.git oryx-gh-pages` +You may need to set `user.name` and `user.email` as above if it's a fresh clone. +1. `rm -r .../oryx-gh-pages/*` +1. `cp -r target/staging/* .../oryx-gh-pages/` +1. `cd .../oryx-gh-pages` +1. `git add -A .` +1. `git commit -m "Update site for ..."` +1. `git push origin gh-pages` +1. In a minute, check your work at http://oryxproject.github.io/oryx/ +1. Optionally delete the repo cloned above if no longer needed. \ No newline at end of file diff --git a/src/site/markdown/docs/index.md b/src/site/markdown/docs/index.md new file mode 100644 index 000000000..edd8ebc30 --- /dev/null +++ b/src/site/markdown/docs/index.md @@ -0,0 +1,490 @@ +# Documentation + +* [JavaDoc and Other Project Reports](../project-reports.html) +* [API Endpoint Reference](api-end-pt-ref.html) +* [FAQ and Trouble Shooting](faq-and-troubleshooting.html) +* [Differences From Oryx 1](oryx-1-diff.html) +* [Oryx 2 First Release](oryx-2-first-release.html) +* [Wiki](https://github.com/cloudera/oryx/wiki) + +# Build From Source +## Requirements + +Building from source requires: + +* [`git`](http://git-scm.com/), or an IDE that supports Git +* [Apache Maven](http://maven.apache.org/) 3.0.0 or later +* [Java JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html) (not just JRE) 6 or later + +Some or all of these may already be installed on your development machine. 
+
+## Build
+
+Clone the repository from GitHub in your desired local directory, which will create `oryx`. Build it:
+
+```bash
+git clone https://github.com/cloudera/oryx.git
+cd oryx
+mvn -DskipTests install
+```
+
+This will build the following binaries:
+
+* Serving Layer: `serving/target/oryx-serving-x.y.z.jar`
+* Computation Layer: `computation/target/oryx-computation-x.y.z.jar`
+
+# Developing from Source
+
+Note that if you are interested in developing on Oryx, you should probably [fork this repository](https://help.github.com/articles/fork-a-repo) and then work on your own fork, so that you can submit pull requests with changes.
+
+## Older Hadoop Version (< 2.3.0) Note
+
+To use Oryx with versions of Hadoop 2.x prior to 2.3.0, it is necessary to create compatible binaries by recompiling against the specific version of Hadoop you're using. To do so, use the `hadoop200` profile and set the `hadoop.version` property.
+
+`mvn -Phadoop200 -Dhadoop.version=...`
+
+# Cluster Setup
+## Requirements
+
+- Java 7 or later (JRE only is required) (_In the near future, Java 8 may be required_)
+- A Hadoop cluster running the following components:
+  - Apache Hadoop 2.5.0 or later
+  - Apache Zookeeper 3.4.5 or later
+  - Apache Kafka 0.8.2 or later
+  - Apache Spark 1.3.0 or later
+
+[CDH](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh.html)
+5.4.0 and later meet these requirements, although any Hadoop distribution with these
+components should work fine. While the rest of the instructions will refer to a CDH 5.4.0+
+distribution, this is not a requirement.
+
+_Note that the "alpha 1" release requires only Spark 1.2.0, and thus works with CDH 5.3.0+_
+
+A single-node cluster can be sufficient, although running all of these components on one machine
+may require a reasonable amount of RAM.
+
+## Cluster Setup
+
+Install and configure the Hadoop cluster normally. 
The following services need to be enabled: + +- HDFS +- YARN +- Zookeeper +- Kafka +- Spark (on YARN) + +Note that for CDH, Kafka is available as a "CLABS 1.0.0" parcel from +[Cloudera Labs](http://www.cloudera.com/content/cloudera/en/developers/home/cloudera-labs/apache-kafka.html). + +Determine the (possibly several) Kafka brokers that are configured in the cluster, under Instances, +and note their hosts and port. The port is typically 9092. Same for the Zookeeper servers; the default +port here is 2181. Default ports will be used in subsequent examples. + +Where a Kafka broker or Zookeeper server is called for, you can and should specify a comma-separated +list of `host:port` pairs where there are multiple hosts. Example: `your-zk-1:2181,your-zk-2:2181`. + +Also note whether your Zookeeper instance is using a chroot path. This is simply a path suffixed +to the `host:port`, like `your-zk:2181/your-chroot`. +For example in CDH, Kafka uses a `/kafka` chroot, and subsequent examples will +use this chroot. You can omit this if you are not using a chroot. + +Note: if you have multiple Zookeeper servers, and a chroot, only add the chroot once, at +the end: `your-zk-1:2181,your-zk-2:2181/kafka` + +## Verifying Kafka (Optional) + +To quickly verify that Kafka and ZK are running correctly: + +```bash +kafka-topics --create --zookeeper your-zk:2181/kafka \ + --replication-factor 1 --partitions 1 --topic test +kafka-console-consumer --zookeeper your-zk:2181/kafka \ + --topic test --from-beginning +``` + +In another console, take any text file (here `data.csv`) and send it to the topic: + +```bash +cat data.csv | kafka-console-producer \ + --broker-list your-kafka-broker:9092 --topic test +``` + +You should see the contents of the text file echoed onto the other consumer's console soon thereafter. + +Delete the test topic when done. 
+ +```bash +kafka-topics --delete --zookeeper your-zk:2181/kafka --topic test +``` + +## Configuring Kafka + + +Oryx will use two Kafka topics for data transport. One carries input data to the batch and +Speed Layer, and the other carries model updates from there on to the Serving Layer. The default +names of these topics are "OryxInput" and "OryxUpdate" respectively. They need to be +created before Oryx is started. + +Each can default to have one partition, but more can be configured if much higher read +throughput is needed. +The example below shows 1 partition. Replication factor can be any value, but 3 is recommended. + +```bash +kafka-topics --create --zookeeper your-zk:2181/kafka \ + --replication-factor 3 --partitions 1 --topic OryxInput +... +Created topic "OryxInput". +``` + +```bash +kafka-topics --create --zookeeper your-zk:2181/kafka \ + --replication-factor 3 --partitions 1 --topic OryxUpdate +... +Created topic "OryxUpdate". +``` + +You may need to configure the retention time for one or both topics. In particular, +it's typically important to limit the retention time for the update topic, since the Speed +and Serving Layer read the entire topic from the start on startup to catch up. Setting it +to twice the Batch Layer update interval is a good start. For example, to set it to 2 days +(2 * 24 * 60 * 60 * 1000 = 172800000 ms): + +```bash +kafka-topics --zookeeper your-zk:2181/kafka --alter --topic OryxUpdate \ + --config retention.ms=172800000 +``` + +This is not as important for the input topic, which is not re-read from the beginning. + +Continue to [[Running-Oryx]] to start the servers, and run an example. 
+ +# Configuration +Refer to the default configuration file for a list and explanation of configuration parameters: +[`reference.conf`](/OryxProject/oryx/blob/master/framework/oryx-common/src/main/resources/reference.conf) + +Skeleton examples may be found at: + +- [`app/conf/als-example.conf`](/OryxProject/oryx/blob/master/app/conf/als-example.conf) +- [`app/conf/kmeans-example.conf`](/OryxProject/oryx/blob/master/app/conf/kmeans-example.conf) +- [`app/conf/rdf-example.conf`](/OryxProject/oryx/blob/master/app/conf/rdf-example.conf) + +# Running Oryx +_This is a temporary, manual process for distributing and running the binaries._ + +## Running + +Download the [latest release](https://github.com/OryxProject/oryx/releases) of the Oryx Batch, Speed and Serving Layer, both `.jar` files and `.sh` scripts. Alternatively, build them from source (see [[Building-from-Source]]). + +Copy binaries and scripts to machines that are part of the Hadoop cluster. +They may be deployed on different machines, or on one for purposes of testing. +The Speed and Batch Layers should run on at most one machine, each. The Serving Layer +can run on many. + +Create a configuration file for your application. You may start with the example in +[conf/als-example.conf](/OryxProject/oryx/blob/master/app/conf/als-example.conf). Modify +host names, ports and directories. In particular, choose data and model directories on HDFS +that exist and will be accessible to the user running Oryx binaries. + +Copy this config file as `example.conf` to the same directory as binaries and script +on each machine. + +Run the three Layers with: + +```bash +./run.sh --layer-jar oryx-batch-2.0.0-SNAPSHOT.jar --conf example.conf +... +./run.sh --layer-jar oryx-speed-2.0.0-SNAPSHOT.jar --conf example.conf +... 
+./run.sh --layer-jar oryx-serving-2.0.0-SNAPSHOT.jar --conf example.conf
+```
+
+These need not be on the same machine, but may be (if configuration specifies different
+ports for the Batch and Speed Layer Spark web UI, and the Serving Layer API port).
+The Serving Layer may be run on several machines.
+
+That's all!
+
+## Trying the ALS Example
+
+If you've used the configuration above, you are running an instance of the ALS-based
+recommender application.
+
+Obtain the [GroupLens 100k](http://grouplens.org/datasets/movielens/) data set and find the
+`u.data` file within. This needs to be converted to csv:
+
+```bash
+tr '\t' ',' < u.data > data.csv
+```
+
+You may wish to monitor the content of the input and update topic while it is in action.
+[[Cluster-Setup]] explains how to tail topics with `kafka-console-consumer`. The topics are
+named `OryxInput` and `OryxUpdate` by default.
+
+Push the input to a Serving Layer, with a local command line tool like `wget`:
+
+```bash
+wget --post-file data.csv \
+    --output-document - \
+    --header "Content-Type: text/csv" \
+    http://your-serving-layer:8080/ingest
+```
+
+If you are tailing the input topic, you should see a large amount of CSV data flow to the topic:
+
+```
+196,242,3.0,881250949
+186,302,3.0,891717742
+22,377,1.0,878887116
+244,51,2.0,880606923
+166,346,1.0,886397596
+298,474,4.0,884182806
+...
+```
+
+Soon, you should also see the Batch Layer trigger a new computation. The example configuration
+starts one every 5 minutes.
+
+The data is first written to HDFS. The example configuration has
+it written to directories under `hdfs:///user/example/Oryx/data/`. Within are directories
+named by timestamp, each containing Hadoop `part-r-*` files, which contain the input as
+`SequenceFile`s of `Text`. Although not pure text, printing them should yield some recognizable
+data because it is in fact text. 
+ +``` +SEQorg.apache.hadoop.io.Textorg.apache.hadoop.io.Text����^�]�XسN�22,377,1.0,87888711662... +``` + +A model computation then begins. This should show as a number of new distributed jobs the +Batch Layer. Its Spark UI is started at `http://your-batch-layer:4040` in the example +configuration. + +Soon the model will complete, and it will be persisted as a combination of PMML and supporting +data files in a subdirectory of `hdfs:///user/example/Oryx/model/`. For example, the +`model.pmml.gz` files are compressed PMML files containing elements like: + +``` + + +
+ + 2014-12-18T04:48:54-0800 +
+ + + + + + + 56 168 222 343 397 ... + ... +``` + +The `X/` and `Y/` subdirectories next to it contain feature vectors, like: + +``` +[56,[0.5746282834154238,-0.08896614131333057,-0.029456222765775263, + 0.6039821219690552,0.1497901814774658,-0.018654312114339863, + -0.37342063488340266,-0.2370768843521807,1.148260034028485, + 1.0645643656769153]] +[168,[0.8722769882777296,0.4370416943031704,0.27402044461549885, + -0.031252701117490456,-0.7241385753098256,0.026079081002582338, + 0.42050973702065714,0.27766923396205817,0.6241033215856671, + -0.48530795198811266]] +... +``` + +If you are tailing the update topic, you should also see these values published to the +topic. + +The Serving Layer will pick this up soon thereafter, and the `/ready` endpoint will return +status `200 OK`: + +```bash +wget --quiet --output-document - \ + --server-response \ + http://your-serving-layer:8080/ready +... + HTTP/1.1 200 OK + Content-Length: 0 + Date: Thu, 18 Dec 2014 13:26:53 GMT + Server: Oryx +``` + +```bash +wget --quiet --output-document - \ + http://your-serving-layer:8080/recommend/17 +... +50,0.7749542842056966 +275,0.7373013861581563 +258,0.731818692628511 +181,0.7049967175706345 +127,0.704518989947498 +121,0.7014631029793741 +15,0.6954683387287907 +288,0.6774889711024022 +25,0.6663619887033064 +285,0.6398968471343595 +``` + +Congratulations, it's a live recommender! + +When done, all processes can be killed with Ctrl-C safely. + +See more about endpoints that are available in [[API-Endpoint-Reference]]. + +# Making an Oryx App +Oryx comes with an "app tier", implementations of actual Batch, Speed and Serving Layer +logic for recommendation, clustering and classification. However, any implementation +may be used with Oryx. They can be mixed and matched too. For example, you could reimplement +the Batch Layer for ALS-related recommendation and instead supply this alternative +implementation while still using the provided ALS Serving and Speed Layers. 
+ +## Creating an App + +In each case, creating a custom Batch, Speed or Serving Layer app amounts to implementing +one Java interface or Scala trait. These interfaces/traits are found in the `oryx-api` module +within the project. + +| | Java | +| -------:|:--------------------------------------------------- | +| Batch | `com.cloudera.oryx.api.batch.BatchLayerUpdate` | +| Speed | `com.cloudera.oryx.api.speed.SpeedModelManager` | +| Serving | `com.cloudera.oryx.api.serving.ServingModelManager` | + +| | Scala | +| -------:|:-------------------------------------------------------- | +| Batch | `com.cloudera.oryx.api.batch.ScalaBatchLayerUpdate` | +| Speed | `com.cloudera.oryx.api.speed.ScalaSpeedModelManager` | +| Serving | `com.cloudera.oryx.api.serving.ScalaServingModelManager` | + +## Building an App + +To access these interfaces/traits in your application, add a dependency on +`com.cloudera.oryx:oryx-api`. The scope should be `provided`. + +In Maven, this would mean adding a dependency like: + +```XML + + + com.cloudera.oryx + oryx-api + provided + 2.0.0 + + +``` + +A minimal skeleton project can be found at [example/](/OryxProject/oryx/tree/master/app/example). + +Compile your code and create a JAR file containing only your implementation, and any supporting +third-party code. With Maven, this happens with `mvn package`. + +## Customizing an Oryx App + +When deploying the prepackaged applications that come with Oryx, in some cases, it's possible +to supply additional implementations to customize their behavior. For example, the ALS recommender +application exposes a `com.cloudera.oryx.app.als.RescorerProvider` interface. +These app-specific API classes are found in module `oryx-app-api`. Implementations of +interfaces like these can be compiled, packaged and deployed in the same way described +here for stand-alone applications. 
+ +```XML + + + com.cloudera.oryx + oryx-app-api + provided + 2.0.0 + + +``` + +## Deploying an App + +Copy the resulting JAR file -- call it `myapp.jar` -- to the directory containing the +Oryx binary JAR file it will be run with. + +Change your Oryx `.conf` file to refer to your custom Batch, Speed or Serving implementation +class, as appropriate. + +When running the Batch / Speed / Serving Layers, add `--app-jar myapp.jar` to the `run.sh` +command line. + +# Handling Failure + +Eventually, you'll want to stop one or more of the Layers running, or restart it. Or maybe +a server decides to die. What happens then? What's the worst that can happen? + +## Data Loss + +Historical data is saved in HDFS, which should be configured for replication. HDFS ensures +data is stored reliably. Kafka is also designed to cope with failure when configured to use +replication. + +That is, there is nothing special to do here in order to ensure that data is +never completely lost. It is the job of HDFS and Kafka to always be available and not lose +data. + +## Server Failure + +In general, all three Layer server processes should run continuously, and can and should be +restarted immediately if they have to be stopped, or in case of a failure. +This can be accomplished with an init script or similar mechanism (not included, yet). + +### Serving Layer + +The Serving Layer has no state. On startup, it reads all models and updates available on the +update topic. It begins answering queries as soon as any first, valid model is +available. For this reason, it's desirable to limit the retention time for the update topic. + +The operation of the Serving Layer is not distributed. Each instance is independent, and may +stop or start without affecting others. + +### Speed Layer + +The Speed Layer also has no state, and also reads all models and updates available on the +update topic. It begins producing updates as soon as it has a valid model. 
It also begins +reading from the input topic, and at the moment, always reads from the latest offset. + +The Speed Layer uses Spark Streaming and Spark for some of its computation. Spark has +the responsibility of dealing with failures during computation in the cluster and retrying +tasks. + +Spark Streaming's Kafka integration can in some cases recover from failure of the receiver +that is reading from Kafka. If the entire process dies and is restarted, and `oryx.id` has +been set, then reading will be able to resume from the last offset recorded by Kafka. +(Otherwise, it will resume reading from the latest offset. This means data that arrived +while no Speed Layer was running will not have produced any update.) Also, data that arrives +before the Speed Layer has a model is ignored too. It effectively adopts "at most once" +semantics. + +Because the role of the Speed Layer is to provide an approximate, "best effort" update to the +last published model, this behavior is generally no problem, and desirable because of its +simplicity. + +### Batch Layer + +The Batch Layer is the most complex, since it does generate some state: + +- Historical data, is always persisted to HDFS +- If the app chooses to, additional state like models can be persisted to HDFS as well as topics + +It also is most sensitive to reading data multiple times or not at all, since it is the component +that creates the "official" next model. + +As with the Speed Layer, Spark and Spark Streaming handles many of the failure scenarios during +computation. It also manages storing data to HDFS and is responsible for avoiding writing the +same data twice. + +Applications are responsible for recovering their own 'state'; currently, +applications built on the Oryx ML tier write state into unique subdirectories, +and will simply produce a new set of state in a new directory when restarted. +Previous state, if it exists, will have been completely written or not at all. 
+ +The Batch Layer also currently adopts the same "at most once" semantics as the Speed Layer. +As above, if the entire process dies and is restarted, and `oryx.id` has +been set, then reading will be able to resume from the last offset recorded by Kafka, +and otherwise, it will resume reading from the latest offset. + diff --git a/src/site/markdown/docs/oryx-1-diff.md b/src/site/markdown/docs/oryx-1-diff.md new file mode 100644 index 000000000..c1c84b8d9 --- /dev/null +++ b/src/site/markdown/docs/oryx-1-diff.md @@ -0,0 +1,72 @@ +In broad terms, Oryx 2 is more extensible and scalable, and built on more modern technology than Oryx 1, which began from a design 4 years old now. It is however, a more complex system to deploy and operate. This page highlights some key differences. + +# Oryx 2 Design Goals + +1. Provide a more reusable platform for [lambda-architecture](http://lambda-architecture.net/)-style +designs, with batch, speed and serving layers +1. Make each layer usable independently +1. Better support for common machine learning needs + - Test/train set split and evaluation + - Parallel model build + - Hyper-parameter selection +1. Use newer technologies like Spark and Streaming in order to simplify: + - Remove separate in-core implementations for scale-down + - Remove custom data transport implementation in favor of [Apache Kafka](http://kafka.apache.org/) + - Use a 'real' streaming framework instead of reimplementing a simple one + - Remove complex MapReduce-based implementations in favor of + [Apache Spark](http://spark.apache.org/)-based implementations +1. Support more input (i.e. 
not just [CSV](http://en.wikipedia.org/wiki/Comma-separated_values)) + +# Key Differences + +## Architecture + +| Oryx 1 | Oryx 2 | +| ------ | ------ | +| One monolithic "tier" for lambda architecture and apps | Three tiers: lambda, ML, apps | +| No app-level extensibility | Platform for building other lambda- and ML-based apps | +| Two layers: Computation and Serving | Three layers: Batch, Speed and Serving | +| Based on Crunch, MapReduce, HDFS, Tomcat | Based on HDFS, YARN, Spark (+ Streaming, MLlib), Kafka, Zookeeper, Tomcat | +| 27K lines production code / 4K test | 10K lines production code / 7.5K test: simpler, better tested | + +## Deployment + +| Oryx 1 | Oryx 2 | +| ------ | ------ | +| Requires Java 6, optionally core Hadoop 2.2+ (including "MR1") | Requires Java 7, core Hadoop 2.5+ (YARN, not "MR1") Spark 1.3+, Kafka 0.8.2+, Zookeeper 3.4.5+ | +| Supports local, non-Hadoop deployment | No non-Hadoop deployment | +| Supports MapReduce-based Hadoop deployment | Supports only deployment with core Hadoop, YARN, Spark, Kafka | + +## Scale and Reliability + +| Oryx 1 | Oryx 2 | +| ------ | ------ | +| Memory-efficient | Fast, memory-hungry | +| Custom, best-effort data transport between layers | Reliable data transport via Kafka | +| Custom MapReduce-based algorithm implementations in Computation Layer | Spark Streaming-based batch layer framework and Spark MLlib-based algorithm implementations | +| Custom in-core incremental model update ("speed layer") | Spark Streaming-based distributed model update | + +# Migration Guide + +The bad news is that no direct migration is possible between Oryx 1 and Oryx 2; they have very different implementations. However, differences in the user- and developer-facing aspects are by design similar or identical. + +## REST API + +Oryx 2 contains the same set of end-to-end ML applications as Oryx 1, and exposes virtually the same REST API, unchanged. 
The only significant difference is that there is no longer a `/refresh` endpoint, because it is unnecessary. + +## Configuration + +Both implementations use a single configuration file parsed by Typesafe Config. The property namespaces are different but there are some similarities. Compare the [Oryx 1 configuration](https://github.com/cloudera/oryx/blob/master/common/src/main/resources/reference.conf) to the [Oryx 2 configuration](https://github.com/OryxProject/oryx/blob/master/oryx-common/src/main/resources/reference.conf) to understand some of the correspondence and difference. + +## Data Storage and Transport + +In Oryx 1, all data was stored in a series of directories in HDFS. In Oryx 2, data is transported via Kafka (which ultimately stores data in HDFS) and in HDFS as managed by a Spark Streaming process. Although it is still possible to side-load data files via HDFS in Oryx 2, it is not supported and is discouraged, in favor of sending data directly to a Kafka queue. + +## Data Formats + +In theory, the framework is agnostic to data types and encodings passed between layers. In practice, the provided applications consume the same CSV-encoded data format as Oryx 1. + +## Deployment + +The deployment requirements are the most different. Although all layers are still distributed as Java `.jar` binaries, now, a Hadoop cluster is required, including HDFS, YARN, Kafka, Spark, and Zookeeper services. Your environment or cluster must be updated to include these services before you can use Oryx 2. + diff --git a/src/site/markdown/docs/oryx-2-first-release.md b/src/site/markdown/docs/oryx-2-first-release.md new file mode 100644 index 000000000..1a8e2fa68 --- /dev/null +++ b/src/site/markdown/docs/oryx-2-first-release.md @@ -0,0 +1,24 @@ +At last! The first alpha release of Oryx 2 is now available. + +Oryx 2 is a realization of the lambda architecture built on Apache Spark and Apache Kafka, but with specialization for real-time large scale machine learning. 
It is a framework for building applications, but also includes packaged, end-to-end applications for collaborative filtering, classification, regression and clustering. + +What's different from Oryx 1? +https://github.com/OryxProject/oryx/wiki/Differences-From-Oryx-1 + +Grab releases at: +https://github.com/OryxProject/oryx/releases + +Get started with cluster setup, and running an example: +https://github.com/OryxProject/oryx/wiki/Cluster-Setup +https://github.com/OryxProject/oryx/wiki/Running-Oryx + +We need your help! Did it work? Did you encounter problems? What do you think of the deployment? What features would you like to see? + +Issues: +https://github.com/OryxProject/oryx/issues + +Questions and discussion: +https://community.cloudera.com/t5/Data-Science-and-Machine/bd-p/Mahout +http://stackoverflow.com/questions/tagged/oryx + +Much more to come! Thank you to all users and contributors. \ No newline at end of file diff --git a/src/site/markdown/download.md b/src/site/markdown/download.md new file mode 100644 index 000000000..bd0f4552a --- /dev/null +++ b/src/site/markdown/download.md @@ -0,0 +1,4 @@ +# Download + +* [Download](https://github.com/OryxProject/oryx/) +* [License](./license.html) diff --git a/src/site/markdown/index.md b/src/site/markdown/index.md new file mode 100644 index 000000000..f5b974025 --- /dev/null +++ b/src/site/markdown/index.md @@ -0,0 +1,40 @@ + + +At last! The first alpha release of Oryx 2 is now available. + +Oryx 2 is a realization of the lambda architecture built on Apache Spark and Apache Kafka, but +with specialization for real-time large scale machine learning. It is a framework for building +applications, but also includes packaged, end-to-end applications for collaborative filtering, +classification, regression and clustering. + +# What's different from Oryx 1? 
+[Differences from Oryx 1](https://github.com/OryxProject/oryx/wiki/Differences-From-Oryx-1) + +Grab releases at: +[Releases](https://github.com/OryxProject/oryx/releases) +or depend on the artifact using maven coordinates: + + + com.cloudera.oryx + ${oryx.artifactID} + 2.0.0-alpha-1 + + +Get started with cluster setup, and running an example: +[Cluster Setup](https://github.com/OryxProject/oryx/wiki/Cluster-Setup) +[Running Oryx](https://github.com/OryxProject/oryx/wiki/Running-Oryx) + +We need your help! Did it work? Did you encounter problems? What do you think of the deployment? +What features would you like to see? + +#Development + +## Issues +[Github Issues](https://github.com/OryxProject/oryx/issues) + +## Questions and discussion + +- [Cloudera Data Science and Machine Learning](https://community.cloudera.com/t5/Data-Science-and-Machine/bd-p/Mahout) +- [Stack Overflow Qs Tagged with Oryx](http://stackoverflow.com/questions/tagged/oryx) + +Much more to come! Thank you to all users and contributors. \ No newline at end of file diff --git a/src/site/markdown/overview.md b/src/site/markdown/overview.md new file mode 100644 index 000000000..f9948a69f --- /dev/null +++ b/src/site/markdown/overview.md @@ -0,0 +1,130 @@ +# Overview +Oryx 2 is a realization of the lambda architecture built on [Apache Spark](http://spark.apache.org) +and [Apache Kafka](http://kafka.apache.org), but with specialization for real-time large scale machine +learning. It is a framework for building applications, but also includes packaged, end-to-end +applications for collaborative filtering, classification, regression and clustering. + +Oryx 2 is a rearchitecting and continuation of the original [Oryx 1](http://github.com/cloudera/oryx) +project. The [Differences from Oryx 1 wiki](https://github +.com/OryxProject/oryx/wiki/Differences-From-Oryx-1) describes changes. + +It consists of three _tiers_, each of which builds on the one below: + +1. 
A generic lambda architecture tier, providing batch/speed/serving layers, which is not
+specific to machine learning
+1. A specialization on top providing ML abstractions for hyperparameter selection, etc.
+1. An end-to-end implementation of the same standard ML algorithms as an application
+([ALS](http://labs.yahoo.com/files/HuKorenVolinsky-ICDM08.pdf),
+[random decision forests](http://en.wikipedia.org/wiki/Random_forest),
+[k-means](http://en.wikipedia.org/wiki/K-means_clustering)) on top
+
+Viewed another way, it contains the three side-by-side cooperating _layers_ of the lambda
+architecture too, as well as a connecting element:
+
+1. A *Batch Layer*, which computes a new "result" (think model, but it could be anything)
+as a function of all historical data, and the previous result. This may be a long-running operation
+which takes hours, and runs a few times a day for example.
+2. A *Speed Layer*, which produces and publishes incremental model updates from a
+stream of new data. These updates are intended to happen on the order of seconds.
+3. A *Serving Layer*, which receives models and updates and implements a synchronous API exposing
+query operations on the result.
+4. A data transport layer, which moves data between layers and receives input from external sources
+
+The project may be reused tier by tier: for example, the packaged app tier can be ignored, and it
+can be a framework for building new ML applications. It can be reused layer by layer too:
+for example, the Speed Layer can be omitted if a deployment does not need incremental updates.
+It can be modified piece-by-piece too: the collaborative filtering application's model-building
+batch layer could be swapped for a custom implementation based on a new algorithm outside
+Spark MLlib while retaining the serving and speed layer implementations.
+ + + +## Module Mapping + +Major modules and their relation to tiers and layers: + +| | *Serving* | *Speed* | *Batch* | +| --------:| --------------------- | --------------------------- | --------------------------- | +| *Binary* | `oryx-serving` | `oryx-speed` | `oryx-batch` | +| *App* | `oryx-app-serving` | `oryx-app-mllib` `oryx-app` | `oryx-app-mllib` `oryx-app` | +| *ML* | | `oryx-ml` | `oryx-ml` | +| *Lambda* | `oryx-lambda-serving` | `oryx-lambda` | `oryx-lambda` | + +Supporting modules like `oryx-common`, `oryx-app-common`, `oryx-api`, `oryx-app-api` are not shown. + +# Lambda Tier Implementation + +## Data transport + +The data transport mechanism is an [Apache Kafka](http://kafka.apache.org/) topic. +Any process -- including but not limited to the serving layer -- can put data onto the topic, +to be seen by the speed and batch layers. Kafka topics are also used to publish both +*models* and *model updates*, for consumption by the speed and serving layers. + +## Batch Layer + +The batch layer is implemented as a [Spark Streaming](http://spark.apache.org/streaming/) +process on a Hadoop cluster, which reads data from the input Kafka topic. The Streaming process +necessarily has a very long period -- hours or even a day. It uses Spark to save the +current window of data to HDFS, and then combine with all historical data on HDFS, and +initiate building of a new result. The result is written to HDFS, and, also published +to a Kafka update topic. + +## Speed Layer + +The speed layer is implemented as a Spark Streaming process as well, which also listens for +data from the input Kafka topic. It has a much shorter period, on the order of seconds. +It periodically loads a new model from the update topic and continually produces model updates. +These are put back onto the update topic too. + +## Serving Layer + +The serving layer listens for model and model updates on the update topic. It maintains model +state in memory. 
It exposes an HTTP +[REST](http://en.wikipedia.org/wiki/Representational_state_transfer) API on top of methods +that query the model in memory. Many of these may be deployed for scale. Each may +also accept new data and write it to Kafka where it can be seen by all Speed layers. +The different layer defaults are provided by a `reference.conf`. Applications are expected to provide their +own conf file to specify properties that determine what application resources are required. This can be provided +using an `application.conf` available in the classpath (which Typesafe Config loads automatically) or +by providing a command line argument which specifies a config file to load +(e.g. `-Dconfig.file=/cfg.conf`) or even by specifying individual properties on the command line +(e.g. `-D=`). + +## Usage and Deployment + +The application is written in Java, using Spark 1.2.x+, +[Hadoop](http://hadoop.apache.org/) 2.5.x+, [Tomcat](http://tomcat.apache.org/) 8.x+, +Kafka 0.8.2+, [Zookeeper](http://zookeeper.apache.org/) and more. Configuration uses a single +[Typesafe Config](https://github.com/typesafehub/config) config file, wherein +applications configure an entire deployment of the system. This includes implementations of +key interface classes which implement the batch, speed, and serving logic. Applications +package and deploy their implementations with each instance of the layer binaries. Each +of these is a runnable Java `.jar` which starts all necessary services. + +# ML Tier Implementation + +The ML tier is simply an implementation and specialization of the generic interfaces mentioned +above, which implement common ML needs and then expose a different ML-specific interface for +applications to fill in. + +For example, it implements a batch layer update process that selects a test and training set +automatically. It calls an app-supplied function to evaluate the model on the test set. 
+It can automatically repeat this, and with different hyperparameter values, choosing the best +result. It manages serialization of the model via +[PMML](http://www.dmg.org/v4-2-1/GeneralStructure.html). + +# End-to-end Application Implementation + +In addition to being a framework, Oryx 2 contains complete implementations of the batch, speed and +serving layer for three machine learning use cases. These are ready to deploy out-of-the-box, or to be +used as the basis for a custom application: + +- Collaborative filtering / recommendation based on Alternating Least Squares +- Clustering based on k-means +- Classification and regression based on random decision forests + +The REST API endpoints provided by the serving layer application implementations is documented in the +[API Endpoint Reference](https://github.com/OryxProject/oryx/wiki/API-Endpoint-Reference) wiki. +Example configuration that enables these applications is available in the +[Configuration Reference](https://github.com/OryxProject/oryx/wiki/Configuration-Reference) wiki. diff --git a/src/site/site.xml b/src/site/site.xml index 65153e2f9..992b0c25d 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -19,14 +19,38 @@ name="${project.name}"> - org.apache.maven.skins - maven-fluido-skin - 1.3.1 + lt.velykis.maven.skins + reflow-maven-skin + 1.1.1 + + Oryx + http://oryxproject.github.io/oryx/ + + + + + side + 6 + + Oryx 2 + index.html + + bootswatch-united + sidebar + + + + - - + + + + + + +