Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
179 lines (115 sloc) 17 KB
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>README</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
/*
This document has been created with Marked.app <http://markedapp.com>, Copyright 2011 Brett Terpstra
Please leave this notice in place, along with any additional credits below.
---------------------------------------------------------------
Title: GitHub
Author: Brett Terpstra
Description: Github README style. Includes theme for Pygmentized code blocks.
*/
html,body{color:black}*{margin:0;padding:0}body{font:13.34px helvetica,arial,freesans,clean,sans-serif;-webkit-font-smoothing:antialiased;line-height:1.4;padding:3px;background:#fff;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px}p{margin:1em 0}a{color:#4183c4;text-decoration:none}#wrapper{background-color:#fff;border:3px solid #eee!important;padding:0 30px;margin:15px}#wrapper{font-size:14px;line-height:1.6}#wrapper>*:first-child{margin-top:0!important}#wrapper>*:last-child{margin-bottom:0!important}h1,h2,h3,h4,h5,h6{margin:0;padding:0}h1{margin:15px 0;padding-bottom:2px;font-size:24px;border-bottom:1px solid #eee}h2{margin:20px 0 10px 0;font-size:18px}h3{margin:20px 0 10px 0;padding-bottom:2px;font-size:14px;border-bottom:1px solid #ddd}h4{font-size:14px;line-height:26px;padding:18px 0 4px;font-weight:bold;text-transform:uppercase}h5{font-size:13px;line-height:26px;padding:14px 0 0;font-weight:bold;text-transform:uppercase}h6{color:#666;font-size:14px;line-height:26px;padding:18px 0 0;font-weight:normal;font-variant:italic}hr{background:transparent 
url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAYAAAAECAYAAACtBE5DAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyJpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMC1jMDYwIDYxLjEzNDc3NywgMjAxMC8wMi8xMi0xNzozMjowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNSBNYWNpbnRvc2giIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6OENDRjNBN0E2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6OENDRjNBN0I2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo4Q0NGM0E3ODY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo4Q0NGM0E3OTY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PqqezsUAAAAfSURBVHjaYmRABcYwBiM2QSA4y4hNEKYDQxAEAAIMAHNGAzhkPOlYAAAAAElFTkSuQmCC) repeat-x 0 0;border:0 none;color:#ccc;height:4px;margin:20px 0;padding:0}#wrapper>h2:first-child,#wrapper>h1:first-child,#wrapper>h1:first-child+h2{border:0;margin:0;padding:0}#wrapper>h3:first-child,#wrapper>h4:first-child,#wrapper>h5:first-child,#wrapper>h6:first-child{margin:0;padding:0}h4+p,h5+p,h6+p{margin-top:0}li p.first{display:inline-block}ul,ol{margin:15px 0 15px 25px}ul li,ol li{margin-top:7px;margin-bottom:7px}ul li>*:last-child,ol li>*:last-child{margin-bottom:0}ul li>*:first-child,ol li>*:first-child{margin-top:0}#wrapper>ul,#wrapper>ol{margin-top:21px;margin-left:36px}dl{margin:0;padding:20px 0 0}dl dt{font-size:14px;font-weight:bold;line-height:normal;margin:0;padding:20px 0 0}dl 
dt:first-child{padding:0}dl dd{font-size:13px;margin:0;padding:3px 0 0}blockquote{margin:14px 0;border-left:4px solid #ddd;padding-left:11px;color:#555}table{border-collapse:collapse;margin:20px 0 0;padding:0}table tr{border-top:1px solid #ccc;background-color:#fff;margin:0;padding:0}table tr:nth-child(2n){background-color:#f8f8f8}table tr th,table tr td{border:1px solid #ccc;text-align:left;margin:0;padding:6px 13px}img{max-width:100%;height:auto}code,tt{margin:0 2px;padding:2px 5px;white-space:nowrap;border:1px solid #ccc;background-color:#f8f8f8;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px;font-size:12px}pre>code{margin:0;padding:0;white-space:pre;border:0;background:transparent;font-size:13px}.highlight pre,pre{background-color:#f8f8f8;border:1px solid #ccc;font-size:13px;line-height:19px;overflow:auto;padding:6px 10px;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px}#wrapper>pre,#wrapper>div.highlight{margin:10px 0 0}pre code,pre tt{background-color:transparent;border:0}#wrapper{background-color:#fff;border:1px solid #cacaca;padding:30px}.poetry pre{font-family:Georgia,Garamond,serif!important;font-style:italic;font-size:110%!important;line-height:1.6em;display:block;margin-left:1em}.poetry pre code{font-family:Georgia,Garamond,serif!important}sup,sub,a.footnote{font-size:1.4ex;height:0;line-height:1;vertical-align:super;position:relative}sub{vertical-align:sub;top:-1px}@media print{body{background:#fff}img,pre,blockquote,table,figure{page-break-inside:avoid}#wrapper{background:#fff;border:0}code{background-color:#fff;color:#444!important;padding:0 .2em;border:1px solid #dedede}pre code{background-color:#fff!important;overflow:visible}pre{background:#fff}}@media screen{body.inverted,.inverted #wrapper,.inverted hr .inverted p,.inverted td,.inverted li,.inverted h1,.inverted h2,.inverted h3,.inverted h4,.inverted h5,.inverted h6,.inverted th,.inverted .math,.inverted caption,.inverted dd,.inverted dt,.inverted 
blockquote{color:#eee!important;border-color:#555}.inverted td,.inverted th{background:#333}.inverted pre,.inverted code,.inverted tt{background:#444!important}.inverted h2{border-color:#555}.inverted hr{border-color:#777;border-width:1px!important}::selection{background:rgba(157,193,200,.5)}h1::selection{background-color:rgba(45,156,208,.3)}h2::selection{background-color:rgba(90,182,224,.3)}h3::selection,h4::selection,h5::selection,h6::selection,li::selection,ol::selection{background-color:rgba(133,201,232,.3)}code::selection{background-color:rgba(0,0,0,.7);color:#eee}code span::selection{background-color:rgba(0,0,0,.7)!important;color:#eee!important}a::selection{background-color:rgba(255,230,102,.2)}.inverted a::selection{background-color:rgba(255,230,102,.6)}td::selection,th::selection,caption::selection{background-color:rgba(180,237,95,.5)}.inverted{background:#0b2531}.inverted #wrapper,.inverted{background:rgba(37,42,42,1)}.inverted a{color:rgba(172,209,213,1)}}.highlight .c{color:#998;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .k{font-weight:bold}.highlight .o{font-weight:bold}.highlight .cm{color:#998;font-style:italic}.highlight .cp{color:#999;font-weight:bold}.highlight .c1{color:#998;font-style:italic}.highlight .cs{color:#999;font-weight:bold;font-style:italic}.highlight .gd{color:#000;background-color:#fdd}.highlight .gd .x{color:#000;background-color:#faa}.highlight .ge{font-style:italic}.highlight .gr{color:#a00}.highlight .gh{color:#999}.highlight .gi{color:#000;background-color:#dfd}.highlight .gi .x{color:#000;background-color:#afa}.highlight .go{color:#888}.highlight .gp{color:#555}.highlight .gs{font-weight:bold}.highlight .gu{color:#800080;font-weight:bold}.highlight .gt{color:#a00}.highlight .kc{font-weight:bold}.highlight .kd{font-weight:bold}.highlight .kn{font-weight:bold}.highlight .kp{font-weight:bold}.highlight .kr{font-weight:bold}.highlight .kt{color:#458;font-weight:bold}.highlight 
.m{color:#099}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .nb{color:#0086b3}.highlight .nc{color:#458;font-weight:bold}.highlight .no{color:#008080}.highlight .ni{color:#800080}.highlight .ne{color:#900;font-weight:bold}.highlight .nf{color:#900;font-weight:bold}.highlight .nn{color:#555}.highlight .nt{color:#000080}.highlight .nv{color:#008080}.highlight .ow{font-weight:bold}.highlight .w{color:#bbb}.highlight .mf{color:#099}.highlight .mh{color:#099}.highlight .mi{color:#099}.highlight .mo{color:#099}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .bp{color:#999}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .il{color:#099}.highlight .gc{color:#999;background-color:#eaf2f5}.type-csharp .highlight .k{color:#00F}.type-csharp .highlight .kt{color:#00F}.type-csharp .highlight .nf{color:#000;font-weight:normal}.type-csharp .highlight .nc{color:#2b91af}.type-csharp .highlight .nn{color:#000}.type-csharp .highlight .s{color:#a31515}.type-csharp .highlight .sc{color:#a31515}
</style>
</head>
<body class="normal">
<div id="wrapper">
<h1 id="scaldingworkshopreadme">Scalding Workshop README</h1>
<p><em>Copyright (C) 2010-2014 Think Big Analytics, Inc. All Rights Reserved.</em></p>
<p><strong>StrangeLoop 2012</strong><br/>
<strong>Dean Wampler, Think Big Analytics</strong><br/>
<a href="&#109;&#x61;&#105;&#108;&#x74;&#111;&#x3a;&#100;&#x65;&#97;&#110;&#64;&#100;&#101;&#97;&#x6e;&#119;&#97;&#x6d;&#112;&#x6c;&#x65;&#114;&#x2e;&#x63;&#x6f;&#109;">&#x64;&#x65;&#x61;&#x6e;&#x40;&#100;&#101;&#97;&#110;&#119;&#97;&#x6d;&#x70;&#108;&#x65;&#x72;&#46;&#99;&#x6f;&#x6d;</a><br/>
<a href="https://twitter.com/deanwampler">@deanwampler</a><br/>
<a href="http://thinkbiganalytics.com">Hire Us!</a></p>
<h2 id="aboutthisworkshop">About this Workshop</h2>
<p>This workshop is a half-day tutorial on Scalding and its place in the Hadoop ecosystem. <a href="https://github.com/twitter/scalding">Scalding</a> is a Scala API developed at Twitter for distributed data programming that uses the <a href="http://www.cascading.org/">Cascading</a> Java API, which in turn sits on top of Hadoop&#8217;s Java API. However, Scalding, through Cascading, also offers a <em>local</em> mode that makes it easy to run jobs without using the Hadoop libraries, for simpler testing and learning. We&#8217;ll use this feature for most of this workshop.</p>
<h2 id="gettingstarted">Getting Started</h2>
<p>To keep the setup process as simple as possible, the workshop git repo contains a pre-built jar that bundles Scalding v0.7.3 for Scala v2.9.2 and other required jars, such as <code>Cascading</code>, <code>Hadoop</code> <em>core</em>, <code>Log4J</code>, etc. So, all you need to install is Java, Scala, Ruby, and this workshop.</p>
<p>It helps to pick a work directory where you will install some of the packages. In what follows, we&#8217;ll assume you&#8217;re using <code>$HOME/fun</code> on Linux, Mac OSX, or Cygwin for Windows with the <code>bash</code> shell (or a similar shell) or you are using <code>C:\fun</code> on Windows.</p>
<h3 id="git">Git</h3>
<p>You&#8217;ll need Git to clone the workshop repository and, optionally, for other installs. See the <a href="http://git-scm.com/book/en/Getting-Started-Installing-Git">Git installation guide</a> for details. As an alternative, you can download a workshop release from its GitHub repo rather than cloning it.</p>
<h3 id="thisworkshop">This Workshop</h3>
<p>Download or clone this <a href="https://github.com/thinkbiganalytics/scalding-workshop">workshop from GitHub</a>.</p>
<p>To clone this workshop from GitHub using <code>bash</code>:</p>
<pre><code>cd $HOME/fun
git clone https://github.com/thinkbiganalytics/scalding-workshop
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun
git clone https://github.com/thinkbiganalytics/scalding-workshop
</code></pre>
<p>Or, simply <a href="https://github.com/ThinkBigAnalytics/scalding-workshop/downloads">download a release</a>.</p>
<h3 id="javav1.6orbetter">Java v1.6 or Better</h3>
<p>Install Java if necessary from <a href="http://www.java.com/en/download/help/download_options.xml">here</a>.</p>
<h3 id="scalav2.9.2">Scala v2.9.2</h3>
<p>Scalding uses Scala v2.9.2. Install it from <a href="http://www.scala-lang.org/downloads">here</a>.</p>
<h3 id="rubyv1.8.7orv1.9.x">Ruby v1.8.7 or v1.9.X</h3>
<p>Ruby is used as a platform-independent language for driver scripts by Scalding and we&#8217;ve followed the same convention. See <a href="http://ruby-lang.org">ruby-lang.org</a> for details on installing Ruby. Either version 1.8.7 or 1.9.X will work.</p>
<h2 id="sanitycheck">Sanity Check</h2>
<p>Once you&#8217;ve completed these steps, run the following commands as a sanity check to ensure that everything is set up properly. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding-workshop
./run.rb scripts/SanityCheck0.scala
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding-workshop
ruby run.rb scripts/SanityCheck0.scala
</code></pre>
<p>The commands should run without error. Note that it takes a moment to compile the Scala script and run to completion. The output is written to <code>output/SanityCheck0.txt</code>. What&#8217;s in that file?</p>
<h2 id="optionalinstalls">Optional Installs</h2>
<p>If you&#8217;re serious about using Scalding, you should clone and build the Scalding repo. We&#8217;ll talk briefly about it in the workshop, but it isn&#8217;t required.</p>
<h3 id="sbtv0.11">SBT v0.11</h3>
<p>SBT is the <em>de facto</em> build tool for Scala. You&#8217;ll need it to build Scalding. Follow these <a href="https://github.com/harrah/xsbt/wiki/Getting-Started-Setup">installation instructions</a>.</p>
<h3 id="scaldingfromgithub">Scalding from GitHub</h3>
<p>Clone <a href="https://github.com/twitter/scalding">Scalding from GitHub</a>. Using <code>bash</code>:</p>
<pre><code>cd $HOME/fun
git clone https://github.com/twitter/scalding.git
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun
git clone https://github.com/twitter/scalding.git
</code></pre>
<h3 id="buildscalding">Build Scalding</h3>
<p>Build Scalding according to its <a href="https://github.com/twitter/scalding/wiki/Getting-Started">Getting Started</a> page. Here is a synopsis of the steps. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding
sbt update
sbt assembly
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding
sbt update
sbt assembly
</code></pre>
<p>(The Getting Started page says to build the <code>test</code> target between <code>update</code> and <code>assembly</code>, but the latter builds <code>test</code> itself.)</p>
<h3 id="scaldingsanitycheck">Sanity Check</h3>
<p>Once you&#8217;ve built Scalding, run the following command as a sanity check to ensure everything is set up properly. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding
scripts/scald.rb --local tutorial/Tutorial0.scala
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding
ruby scripts\scald.rb --local tutorial/Tutorial0.scala
</code></pre>
<h2 id="nextsteps">Next Steps</h2>
<p>The Workshop/Tutorial proper is described in the companion <a href="https://github.com/thinkbiganalytics/scalding-workshop/blob/master/Workshop.html">Workshop document</a>.</p>
<h2 id="notesonreleases">Notes on Releases</h2>
<h3 id="v0.2.1">V0.2.1</h3>
<p>Added missing file to distribution. Refined the run scripts to work better with different Java versions.</p>
<h3 id="v0.2">V0.2</h3>
<p>Refined several exercises and fixed bugs. Added <code>Makefile</code> for building releases.</p>
<h3 id="v0.1">V0.1</h3>
<p>First release for StrangeLoop 2012 workshop.</p>
<h2 id="forfurtherinformation">For Further Information</h2>
<p>See the <a href="https://github.com/twitter/scalding">Scalding GitHub page</a> for more information about Scalding. The <a href="https://github.com/twitter/scalding/wiki">wiki</a> is very useful.</p>
<p><a href="&#109;&#x61;&#x69;&#108;&#116;&#x6f;&#58;&#x64;&#x65;&#x61;&#x6e;&#64;&#100;&#x65;&#97;&#x6e;&#119;&#x61;&#x6d;&#112;&#x6c;&#101;&#x72;&#x2e;&#x63;&#x6f;&#x6d;">&#x44;&#101;&#x61;&#x6e; &#87;&#x61;&#x6d;&#112;&#108;&#x65;&#114;</a> from <a href="http://thinkbiganalytics.com">Think Big Analytics</a> prepared this workshop. <a href="&#109;&#x61;&#x69;&#108;&#x74;&#111;&#x3a;&#100;&#101;&#97;&#110;&#x40;&#100;&#x65;&#97;&#x6e;&#x77;&#x61;&#x6d;&#112;&#108;&#101;&#x72;&#x2e;&#99;&#111;&#x6d;">&#67;&#111;&#110;&#x74;&#x61;&#x63;&#116; &#68;&#101;&#x61;&#110;</a> with questions about the workshop. For information about consulting and training on Scalding and other Hadoop-related topics, <a href="&#x6d;&#97;&#105;&#x6c;&#x74;&#111;&#58;&#105;&#x6e;&#x66;&#111;&#x40;&#116;&#x68;&#105;&#110;&#x6b;&#x62;&#105;&#103;&#97;&#x6e;&#x61;&#x6c;&#x79;&#116;&#x69;&#99;&#x73;&#x2e;&#x63;&#x6f;&#109;">&#115;&#101;&#x6e;&#x64; &#x75;&#115; &#101;&#109;&#97;&#105;&#108;</a>.</p>
<p>Some of the data used in these exercises was obtained from <a href="http://infochimps.com">InfoChimps</a>.</p>
<p><strong>Dean Wampler</strong><br/>
<a href="&#x6d;&#x61;&#x69;&#x6c;&#x74;&#x6f;&#x3a;&#x64;&#101;&#97;&#x6e;&#x40;&#100;&#x65;&#97;&#x6e;&#119;&#x61;&#109;&#x70;&#108;&#x65;&#114;&#46;&#x63;&#x6f;&#109;">&#100;&#x65;&#97;&#110;&#x40;&#x64;&#x65;&#97;&#110;&#119;&#97;&#x6d;&#112;&#x6c;&#101;&#x72;&#x2e;&#99;&#111;&#109;</a><br/>
<a href="https://twitter.com/deanwampler">@deanwampler</a><br/></p>
</div>
</body>
</html>