Let's first insert some data in the HDFS

In [1]:
!hdfs dfs -mkdir -p /datasets
!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \
    -O ../datasets/shakespeare_all.txt
!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt
!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt
!hdfs dfs -ls /datasets

Found 2 items
-rw-r--r--   1 vagrant supergroup       1365 2016-05-10 19:58 /datasets/hadoop_git_readme.txt
-rw-r--r--   1 vagrant supergroup    5589889 2016-05-10 19:58 /datasets/shakespeare_all.txt


## MR with Hadoop streaming

In [2]:
with open('mapper_hadoop.py', 'w') as fh:
    fh.write("""#!/usr/bin/env python

import sys

for line in sys.stdin:
    print "chars", len(line.rstrip('\\n'))
    print "words", len(line.split())
    print "lines", 1
    """)


with open('reducer_hadoop.py', 'w') as fh:
    fh.write("""#!/usr/bin/env python

import sys

counts = {"chars": 0, "words":0, "lines":0}

for line in sys.stdin:
    kv = line.rstrip().split()
    counts[kv[0]] += int(kv[1])

for k,v in counts.items():
    print k, v
    """) 

In [3]:
!chmod a+x *_hadoop.py

In [4]:
!cat ../datasets/hadoop_git_readme.txt | ./mapper_hadoop.py | sort -k1,1 | ./reducer_hadoop.py

chars 1335
lines 31
words 179


In [5]:
!hdfs dfs -mkdir -p /tmp
!hdfs dfs -rm -f -r /tmp/mr.out

!hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \
-files mapper_hadoop.py,reducer_hadoop.py \
-mapper mapper_hadoop.py -reducer reducer_hadoop.py \
-input /datasets/hadoop_git_readme.txt -output /tmp/mr.out



16/05/10 19:58:48 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /tmp/mr.out
packageJobJar: [/tmp/hadoop-unjar5384590696382062055/] [] /tmp/streamjob1965588122940844531.jar tmpDir=null
16/05/10 19:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
16/05/10 19:58:51 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
16/05/10 19:58:51 INFO mapred.FileInputFormat: Total input paths to process : 1
16/05/10 19:58:51 INFO mapreduce.JobSubmitter: number of splits:2
16/05/10 19:58:52 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1462906052477_0019
16/05/10 19:58:52 INFO impl.YarnClientImpl: Submitted application application_1462906052477_0019
16/05/10 19:58:52 INFO mapreduce.Job: The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0019/
16/05/10 19:58:52 INFO mapreduce.Job: Running job: job_1462906052477_0019
16/05/10 19:58:58 INFO map

In [6]:
!hdfs dfs -ls /tmp/mr.out

Found 2 items
-rw-r--r--   1 vagrant supergroup          0 2016-05-10 19:59 /tmp/mr.out/_SUCCESS
-rw-r--r--   1 vagrant supergroup         33 2016-05-10 19:59 /tmp/mr.out/part-00000


In [7]:
!hdfs dfs -cat /tmp/mr.out/part-00000

chars 1335	
lines 31	
words 179	


## MR with Python MrJob library

In [8]:
with open("MrJob_job1.py", "w") as fh:
    fh.write("""
from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)


if __name__ == '__main__':
    MRWordFrequencyCount.run()    
    """)

In [9]:
!python MrJob_job1.py ../datasets/hadoop_git_readme.txt

No configs found; falling back on auto-configuration
Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984
Running step 1 of 1...
Streaming final output from /tmp/MrJob_job1.vagrant.20160510.195920.590984/output...
"chars"	1335
"lines"	31
"words"	179
Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984...


In [10]:
!python MrJob_job1.py -r hadoop hdfs:///datasets/hadoop_git_readme.txt

No configs found; falling back on auto-configuration
Looking for hadoop binary in /usr/local/hadoop/bin...
Found hadoop binary: /usr/local/hadoop/bin/hadoop
Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616
Using Hadoop version 2.6.4
Copying local files to hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/files/...
Looking for Hadoop streaming jar in /usr/local/hadoop...
Found Hadoop streaming jar: /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar
Running step 1 of 1...
  packageJobJar: [/tmp/hadoop-unjar7634308048659876233/] [] /tmp/streamjob5879999650692493094.jar tmpDir=null
  Connecting to ResourceManager at /0.0.0.0:8032
  Connecting to ResourceManager at /0.0.0.0:8032
  Total input paths to process : 1
  number of splits:2
  Submitting tokens for job: job_1462906052477_0020
  Submitted application application_1462906052477_0020
  The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0020/
  Run

In [11]:
with open("MrJob_job2.py", "w") as fh:
    fh.write("""
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):

    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words,
                   reducer=self.reducer_count_words),
            MRStep(mapper=self.mapper_word_count_one_key,
                   reducer=self.reducer_find_max_word)
        ]

    def mapper_get_words(self, _, line):
        # yield each word in the line
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def reducer_count_words(self, word, counts):
        # send all (num_occurrences, word) pairs to the same reducer.
        yield (word, sum(counts))
    
    def mapper_word_count_one_key(self, word, counts):
        # send all the tuples to same reducer
        yield None, (counts, word)

    def reducer_find_max_word(self, _, count_word_pairs):
        # each item of word_count_pairs is a tuple (count, word),
        yield max(count_word_pairs)


if __name__ == '__main__':
    MRMostUsedWord.run()
""")

In [12]:
# This time is running on a big dataset
!python MrJob_job2.py --quiet ../datasets/shakespeare_all.txt

27801	"the"


In [13]:
!python MrJob_job2.py -r hadoop --quiet hdfs:///datasets/shakespeare_all.txt

27801	"the"
