# https://mrjob.readthedocs.io/en/latest/guides/writing-mrjobs.html

In [81]:
%%file rc.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# Token pattern: one or more word characters, dots or hyphens. Any other
# character (a comma, for instance) ends the token, so findall() breaks a
# line into its individual fields.
DATA_RE = re.compile(r"[\w.-]+")


class MRProb2_3(MRJob):
    """Compute the average sepal width of the Iris-setosa records.

    Single-step job: the mapper emits the sepal width of every input line
    that contains an ``Iris-setosa`` token, and the reducer averages the
    emitted values.
    """

    # Number of decimal places kept in the reported average.
    AVG_DECIMALS = 3

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        return [
            MRStep(mapper=self.mapper_get_sepW_setosa,
                   reducer=self.reducer_get_avg)
        ]

    def mapper_get_sepW_setosa(self, _, line):
        """Emit the sepal width of each Iris-setosa line.

        Args:
            _: Input key; ignored.
            line: One raw line of the input.

        Yields:
            ``("sepal width", value)`` where ``value`` is the second token
            of the line converted to ``float``.
        """
        data = DATA_RE.findall(line)
        if "Iris-setosa" in data:
            # The second token is the field this job labels "sepal width".
            sep_W = float(data[1])
            yield ("sepal width", sep_W)

    def reducer_get_avg(self, key, values):
        """Average every value emitted for ``key``.

        Args:
            key: The mapper's key; not used in the output.
            values: Iterable of the sepal widths emitted by the mapper.

        Yields:
            ``("setosa sepal width avg", mean)`` with the mean rounded to
            ``AVG_DECIMALS`` decimal places.
        """
        size, total = 0, 0
        for val in values:
            size += 1
            total += val
        # Round the mean, not the running total: rounding the sum first would
        # discard precision for inputs with more than one decimal place.
        yield ("setosa sepal width avg", round(total / size, self.AVG_DECIMALS))
# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MRProb2_3.run()


Overwriting rc.py


In [82]:
!python rc.py iris.data

"setosa sepal width avg"	3.418


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\rossm\AppData\Local\Temp\rc.rossm.20210220.162843.353679
Running step 1 of 1...
job output is in C:\Users\rossm\AppData\Local\Temp\rc.rossm.20210220.162843.353679\output
Streaming final output from C:\Users\rossm\AppData\Local\Temp\rc.rossm.20210220.162843.353679\output...
Removing temp directory C:\Users\rossm\AppData\Local\Temp\rc.rossm.20210220.162843.353679...


In [73]:
%%file mr_word_count.py 

from mrjob.job import MRJob

class MRWordFrequencyCount(MRJob):
    """Total up the characters, words and lines of the input."""

    def mapper(self, _, line):
        """Emit one partial count per metric for a single input line.

        The "chars" count is ``len(line)`` of the line as received, the
        "words" count is the number of whitespace-separated pieces, and
        every line contributes 1 to "lines".
        """
        per_line_counts = (
            ("chars", len(line)),
            ("words", len(line.split())),
            ("lines", 1),
        )
        for metric, amount in per_line_counts:
            yield metric, amount

    def reducer(self, key, values):
        """Add up all partial counts emitted for ``key``."""
        grand_total = sum(values)
        yield key, grand_total


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting mr_word_count.py


In [83]:
!python mr_word_count.py iris.data hamlet.txt

"chars"	191667
"lines"	4609
"words"	32105


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\rossm\AppData\Local\Temp\mr_word_count.rossm.20210220.163703.408679
Running step 1 of 1...
job output is in C:\Users\rossm\AppData\Local\Temp\mr_word_count.rossm.20210220.163703.408679\output
Streaming final output from C:\Users\rossm\AppData\Local\Temp\mr_word_count.rossm.20210220.163703.408679\output...
Removing temp directory C:\Users\rossm\AppData\Local\Temp\mr_word_count.rossm.20210220.163703.408679...


In [84]:
%%file mr_word_common.py 
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

# A word is a run of word characters and apostrophes, so a contraction such
# as "don't" is matched as a single word.
WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):
    """Two-step job that reports the most frequently used word and its count."""

    def steps(self):
        """Describe the pipeline: count every word, then pick the largest count."""
        count_step = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        pick_max_step = MRStep(reducer=self.reducer_find_max_word)
        return [count_step, pick_max_step]

    def mapper_get_words(self, _, line):
        """Emit ``(lowercased word, 1)`` for each word matched in ``line``."""
        for match in WORD_RE.finditer(line):
            yield (match.group().lower(), 1)

    def combiner_count_words(self, word, counts):
        """Pre-sum the counts seen so far for ``word`` (an optimization)."""
        partial_total = sum(counts)
        yield (word, partial_total)

    def reducer_count_words(self, word, counts):
        """Emit ``(None, (total, word))`` for ``word``.

        All pairs share the key ``None`` so they reach the same reducer in
        the next step; the count is placed first so that ``max()`` orders
        the pairs by it.
        """
        pair = (sum(counts), word)
        yield None, pair

    def reducer_find_max_word(self, _, word_count_pairs):
        """Yield the largest ``(count, word)`` pair; the key is discarded.

        The yielded pair is consumed as key=count, value=word.
        """
        top_pair = max(word_count_pairs)
        yield top_pair


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MRMostUsedWord.run()

Writing mr_word_common.py


In [85]:
!python mr_word_common.py iris.data hamlet.txt

1086	"the"


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\rossm\AppData\Local\Temp\mr_word_common.rossm.20210220.164228.179195
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\rossm\AppData\Local\Temp\mr_word_common.rossm.20210220.164228.179195\output
Streaming final output from C:\Users\rossm\AppData\Local\Temp\mr_word_common.rossm.20210220.164228.179195\output...
Removing temp directory C:\Users\rossm\AppData\Local\Temp\mr_word_common.rossm.20210220.164228.179195...
