### Documentação MRJob -> https://mrjob.readthedocs.io/en/latest/index.html

### Job MapReduce para contar palavras

In [3]:
%%writefile mr_word_count.py
from mrjob.job import MRJob

class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)
        
if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting mr_word_count.py


In [4]:
!python mr_word_count.py file01.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_count.jeanalves.20201018.214917.376001
Running step 1 of 1...
job output is in /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_count.jeanalves.20201018.214917.376001/output
Streaming final output from /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_count.jeanalves.20201018.214917.376001/output...
"words"	123
"lines"	11
"chars"	824
Removing temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_count.jeanalves.20201018.214917.376001...


### Job MapReduce para retorna a palavra que mais aparece no texto

In [5]:
%%writefile mr_word_max.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):

    def mapper_get_words(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_count_words(self, word, counts):
        yield (word, sum(counts))

    def reducer_count_words(self, word, counts):
        yield None, (sum(counts), word)

    def reducer_find_max_word(self, _, word_count_pairs):
        yield max(word_count_pairs)

    def steps(self):
        return [
            MRStep(
                   mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words
                  ),
            MRStep(reducer=self.reducer_find_max_word)
        ]


if __name__ == '__main__':
    MRMostUsedWord.run()

Overwriting mr_word_max.py


In [6]:
!python mr_word_max.py -r local file01.txt

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_max.jeanalves.20201018.214924.787417
Running step 1 of 2...
Running step 2 of 2...
job output is in /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_max.jeanalves.20201018.214924.787417/output
Streaming final output from /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_max.jeanalves.20201018.214924.787417/output...
8	"de"
Removing temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_max.jeanalves.20201018.214924.787417...


### Como executar o MRJob
#### python sqlite_job.py -r local  --database=/etc/my_db.sqlite3
### python sqlite_job.py -r hadoop --database=/etc/my_db.sqlite3
### python sqlite_job.py -r hadoop --database=hdfs://my_dir/my_db.sqlite3
### python sqlite_job.py -r emr    --database=/etc/my_db.sqlite3
### python sqlite_job.py -r emr    --database=s3://my_bucket/my_db.sqlite3

## Exercicio 01 
### Faça um filtro que tire a palavra "de" e retorne a palavra que mais aparece no texto

In [11]:
%%writefile mr_word_filter.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):

    def mapper_get_words(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_count_words(self, word, counts):
        yield (word, sum(counts))

    def reducer_count_words(self, word, counts):
        yield None, (sum(counts), word)

    def reducer_find_max_word(self, _, word_count_pairs):
        yield max(word_count_pairs)

    def steps(self):
        return [
            MRStep(mapper_pre_filter='grep -v "de"',
                   mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words
                  ),
            MRStep(reducer=self.reducer_find_max_word)
        ]


if __name__ == '__main__':
    MRMostUsedWord.run()

Overwriting mr_word_filter.py


In [12]:
!python mr_word_filter.py -r local file01.txt

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_filter.jeanalves.20201018.215349.579366
Running step 1 of 2...
Running step 2 of 2...
job output is in /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_filter.jeanalves.20201018.215349.579366/output
Streaming final output from /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_filter.jeanalves.20201018.215349.579366/output...
1	"wick"
Removing temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_filter.jeanalves.20201018.215349.579366...


## Exercicio 02 
### Crie um job que retorne a maior palavra

In [34]:
%%writefile mr_word_len.py
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import TextValueProtocol
import re

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):
    
    OUTPUT_PROTOCOL = TextValueProtocol

    def mapper_get_words(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_count_words(self, word, counts):
        yield (word, len(word))

    def reducer_count_words(self, word, counts):
        yield None, (len(word), word)

    def reducer_find_max_word(self, _, word_count_pairs):
        yield max(word_count_pairs)

    def steps(self):
        return [
            MRStep(
                   mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words
                  ),
            MRStep(reducer=self.reducer_find_max_word)
        ]


if __name__ == '__main__':
    MRMostUsedWord.run()

Overwriting mr_word_len.py


In [36]:
!python mr_word_len.py -r local file01.txt > file02.txt

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_len.jeanalves.20201018.220703.856670
Running step 1 of 2...
Running step 2 of 2...
job output is in /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_len.jeanalves.20201018.220703.856670/output
Streaming final output from /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_len.jeanalves.20201018.220703.856670/output...
Removing temp directory /var/folders/11/8bxpvmg17yz34h8m15dhzk_r0000gp/T/mr_word_len.jeanalves.20201018.220703.856670...
