# MapReduce
The following Jupyter Notebook will:
1. Download the PDF file from my google drive (using code from my [Medium Article](https://towardsdatascience.com/different-ways-to-connect-google-drive-to-a-google-colab-notebook-pt-1-de03433d2f7a))
2. Select a chapter from the PDF file.
3. Select the pages from 22 to 32 and save them in file1.txt
4. Select the pages from 94 to 104 and save them in file2.txt
5. Create a mapper.py file for the Map in **Map**Reduce
6. Create a reducer.py file for the Reduce in Map**Reduce**
7. Create a mapper_english.py file for mapping non-English words.
  - Using NLTK - A natural Language library
8. Run MapReduce on file1.txt and save the result in a new output1.txt
9. Run the non-English MapReduce on file2.txt and save the result in a new output2.txt
10. Combine both files and create a pdf.

## Import Libraries

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import PyPDF2
from fpdf import FPDF

## 1. Download PDF from Google Drive

In [4]:
# Authenticate through Colab, then hand the application-default credentials
# to PyDrive so the Drive client below can act on the user's behalf.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# NOTE(review): '{FILE_ID}' looks like a placeholder - presumably it has to be
# replaced with the real Drive file ID before running; confirm.
file_id = '{FILE_ID}'
downloaded = drive.CreateFile({'id': file_id})
# Save the file's content locally under the name the chapter-selection cell opens.
downloaded.GetContentFile('Harry_Potter.pdf')

## 2. Select Chapter #3

In [5]:
def select_chapter(pdf_file, chapter_start_page, chapter_end_page):
    """Copy an inclusive range of pages from a PDF into a new PDF writer.

    Args:
        pdf_file: Path of the source PDF file.
        chapter_start_page: Index of the first page to copy. It is used
            directly as an index into ``PdfReader.pages``, so it is a
            position in the file, not a printed page number.
        chapter_end_page: Index of the last page to copy (inclusive).

    Returns:
        A ``PyPDF2.PdfWriter`` containing the selected pages. Nothing is
        written to disk here; the caller calls ``write()`` on the result.
    """
    import io

    # Read the whole file into memory before parsing. The writer is returned
    # (and written out) after this function has closed the file, so the pages
    # must not depend on a file handle that is no longer open.
    with open(pdf_file, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(f.read()))

    selected_chapter_pdf = PyPDF2.PdfWriter()

    # range() excludes its stop value, hence the +1 to include the end page.
    for page_number in range(chapter_start_page, chapter_end_page + 1):
        selected_chapter_pdf.add_page(pdf_reader.pages[page_number])

    return selected_chapter_pdf

In [6]:
pdf_file = 'Harry_Potter.pdf'
# Inclusive bounds handed to select_chapter, which uses them as indices into
# pdf_reader.pages (positions in the file, not printed page numbers).
chapter_start_page = 1085
chapter_end_page = 1806

selected_chapter_pdf = select_chapter(pdf_file, chapter_start_page,chapter_end_page)

# Persist the extracted range; the text-extraction cells reopen it as chapter3.pdf.
with open('chapter3.pdf', 'wb') as f:
    selected_chapter_pdf.write(f)

## 3.  Select from page 22 to 32 and create file1.txt

In [7]:
# Inclusive page bounds, used as indices into reader.pages of chapter3.pdf.
start_page = 22
end_page = 32

# Open the PDF before creating file1.txt so a failure to read the chapter
# does not leave behind a freshly truncated, empty output file.
reader = PyPDF2.PdfReader("chapter3.pdf")

with open('file1.txt', 'w') as f:
    for page_number in range(start_page, end_page + 1):
        page = reader.pages[page_number]
        # End every page with a newline: otherwise the last word of one page
        # and the first word of the next are written back to back and the
        # mapper counts them as a single (wrong) word.
        f.write(page.extract_text() + "\n")

## 4.  Select from page 94 to 104 and create file2.txt

In [8]:
# Inclusive page bounds, used as indices into reader.pages of chapter3.pdf.
start_page = 94
end_page = 104

# Open the PDF before creating file2.txt so a failure to read the chapter
# does not leave behind a freshly truncated, empty output file.
reader = PyPDF2.PdfReader("chapter3.pdf")

with open('file2.txt', 'w') as f:
    for page_number in range(start_page, end_page + 1):
        page = reader.pages[page_number]
        # End every page with a newline: otherwise the last word of one page
        # and the first word of the next are written back to back and the
        # mapper counts them as a single (wrong) word.
        f.write(page.extract_text() + "\n")

## 5. Map python file creation

In [9]:
%%file mapper.py

import sys
import re

# Map step: read text from stdin and emit one "<word><TAB> 1" record per word
# on stdout. Lower-casing makes the later count case-insensitive.
for raw_line in sys.stdin:
    normalized = raw_line.strip().lower()
    for token in re.findall(r'\w+\b', normalized):
        print (f'{token}\t 1')

Writing mapper.py


## 6. Reduce python file creation

In [10]:
%%file reducer.py

from operator import itemgetter
import sys


def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    Args:
        lines: Iterable of "<word><TAB><count>" strings. Records for the same
            word must be adjacent (the pipeline guarantees this by running
            ``sort`` before the reducer).

    Yields:
        ``(word, total)`` tuples, one per run of equal words, in input order.
        Lines without a tab or with a non-integer count are skipped.
    """
    current_word = None
    current_count = 0

    for line in lines:
        word, separator, count = line.partition('\t')
        if not separator:
            # No tab at all: not a mapper record, ignore it instead of crashing.
            continue

        try:
            # int() tolerates the surrounding whitespace/newline of the field.
            count = int(count)
        except ValueError:
            # Malformed count: skip the line without touching the running group.
            continue

        if word == current_word:
            current_count += count
        else:
            # A new word starts: flush the finished group (empty keys are
            # never emitted, as before).
            if current_word:
                yield current_word, current_count
            current_word = word
            current_count = count

    # Flush the last group. Checking the group itself (rather than comparing
    # with the last word read) avoids emitting a bogus "None" record for empty
    # input and avoids losing the group when the final line was skipped.
    if current_word:
        yield current_word, current_count


def main(argv=None, stdin=None):
    """Write a title line followed by the reduced word counts to a file.

    Args:
        argv: Argument list; ``argv[1]`` is the output path and ``argv[2]``
            the title written on the first line. Defaults to ``sys.argv``.
        stdin: Iterable of input lines. Defaults to ``sys.stdin``.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    output_path, title = argv[1], argv[2]

    # Open the output once instead of reopening it for every word.
    with open(output_path, 'w') as f:
        f.write(f"{title}\n")
        # Records are newline-separated with no newline after the last one,
        # matching the layout the script has always produced.
        f.write("\n".join(
            f'{word}\t{total}' for word, total in reduce_counts(stdin)
        ))


if __name__ == "__main__":
    main()

Writing reducer.py


## 7. Non-English mapper file creation


In [11]:
## Same as mapper.py, but uses the NLTK word list to emit only non-English words

%%file mapper_english.py

import subprocess
import sys
import re
import nltk
from nltk.corpus import words


# Fetch the NLTK "words" corpus used as the English dictionary.
nltk.download('words')

# The input tokens are lower-cased below, so the dictionary has to be
# lower-cased as well; otherwise entries the corpus lists only in capitalised
# form would never match and would be reported as non-English.
english_words = {entry.lower() for entry in words.words()}


# Map step: emit "<word><TAB> 1" on stdout for every token that is not in the
# English dictionary.
for line in sys.stdin:
    line = line.strip().lower()
    # Named `tokens` so the imported `words` corpus is not shadowed.
    tokens = re.findall(r'\w+\b', line)

    for word in tokens:
        if word not in english_words:
            print (f'{word}\t 1')

Writing mapper_english.py


### Making the files executable!

In [12]:
# Set the execute bit on each script so the pipeline cells can invoke them as ./<script>.py
!chmod +x mapper.py

In [13]:
!chmod +x reducer.py

In [14]:
!chmod +x mapper_english.py

## 8. Running MapReducer for file1.txt



In [15]:
# Pipeline: mapper.py prints one "<word><TAB> 1" record per word, `sort` places
# equal words next to each other, and reducer.py writes the title line
# (MapReduce_All) plus the per-word totals into output1.txt.
# reducer.py prints nothing to stdout, so the trailing `| sort` receives no input.
# NOTE(review): the scripts as written by the %%file cells show no shebang line;
# if ./mapper.py is not run by Python in your environment, call `python3 mapper.py` - confirm.
!cat file1.txt | ./mapper.py | sort | ./reducer.py output1.txt MapReduce_All| sort

## 9. Running MapReducer for file2.txt

In [16]:
# Same pipeline as for file1.txt, but with mapper_english.py, which only emits
# words missing from the NLTK word list; reducer.py writes the title line
# (MapReduce_Non_English_NLTK) plus the per-word totals into output2.txt.
# reducer.py prints nothing to stdout, so the trailing `| sort` receives no input.
# NOTE(review): the scripts as written by the %%file cells show no shebang line;
# if ./mapper_english.py is not run by Python in your environment, call it via `python3` - confirm.
!cat file2.txt | ./mapper_english.py | sort | ./reducer.py output2.txt MapReduce_Non_English_NLTK| sort

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


## 10. Saving as a PDF


In [17]:
pdf = FPDF()
pdf.add_page()

pdf.set_font("Arial", size = 12)

# (section heading, reducer output file) pairs, in the order they appear in the PDF.
sections = [
    ('MapReduce File 1', "output1.txt"),
    ('MapReduce File 2 - Non English', "output2.txt"),
]

for heading, path in sections:
    # Centered heading, then one left-aligned cell per line of the file.
    pdf.cell(200, 15, txt = heading, ln = 1, align = 'C')
    # The context manager closes each file once its lines have been added.
    with open(path, "r") as f:
        for x in f:
            # Drop the line terminator so it is not embedded in the cell text.
            pdf.cell(200, 10, txt = x.rstrip('\n'), ln = 1, align = 'L')

pdf.output("Final_output.pdf")

''