# Get Data 
> A notebook for downloading data from official sources and unzipping it into machine-readable formats

- toc: true 
- badges: false
- comments: true
- categories: [jupyter]
- author: Nirant Kasliwal and Meghana Bhange
<!-- - image: images/chart-preview.png -->

In [None]:
# hide
# !pip install requests
# !pip install pydantic
# !pip install tqdm
# !pip install pdfminer.six

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# hide_input
import json
from pathlib import Path
from typing import List, Union

import requests
from pydantic import BaseModel
from tqdm.notebook import tqdm

from textbook import Book, Chapter

In [None]:
# Notebook conveniences patched onto pathlib.Path:
#   .ls()    -> list of every entry directly inside the directory
#   .pdfls() -> list of only those entries whose suffix is exactly ".pdf"
Path.ls = lambda self: list(self.iterdir())
Path.pdfls = lambda self: [entry for entry in self.iterdir() if entry.suffix == ".pdf"]

### Get List of Books and Download Links

In [None]:
# collapse-hide
# Name of the sheet (inside the Stein-backed storage) to read the book list from.
sheet_name = "History"
books_list = (
    f"https://api.steinhq.com/v1/storages/5fd49704f62b6004b3eb63a3/{sheet_name}"
)
# requests has no default timeout, so without one an unreachable API would
# hang the notebook indefinitely. raise_for_status() reports an HTTP error
# here instead of letting an error payload reach the parsing cell below.
r = requests.get(books_list, timeout=30)
r.raise_for_status()

In [None]:
# collapse-hide
# Decode the JSON response body and build one Book from each entry's fields.
ncert_history_books = [Book(**entry) for entry in json.loads(r.text)]

## Download and Extract all Books

In [None]:
# collapse-show
# Download every book into the raw folder, then unzip it into the extract folder.
raw_dir, extract_dir = "../data/raw", "../data/extract"
for book in tqdm(ncert_history_books):
    book.download(raw_dir)
    book.unzip(extract_dir)

In [None]:
# Take the first book in the list; the following cell collects its PDFs.
single_book = ncert_history_books[0]

In [None]:
# Gather the PDFs from every folder under the book's extraction path, keeping
# only the chapter files (those whose file stem ends in digits), in sorted order.
pdf_files = sorted(
    pdf
    for folder in single_book.extract_to_path.ls()
    for pdf in folder.pdfls()
    if pdf.stem[-2:].isdigit()
)
pdf_files

In [None]:
# map chapter numbers
from io import StringIO
from typing import List

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def pdf_to_text(file: Path, output_io_wrapper: StringIO) -> str:
    """
    Extract the text of a PDF with pdfminer.six and return it as one string.

    ``PDFParser`` fetches PDF objects from the file stream, and
    ``PDFDocument`` cooperates with the parser so that data is loaded
    on demand as processing goes.

    ``PDFResourceManager`` facilitates reuse of shared resources such as
    fonts and images so that large objects are not allocated multiple times.

    ``line_margin=0.7`` is used because anything below that was treating a
    paragraph break as a separate text blob (bounding box).

    Args:
        file: Path of the PDF to read.
        output_io_wrapper: In-memory text buffer that ``TextConverter``
            writes the extracted text into.

    Returns:
        The full contents of ``output_io_wrapper`` once every page has been
        processed (including anything the buffer already held).
    """

    with open(file, "rb") as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        resource_manager = PDFResourceManager()
        text_converter = TextConverter(
            resource_manager, output_io_wrapper, laparams=LAParams(line_margin=0.7)
        )
        # Processor for the content of a PDF page
        interpreter = PDFPageInterpreter(resource_manager, text_converter)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    # Read back from the buffer that was passed in; the original referenced an
    # undefined name here.
    return output_io_wrapper.getvalue()


# TextConverter needs a text stream to write into, so each file gets a fresh
# StringIO that keeps its extracted text in memory. Swap in a file-backed
# TextIOWrapper to export the output directly to a file instead.
for file in pdf_files:
    output_io_wrapper = StringIO()
    plain_text = pdf_to_text(file, output_io_wrapper)
    print(plain_text)

In [None]:
# ncert_history_books[1].download()
# ncert_history_books[1].unzip()