In [2]:
import os
from langchain_codebase.codebase import *
import pandas as pd
import pickle

# Move the working directory up one level (presumably so the relative "src\..." paths
# used in later cells resolve — TODO confirm).
# NOTE(review): this is not idempotent; each re-run of the cell climbs one more level.
os.chdir("..")

### Basic Processing
Approaching the problem

In [None]:
# Location of the textbook PDF, relative to the current working directory.
# Built with os.path.join so the separator matches the host OS; on Windows this
# yields exactly the same string as the previous backslash literal.
BOOK_PATH = os.path.join("src", "books", "Physics9EM.pdf")

# Prompt template asking a model for the unit number contained in a page chunk.
# The `{page_chunk}` placeholder must be filled in before the prompt is sent.
UNIT_VALUE_EXTRACTION_PROMPT = """
Following is a chunk of a student's book. Extract nothing but the value of the Unit number from it.
Chunk: {page_chunk}
"""

In [None]:
# Load the book PDF into `docs`. Later cells read `.page_content` and
# `.metadata['page']` from each entry of this sequence.
docs = load_single_pdf(BOOK_PATH)

Exception: Path doesn't exist. Provided path: src\books\Physics9EM.pdf

In [None]:
# Show how many entries were loaded from the PDF.
len(docs)

208

In [None]:
# NOTE(review): module-level value; no visible code reads it, because the helper
# defined in this cell takes its own `page_no` parameter.
page_no = 0

def find_Unit_index_and_get_required_content(page_no, pages=None, context_chars=20):
    """Locate the first "Unit " marker on a page and return the text around it.

    Args:
        page_no: Index of the page to inspect.
        pages: Sequence of objects exposing a ``page_content`` string. When
            omitted, the notebook-level ``docs`` list is used.
        context_chars: Number of characters to keep before the marker and
            from the marker's first character onwards.

    Returns:
        A ``(unit_index, snippet)`` tuple. ``unit_index`` is the position of
        the marker in the page text, or -1 when the page has no marker, in
        which case ``snippet`` is the empty string.
    """
    if pages is None:
        pages = docs

    page_content = pages[page_no].page_content
    Unit_index = page_content.find('Unit ')

    # A missing marker used to feed -1 into the slice bounds and return
    # unrelated text from the end of the page.
    if Unit_index == -1:
        return Unit_index, ""

    # Clamp at 0 so pages with fewer than `context_chars` characters before
    # the marker still contribute whatever leading context exists.
    s_index = max(0, Unit_index - context_chars)
    required_content = page_content[s_index:Unit_index + context_chars]

    return Unit_index, required_content


In [None]:
# checking if all the pages have the string "Unit "

# for page_no in range(len(docs)):
#     Unit_index, content = find_Unit_index_and_get_required_content(page_no)
#     print(f"\nPage no: {page_no}, Unit index: {Unit_index}")

# Result: every page contains the string "Unit ".

In [None]:
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

def load_ollama_model(model: str = 'gemma2:2b'):
    """Create an ``OllamaLLM`` instance for the requested model.

    Args:
        model: Model identifier passed to ``OllamaLLM`` as ``model``.

    Returns:
        The constructed ``OllamaLLM`` object.
    """
    ollama_llm = OllamaLLM(model=model)
    return ollama_llm

def get_response_from_model(llm, prompt):
    """Pass ``prompt`` to ``llm.invoke`` and hand back its result unchanged.

    Args:
        llm: Any object exposing an ``invoke(prompt)`` method.
        prompt: The prompt forwarded to the model.

    Returns:
        Whatever ``llm.invoke(prompt)`` returns.
    """
    response = llm.invoke(prompt)
    return response

def get_quiz_generator_prompt():
    """Build the ``PromptTemplate`` used for quiz generation.

    The template text comes from ``QUIZ_GENERATOR_PROMPT`` (not defined in the
    visible cells; presumably supplied by the star import — TODO confirm) and
    declares the input variables ``no_questions`` and ``book_content``.

    Returns:
        The configured ``PromptTemplate``.
    """
    return PromptTemplate(
        template=QUIZ_GENERATOR_PROMPT,
        input_variables=['no_questions', 'book_content'],
    )

In [None]:
class DocumentProcessor:
    """Pull the page number, unit number and text out of loaded book pages.

    The unit number is whatever follows the first "Unit " marker on a page, up
    to the first terminator listed in ``UNIT_VAL_ENDING_VALS``.
    """

    # Marker text that immediately precedes the unit value.
    UNIT_WORD = "Unit "
    # How many characters to keep before the marker and after its end.
    EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD = 20
    # Characters that terminate the unit value.
    UNIT_VAL_ENDING_VALS = [":", "\n"]

    def extract_details_from_docs_as_df(self, docs, book_name):
        """Build a DataFrame with one row per entry of ``docs``.

        Args:
            docs: Iterable of objects exposing ``metadata['page']`` and
                ``page_content``.
            book_name: Value stored in the ``book_name`` column of every row.

        Returns:
            A DataFrame with the columns ``book_name``, ``page_no``,
            ``unit_no`` and ``content``. The number of pages whose unit value
            is not numeric is printed as a side effect.
        """
        pages_with_no_unit_info = 0
        book_data = {
            'book_name': [],
            'page_no': [],
            'unit_no': [],
            'content': []
        }
        for doc in docs:
            page_no, unit_no, content = self.extract_page_no_unit_no_page_content_from_doc(doc)
            book_data['page_no'].append(page_no)
            book_data['unit_no'].append(unit_no)
            book_data['content'].append(content)
            book_data['book_name'].append(book_name)

            if not unit_no.isnumeric():
                pages_with_no_unit_info += 1

        print(f"Pages with no unit info: {pages_with_no_unit_info}")
        return pd.DataFrame(book_data)

    def extract_page_no_unit_no_page_content_from_doc(self, doc):
        """Return ``(page_no, unit_no, page_content)`` for a single page object."""
        page_no = doc.metadata['page']
        page_content = doc.page_content
        unit_no = self.extract_unit_no_from_content(page_content)

        return (page_no, unit_no, page_content)

    def extract_unit_no_from_content(self, content: str):
        """Return the unit value found after the first "Unit " marker.

        Returns the empty string when the page has no marker. Surrounding
        whitespace is stripped so values such as "5 " still count as numeric.
        """
        print("Extracting unit no...")
        chunk = self.extract_chunk_containing_unit_value(content)
        if not chunk:
            return ""

        unit_val_start_index = self.extract_unit_val_start_index(chunk)
        # Only look for a terminator after the marker: a ":" or newline in the
        # leading context must not cut the value short.
        unit_val_end_index = self.extract_unit_val_end_index(chunk, unit_val_start_index)

        return chunk[unit_val_start_index:unit_val_end_index].strip()

    def extract_chunk_containing_unit_value(self, content: str):
        """Cut out the slice of ``content`` surrounding the first marker.

        Returns the empty string when ``content`` is empty or has no marker.
        """
        if not content:
            return ""

        unit_word_index_in_content = content.find(self.UNIT_WORD)
        if self.is_found_invalid_index(unit_word_index_in_content):
            return ""

        chunk_start_index = self.get_chunk_start_index(unit_word_index_in_content)
        chunk_end_index = self.get_chunk_end_index(unit_word_index_in_content)

        return content[chunk_start_index:chunk_end_index]

    def get_chunk_start_index(self, unit_word_index):
        """Return where the chunk starts: up to the configured number of
        characters before the marker, clamped at the start of the content."""
        return max(0, unit_word_index - self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD)

    def required_extra_chars_available_before_unit_word_index(self, unit_word_index):
        """Tell whether the full leading context fits before the marker.

        Uses ``>=`` because a marker at index N has exactly N characters
        before it.
        """
        return unit_word_index >= self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD

    def get_chunk_end_index(self, unit_word_index):
        """Return the (possibly out-of-range) index where the chunk ends.

        The result may exceed the content length; slicing stops at the end of
        the string in that case, so no clamping is needed here.
        """
        unit_word_end_index = unit_word_index + len(self.UNIT_WORD)
        return unit_word_end_index + self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD

    def extract_unit_val_start_index(self, chunk: str):
        """Return the index just past the marker inside ``chunk``.

        When the chunk has no marker, ``len(chunk)`` is returned so that any
        slice starting there is empty.
        """
        unit_word_index_in_chunk = chunk.find(self.UNIT_WORD)
        if self.is_found_invalid_index(unit_word_index_in_chunk):
            return len(chunk)
        return unit_word_index_in_chunk + len(self.UNIT_WORD)

    def extract_unit_val_end_index(self, chunk: str, search_from=None):
        """Return the index of the earliest terminator at or after ``search_from``.

        Args:
            chunk: Text slice containing the marker and the unit value.
            search_from: Index at which the unit value starts. Defaults to the
                position right after the marker.

        Returns:
            Position of the first terminator from ``UNIT_VAL_ENDING_VALS``
            found at or after ``search_from``, or ``len(chunk)`` if none.
        """
        if search_from is None:
            search_from = self.extract_unit_val_start_index(chunk)

        terminator_positions = [
            position
            for position in (chunk.find(val, search_from) for val in self.UNIT_VAL_ENDING_VALS)
            if not self.is_found_invalid_index(position)
        ]
        return min(terminator_positions) if terminator_positions else len(chunk)

    def is_found_invalid_index(self, index):
        """Return True when ``index`` is the -1 "not found" result of ``str.find``."""
        return index == -1

In [None]:
# Notebook-level DocumentProcessor instance.
# NOTE(review): the same name is assigned again in a later cell.
doc_processor = DocumentProcessor()



In [None]:
# # checking number of docs with no unit info and saving the data
# book_data = {
#     'page_no': [],
#     'unit_no': [],
#     'content': []
# }
# pages_with_no_unit_info = 0

# for doc in docs:
#     page_no, unit_no, page_content = doc_processor.extract_page_no_unit_no_page_content_from_doc(doc)

#     book_data['page_no'].append(page_no)
#     book_data['unit_no'].append(unit_no)
#     book_data['content'].append(page_content)
    
#     # print(unit_no)
#     if not unit_no.isnumeric():
#         pages_with_no_unit_info += 1

# print(f"Pages with no unit info: {pages_with_no_unit_info}")

Pages with no unit info: 5


In [None]:
# Module-level processor that extract_details_from_book_pdf_path_as_df reads as a global.
doc_processor = DocumentProcessor()
    
def process_book_pdf_and_save_df(book_pdf_path):
    """Extract per-page details from a book PDF and save them as a CSV file.

    The CSV is named after the PDF (``<book name>.csv``) and written relative
    to the current working directory.

    Args:
        book_pdf_path: Path to the PDF, using either ``\\`` or ``/`` separators.

    Raises:
        ValueError: If no details could be extracted, i.e. the extraction
            helper returned ``None`` instead of a DataFrame.
    """
    path_separator = detect_path_separator_in_path(book_pdf_path)
    file_name = book_pdf_path.split(path_separator)[-1]
    # splitext removes only the final extension, so a name such as
    # "Physics 9.1.pdf" keeps its full stem instead of being cut at the first dot.
    book_name = os.path.splitext(file_name)[0]
    book_save_path = book_name + ".csv"

    df = extract_details_from_book_pdf_path_as_df(book_pdf_path, book_name)
    if df is None:
        # Without this check the failure surfaced as an opaque AttributeError
        # on `None.to_csv`.
        raise ValueError(f"Could not extract any details from: {book_pdf_path}")
    df.to_csv(book_save_path, index=False)

    print(f"Book saved as: {book_save_path}")

def extract_details_from_book_pdf_path_as_df(book_pdf_path, book_name: str):
    """Load a PDF and convert its pages into a details DataFrame.

    Args:
        book_pdf_path: Path of the PDF to load.
        book_name: Name forwarded to the document processor.

    Returns:
        The DataFrame produced by the module-level ``doc_processor``, or
        ``None`` when ``is_valid_pdf_path`` rejects the path.
    """
    if not is_valid_pdf_path(book_pdf_path):
        return None

    loaded_pages = load_single_pdf(book_pdf_path)
    return doc_processor.extract_details_from_docs_as_df(loaded_pages, book_name)

def detect_path_separator_in_path(path):
    """Return the separator used by ``path``.

    A backslash is reported whenever the path contains one; otherwise the
    forward slash is returned, including for paths with no separator at all.
    """
    return "\\" if '\\' in path else '/'
    

In [None]:
# Build the path with os.path.join so it resolves on any OS; on Windows the
# result is identical to the previous backslash literal.
process_book_pdf_and_save_df(os.path.join("src", "books", "English 9.pdf"))

NameError: name 'process_book_pdf_and_save_df' is not defined