In [207]:
from langchain_codebase.codebase import *
import pandas as pd
import pickle

### Basic Processing
Approaching the problem

In [70]:
# Location of the source textbook PDF, relative to the working directory.
# NOTE(review): written with Windows-style backslashes — confirm it resolves on other platforms.
BOOK_PATH = r"src\books\Physics9EM.pdf"
# Prompt template with a single {page_chunk} placeholder that asks a model to
# return only the unit number found in a chunk of page text.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
UNIT_VALUE_EXTRACTION_PROMPT = """
Following is a chunk of a student's book. Extract nothing but the value of the Unit number from it.
Chunk: {page_chunk}
"""

In [3]:
docs = load_single_pdf(BOOK_PATH)

In [20]:
len(docs)

208

In [45]:
page_no = 0

def find_Unit_index_and_get_required_content(page_no, context_chars=20):
    """Locate the first 'Unit ' marker on a page and return the text around it.

    Reads the page from the notebook-level ``docs`` list.

    Args:
        page_no: Index into ``docs`` of the page to inspect.
        context_chars: How many characters to keep before the marker and how
            many to keep counting from the marker's first character.

    Returns:
        A ``(Unit_index, required_content)`` tuple. ``Unit_index`` is the
        position of the marker in the page text, or -1 when the page has no
        marker, in which case ``required_content`` is an empty string.
    """
    page_content = docs[page_no].page_content
    Unit_index = page_content.find('Unit ')

    # Without this guard a missing marker (-1) would be used as a slice bound
    # and return an unrelated piece of the page.
    if Unit_index == -1:
        return Unit_index, ""

    # Clamp at 0 so a marker near the top of the page still gets whatever
    # preceding context exists instead of none at all.
    s_index = max(0, Unit_index - context_chars)
    required_content = page_content[s_index:Unit_index + context_chars]

    return Unit_index, required_content


In [56]:
# checking if all the pages have the string "Unit "

# for page_no in range(len(docs)):
#     Unit_index, content = find_Unit_index_and_get_required_content(page_no)
#     print(f"\nPage no: {page_no}, Unit index: {Unit_index}")

# Result: every page contains the string "Unit ".

In [165]:
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

def load_ollama_model(model: str = 'gemma2:2b'):
    """Build an ``OllamaLLM`` instance configured with ``model`` and return it."""
    llm = OllamaLLM(model=model)
    return llm

def get_response_from_model(llm, prompt):
    """Pass ``prompt`` to ``llm.invoke`` and return whatever it produces."""
    response = llm.invoke(prompt)
    return response

def get_quiz_generator_prompt():
    """Wrap ``QUIZ_GENERATOR_PROMPT`` in a ``PromptTemplate``.

    The template is declared with two input variables: ``no_questions`` and
    ``book_content``.
    """
    return PromptTemplate(
        template = QUIZ_GENERATOR_PROMPT,
        input_variables = ['no_questions', 'book_content'],
    )

In [194]:
class DocumentProcessor:
    """Pull the page number, unit number and text out of loaded page documents.

    The unit number is read from the text that follows the first occurrence of
    ``UNIT_WORD`` on a page, up to the first terminator listed in
    ``UNIT_VAL_ENDING_VALS`` (or the end of the extracted window when no
    terminator follows).
    """

    # Marker that precedes the unit number in the page text.
    UNIT_WORD = "Unit "
    # Number of characters kept before the marker and after the marker's end
    # when cutting the chunk out of the page.
    EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD = 20
    # Characters that end the unit value.
    UNIT_VAL_ENDING_VALS = [":", "\n"]

    def extract_details_from_docs_as_df(self, docs, book_name):
        """Build a DataFrame with one row per document.

        Args:
            docs: Iterable of objects exposing ``metadata['page']`` and
                ``page_content``.
            book_name: Value written to the ``book_name`` column of every row.

        Returns:
            A ``pandas.DataFrame`` with the columns ``book_name``, ``page_no``,
            ``unit_no`` and ``content``. Also prints how many pages produced a
            non-numeric unit value.
        """
        pages_with_no_unit_info = 0
        book_data = {
            'book_name': [],
            'page_no': [],
            'unit_no': [],
            'content': []
        }
        for doc in docs:
            page_no, unit_no, content = self.extract_page_no_unit_no_page_content_from_doc(doc)
            book_data['page_no'].append(page_no)
            book_data['unit_no'].append(unit_no)
            book_data['content'].append(content)
            book_data['book_name'].append(book_name)

            # An empty or non-numeric value means no usable unit number was found.
            if not unit_no.isnumeric():
                pages_with_no_unit_info += 1

        print(f"Pages with no unit info: {pages_with_no_unit_info}")
        return pd.DataFrame(book_data)

    def extract_page_no_unit_no_page_content_from_doc(self, doc):
        """Return ``(page_no, unit_no, page_content)`` for a single document."""
        page_no = doc.metadata['page']
        page_content = doc.page_content
        unit_no = self.extract_unit_no_from_content(page_content)

        return (page_no, unit_no, page_content)

    def extract_unit_no_from_content(self, content: str):
        """Return the text that follows the first ``UNIT_WORD`` in ``content``.

        The value runs from the end of the marker to the first terminator that
        appears after it, with surrounding whitespace removed. Returns an empty
        string when the marker is not present.
        """
        print("Extracting unit no...")
        chunk = self.extract_chunk_containing_unit_value(content)

        # No marker on this page: nothing to slice.
        if not chunk:
            return ""

        unit_val_start_index = self.extract_unit_val_start_index(chunk)
        # Only look for terminators after the value starts; the chunk may carry
        # up to 20 characters of preceding text that can contain ":" or "\n".
        unit_val_end_index = self.extract_unit_val_end_index(chunk, unit_val_start_index)

        return chunk[unit_val_start_index:unit_val_end_index].strip()

    def extract_chunk_containing_unit_value(self, content: str):
        """Cut the window around the first ``UNIT_WORD`` out of ``content``.

        Returns an empty string when the marker is not found.
        """
        unit_word_index_in_content = content.find(self.UNIT_WORD)

        if self.is_found_invalid_index(unit_word_index_in_content):
            return ""

        chunk_start_index = self.get_chunk_start_index(unit_word_index_in_content)
        chunk_end_index = self.get_chunk_end_index(unit_word_index_in_content)

        return content[chunk_start_index:chunk_end_index]

    def get_chunk_start_index(self, unit_word_index):
        """Return where the chunk starts for a marker found at ``unit_word_index``.

        The chunk starts ``EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD`` characters
        before the marker when that many are available, otherwise at the marker.
        """
        if self.required_extra_chars_available_before_unit_word_index(unit_word_index):
            return unit_word_index - self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD
        return unit_word_index

    def required_extra_chars_available_before_unit_word_index(self, unit_word_index):
        """Tell whether the full amount of preceding context exists before the marker."""
        # A marker at index N has exactly N characters before it, so N == limit counts.
        return unit_word_index >= self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD

    def get_chunk_end_index(self, unit_word_index):
        """Return the exclusive end index of the chunk for the given marker position.

        The result may exceed the content length; slicing stops at the end of
        the content in that case, so no clamping is needed here.
        """
        unit_word_end_index = unit_word_index + len(self.UNIT_WORD)
        return unit_word_end_index + self.EXTRA_CHARS_TO_EXTRACT_AROUND_UNIT_WORD

    def extract_unit_val_start_index(self, chunk: str):
        """Return the index just past ``UNIT_WORD`` in ``chunk``.

        Returns ``len(chunk)`` when the marker is absent so that slicing from
        the result yields an empty string.
        """
        unit_word_index_in_chunk = chunk.find(self.UNIT_WORD)
        if self.is_found_invalid_index(unit_word_index_in_chunk):
            return len(chunk)
        return unit_word_index_in_chunk + len(self.UNIT_WORD)

    def extract_unit_val_end_index(self, chunk: str, search_from: int = 0):
        """Return the index of the earliest terminator at or after ``search_from``.

        Args:
            chunk: Text window containing the unit value.
            search_from: Index where the search begins; terminators located
                before it are ignored.

        Returns:
            The smallest index of any character in ``UNIT_VAL_ENDING_VALS``
            found from ``search_from`` onward, or ``len(chunk)`` if none occurs.
        """
        terminator_positions = [
            position
            for position in (chunk.find(val, search_from) for val in self.UNIT_VAL_ENDING_VALS)
            if not self.is_found_invalid_index(position)
        ]
        return min(terminator_positions) if terminator_positions else len(chunk)

    def is_found_invalid_index(self, index):
        """Return True when ``index`` is the -1 "not found" result of ``str.find``."""
        return index == -1

In [190]:
doc_processor = DocumentProcessor()



In [137]:
# # checking number of docs with no unit info and saving the data
# book_data = {
#     'page_no': [],
#     'unit_no': [],
#     'content': []
# }
# pages_with_no_unit_info = 0

# for doc in docs:
#     page_no, unit_no, page_content = doc_processor.extract_page_no_unit_no_page_content_from_doc(doc)

#     book_data['page_no'].append(page_no)
#     book_data['unit_no'].append(unit_no)
#     book_data['content'].append(content)
    
#     # print(unit_no)
#     if not unit_no.isnumeric():
#         pages_with_no_unit_info += 1

# print(f"Pages with no unit info: {pages_with_no_unit_info}")

Pages with no unit info: 5


In [200]:
doc_processor = DocumentProcessor()
    
def process_book_pdf_and_save_df(book_pdf_path):
    """Extract a book PDF into a DataFrame and save it as ``<book name>.csv``.

    The book name is the file name of ``book_pdf_path`` without its final
    extension; the CSV is written to the current working directory.

    Args:
        book_pdf_path: Path to the PDF, using either ``\\`` or ``/`` separators.

    Raises:
        ValueError: If the extraction step returns no DataFrame.
    """
    # Local import keeps this cell self-contained. PureWindowsPath accepts both
    # "\\" and "/" as separators on every platform, and ``stem`` drops only the
    # last suffix, so a name like "Physics.9.EM.pdf" stays "Physics.9.EM".
    from pathlib import PureWindowsPath

    book_name = PureWindowsPath(book_pdf_path).stem
    book_save_path = book_name + ".csv"

    df = extract_details_from_book_pdf_path_as_df(book_pdf_path, book_name)
    if df is None:
        # The extractor yields None when it rejects the path; fail with a clear message
        # instead of an AttributeError on ``None.to_csv``.
        raise ValueError(f"Could not extract any data from: {book_pdf_path!r}")
    df.to_csv(book_save_path, index=False)

    print(f"Book saved as: {book_save_path}")

def extract_details_from_book_pdf_path_as_df(book_pdf_path, book_name: str = None):
    """Load a book PDF and return its per-page details as a DataFrame.

    Args:
        book_pdf_path: Path to the PDF file.
        book_name: Name stored with every row. When omitted, the file name
            without its final extension is used.

    Returns:
        The DataFrame produced by ``doc_processor.extract_details_from_docs_as_df``,
        or ``None`` when ``is_valid_pdf_path`` rejects the path.
    """
    # NOTE(review): is_valid_pdf_path / load_single_pdf are presumably provided by the
    # star import at the top of the notebook — confirm.
    if not is_valid_pdf_path(book_pdf_path):
        return None

    if book_name is None:
        # PureWindowsPath splits on both "\\" and "/" regardless of platform.
        from pathlib import PureWindowsPath
        book_name = PureWindowsPath(book_pdf_path).stem

    docs = load_single_pdf(book_pdf_path)
    return doc_processor.extract_details_from_docs_as_df(docs, book_name)

def detect_path_separator_in_path(path):
    """Return a backslash if ``path`` contains one; otherwise return a forward slash."""
    return "\\" if '\\' in path else '/'
    

In [201]:
process_book_pdf_and_save_df(r"src\books\English 9.pdf")

Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, page content and unit no...
Extracting unit no...
Extracting page no, 

### Magic Happens here
Using the code

In [192]:
doc_processor = DocumentProcessor()
# The extractor takes the book name as its second argument and stores it in the
# DataFrame's `book_name` column; omitting it raises a TypeError.
df = extract_details_from_book_pdf_path_as_df(r"src\books\Physics9EM.pdf", "Physics9EM")

In [142]:
df.head()

Unnamed: 0,page_no,unit_no,content
0,0,1,describe the crucial role of Physics in Scien...
1,1,1,2\n Unit 1: Physical Quantities and Measuremen...
2,2,1,experimentations is called Science . The wor...
3,3,1,QUICK QUIZ\n1. Why do we study physics?\n2. ...
4,4,1,Volume:\n31 cm\n1 mL\n1 cm 10 cm\n= 1 dmBASE Q...


### Tokenizing and other processing

In [153]:
content = df['content'][30]
print(len(content))

1674


In [152]:
# Count how many tokens the sample page text (`content`, set in an earlier cell) encodes to.
import tiktoken
# NOTE(review): the 'gpt2' encoding is used here while the LLM loaded elsewhere in this
# notebook defaults to 'gemma2:2b', so this count is presumably only an estimate — confirm.
encoder = tiktoken.get_encoding('gpt2')
tokens = encoder.encode(content)
print(len(tokens))

530


### Generating Quiz

In [164]:
# Prompt template for quiz generation. It has two placeholders, {no_questions} and
# {book_content}, which get_quiz_generator_prompt() declares as the template's input variables.
# The model is asked to answer with a Python list of dicts using the keys
# `question`, `choices` and `correct_choice`.
# NOTE(review): the prompt text contains typos ("All are questions", "form the choices list");
# left unchanged here because the string is sent to the model verbatim.
QUIZ_GENERATOR_PROMPT = """
You will receive the contents of a particular chapter of a student's book. Your job is to create {no_questions} multiple choice question answers using those contents and provide them in the form of python code where:
1- All are questions are inside a list.
2- Each question is a dictionary of values.
3- The keys of the question dictionary should be:
question (representing question as str)
choices (list of choices representing each choice as str)
correct_choice (the index of correct choice i.e. the index of correct choice form the choices list.)

Note: Strictly follow the rules and don't provide anything else than the asked. Only provide a clean list ready to process further.

contents: {book_content}
"""

In [166]:
prompt = get_quiz_generator_prompt()
llm = load_ollama_model()

In [167]:
content = df['content'][30]
response = llm.invoke(prompt.format(no_questions=4, book_content=content))
print(response)

```python
[
    {
        "question": "What is an example of random motion in nature?",
        "choices": ["a) The spinning motion of a top", "b) Brownian motion", "c) Circular motion", "d) Rotational motion"],
        "correct_choice": 1
    },
    {
        "question": "What type of motion does the Earth exhibit?",
        "choices": ["a)  Random motion", "b)  Circular motion", "c) Rotatory motion", "d) Brownian motion"],
        "correct_choice": 2
    },
    {
        "question": "Which statement correctly describes rotatory motion?",
        "choices": ["a) The motion of a body around an axis", "b)  The irregular motion of objects like insects and birds", "c) Random motion of particles in gases or liquids", "d) All the above"],
        "correct_choice": 0 
    },
    {
        "question": "What is the difference between circular motion and rotatory motion?",
        "choices": ["a)  Circular motion describes an object moving around a fixed point, while rotatory motion describes a

In [169]:
print(type(response))

<class 'str'>


In [185]:
import ast

def evaluate_generated_questions(response: str):
    """Parse the model's reply into a Python object.

    If the reply contains a Markdown code fence (with or without a
    ``python``/``py`` language tag), only the text inside the first fence is
    parsed; otherwise the whole reply is parsed. Parsing uses
    ``ast.literal_eval``, so only Python literals are accepted and no code is
    executed.

    Args:
        response: Raw text returned by the model.

    Returns:
        The literal the text evaluates to (for the quiz prompt, a list of
        question dicts).

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    import re  # local import keeps this cell self-contained

    # Remove only the fence markers. A blanket replace of "python" would also
    # delete that word from inside question and choice strings.
    fenced = re.search(
        r"```(?:python|py)?[ \t]*\n?(.*?)(?:```|\Z)",
        response,
        flags=re.DOTALL | re.IGNORECASE,
    )
    payload = fenced.group(1) if fenced else response
    return ast.literal_eval(payload.strip())

### Processing other books

In [204]:
docs = load_single_pdf(r"src\books\English 9.pdf")

In [205]:
docs[0]

Document(metadata={'source': 'src\\books\\English 9.pdf', 'page': 0}, page_content='')