## Loading the PDF directory

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

In [2]:
def load_documents(data_dir: str = "data"):
    """Load the PDFs found in ``data_dir`` using ``PyPDFDirectoryLoader``.

    Args:
        data_dir: Directory handed to ``PyPDFDirectoryLoader``. Defaults to
            ``"data"``, the folder this notebook originally hard-coded, so
            existing ``load_documents()`` calls behave as before.

    Returns:
        The result of ``PyPDFDirectoryLoader.load()``. The sample printed in
        this notebook shows an item with ``page_content`` text and a
        ``metadata`` dict (``source``, ``page``, ``page_label``, ...).
    """
    document_loader = PyPDFDirectoryLoader(data_dir)
    return document_loader.load()

In [3]:
# Load the PDFs from the "data" directory and print the first loaded
# document (text plus metadata) as a quick sanity check.
documents = load_documents()
print(documents[0])

page_content='SYMMETRY9
Look around you  — you may find many objects that catch your 
attention. Some such things are shown below:
Butterfly
Flower  
Pinwheel  Rangoli  
There is something beautiful about the pictures above. 
The flower looks the same from many different angles. What 
about the butterfly? No doubt, the colours are very attractive. But 
what else about the butterfly appeals to you?
In these pictures, it appears that some parts of the figure are 
repeated and these repetitions seem to occur in a definite pattern. 
Can you see what repeats in the beautiful rangoli figure? In the 
Chapter 9_Symmetry.indd   217 13-08-2024   17:05:22
Reprint 2025-26' metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-08-13T17:05:19+05:30', 'moddate': '2025-04-01T14:54:57+05:30', 'trapped': '/False', 'source': 'data/fegp109.pdf', 'total_pages': 25, 'page': 0, 'page_label': '217'}


## Splitting the pages into smaller Chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

In [15]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 150,
):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents to split (e.g. the output of ``load_documents``).
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 7000, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 150, the value previously hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [16]:
# Reload the PDFs, split them into chunks, and print the first chunk to
# check the splitter output.
documents = load_documents()
chunks = split_documents(documents)
print(chunks[0])

page_content='SYMMETRY9
Look around you  — you may find many objects that catch your 
attention. Some such things are shown below:
Butterfly
Flower  
Pinwheel  Rangoli  
There is something beautiful about the pictures above. 
The flower looks the same from many different angles. What 
about the butterfly? No doubt, the colours are very attractive. But 
what else about the butterfly appeals to you?
In these pictures, it appears that some parts of the figure are 
repeated and these repetitions seem to occur in a definite pattern. 
Can you see what repeats in the beautiful rangoli figure? In the 
Chapter 9_Symmetry.indd   217 13-08-2024   17:05:22
Reprint 2025-26' metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-08-13T17:05:19+05:30', 'moddate': '2025-04-01T14:54:57+05:30', 'trapped': '/False', 'source': 'data/fegp109.pdf', 'total_pages': 25, 'page': 0, 'page_label': '217'}


## Custom Indexing of the chunks

In [17]:
def define_chunk_ids(chunks):
    """Assign a unique ``"<source>:<page_label>:<index>"`` id to every chunk.

    The id is written to ``chunk.metadata["id"]``. ``<index>`` counts the
    chunks that share the same ``source`` and ``page_label``, starting at 0.

    A counter is kept per page id, so ids stay unique even when chunks of the
    same page are not adjacent in ``chunks`` (or when a page label repeats
    within one source). Comparing only against the previous chunk would
    restart the index at 0 in that case and produce duplicate ids.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict that
            may contain ``"source"`` and ``"page_label"`` keys. Missing keys
            are rendered as ``None`` in the id.

    Returns:
        The same ``chunks`` object, with each chunk's metadata updated in place.
    """
    next_index_by_page = {}
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page_label")
        current_page_id = f"{source}:{page}"

        # Next free index for this page; 0 the first time the page is seen.
        current_chunk_index = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_index + 1

        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"
    return chunks

### Create Embedding Function (illustrative)

#### To create the database and to extract data by querying the database.

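The embedding function is saved in a separate `.py` file (`get_embedding_function.py`) so it can be reused. Here, we use `OllamaEmbeddings`; the commented-out `BedrockEmbeddings` version below is only illustrative.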

In [15]:
# from langchain_community.embeddings.bedrock import BedrockEmbeddings

In [16]:
# def get_embedding_function():
#     embeddings = BedrockEmbeddings(
#         credentials_profile_name="default", region_name="us-east-1"
#     )
#     return embeddings

## Creating the vector Database and Enabling Auto-addition of a new file

When a new file is added to the "data" directory, the program detects it by comparing chunk IDs against those already stored, and adds only the new chunks instead of rebuilding the whole database.

In [18]:
from get_embedding_function import get_embedding_function
from langchain_chroma.vectorstores import Chroma

In [49]:
# !ollama pull nomic-embed-text #embedding LLM - it has a large context window and performs well
# !ollama pull mistral #LLM for text generation (7B parameters)
# !ollama pull gemma3:1b #LLM for text generation (1B parameters)

In [None]:
# !ollama list

In [19]:
CHROMA_PATH = "chroma_new"
def add_to_chromadb(chunks: list[Document]):
    """Embed and store only the chunks whose ids are not yet in the Chroma DB.

    Opens the ``"chunks"`` collection persisted under ``CHROMA_PATH``, tags
    every chunk with an id via ``define_chunk_ids``, and adds the chunks whose
    id is not already stored. Progress is reported with ``print``.

    Args:
        chunks: Chunks to index; their ``metadata`` is updated with an ``"id"``.

    Returns:
        None.
    """
    vector_store = Chroma(
        collection_name="chunks",
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    tagged_chunks = define_chunk_ids(chunks)

    # Empty include list: only the "ids" field of the result is used here.
    known_ids = set(vector_store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [
        candidate
        for candidate in tagged_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not pending:
        print("No documents to add!")
        return

    print(f"New {len(pending)} documents added to the DB")
    pending_ids = [candidate.metadata["id"] for candidate in pending]
    # No explicit persist call is made after adding.
    vector_store.add_documents(pending, ids=pending_ids)
    print("chunk embedded!")



In [20]:
import argparse
import os
import shutil

In [21]:
parser = argparse.ArgumentParser()
parser.add_argument("--reset", action="store_true", help="Reset the database.")
# parse_known_args() returns a (namespace, leftover_args) tuple. Unpack it so
# that `args` is the namespace and `args.reset` works; unrecognised arguments
# are collected in the second element instead of raising an error.
args, _unknown_args = parser.parse_known_args()
# Reset support is parked: `clear_database` is not defined in this notebook.
# if args.reset:
#     print("✨ Clearing Database")
#     clear_database()
add_to_chromadb(chunks)

Number of existing documents in DB: 0
New 284 documents added to the DB
chunk embedded!


In [25]:
from langchain_chroma.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_ollama.llms import OllamaLLM

from get_embedding_function import get_embedding_function

In [26]:
# Directory of the persisted Chroma DB; same value as used when building it.
CHROMA_PATH = "chroma_new"

# System message for the tutor persona. Python joins adjacent string literals
# with no separator, so every piece except the last ends with a space;
# otherwise sentences run together ("...students.You should...").
# Note that {question} is interpolated here as well as in the human message.
sys_instructions = SystemMessagePromptTemplate.from_template(
    "You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students. "
    "You should use the data in the vector database to find the most relevant portions to explain them the concepts. "
    "Before starting the response, translate the students' question into a title. Then, solve the {question} and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader. Include all steps to reach the answer without skipping any. Use standard approaches. "
    "After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student. "
    "Finally, provide a third example without solution that covers all the topics from the explanation for the student to solve. Provide the answer as a hint. Ensure all the mathematical equations are LaTeX-formatted throughout the response."
)

# Human message carrying the retrieved context and the student's question.
rag_context = HumanMessagePromptTemplate.from_template("Answer the question based on the following context: {context} Question: {question}")


In [27]:
# Combine the system instructions and the human (context + question) message
# into one chat prompt, then print it to inspect the template and its
# input variables.
chat_prompt = ChatPromptTemplate.from_messages([sys_instructions, rag_context])
print(chat_prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template="You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the data in the vector database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the {question} and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader. Include all steps to reach the answer without skipping any. Use standard approaches.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the stu

In [28]:
def query_rag(query_text: str, model_name: str = "gemma3:1b", k: int = 5, verbose: bool = True):
    """Answer ``query_text`` with retrieval-augmented generation.

    Retrieves the ``k`` most similar chunks from the ``"chunks"`` collection
    stored under ``CHROMA_PATH``, fills ``chat_prompt`` with them and the
    question, and sends the resulting prompt to an Ollama model.

    Args:
        query_text: The student's question.
        model_name: Ollama model used for generation. Defaults to
            ``"gemma3:1b"``; pass e.g. ``"mistral"`` to compare models
            without editing this function.
        k: Number of chunks to retrieve as context. Defaults to 5.
        verbose: When True (default), print the fully formatted prompt.

    Returns:
        A string of the form ``"Response: <answer>\\nSources: [<chunk ids>]"``.
    """
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(collection_name="chunks", persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB; each result is a (document, score) pair.
    results = db.similarity_search_with_score(query_text, k=k)
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt = chat_prompt.format(context=context_text, question=query_text)
    if verbose:
        print(prompt)

    model = OllamaLLM(model=model_name)
    response_text = model.invoke(prompt)

    # Chunk ids written by define_chunk_ids; None when a chunk has no id.
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    return f"Response: {response_text}\nSources: {sources}"

In [41]:
# Recorded run: generated with mistral.
# NOTE(review): query_rag as written selects "gemma3:1b", so re-running this
# cell unchanged will not use mistral.
# Read the question interactively and echo it.
query_text= input("User: ")
print(query_text)

Help me draw a square of side 5 cm using a measuring scale, a protractor and a compass.


In [42]:
# Recorded run: generated with mistral.
# query_rag already prefixes its return value with "Response: ", so print it
# as-is (matching the other query cells) instead of adding a second label.
response = query_rag(query_text)
print(response)

System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the documents in the database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title and provide explanation from the beginning. Format the responses into step-by-step explanations.You should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student.Finally, provide a question that covers all the topics from the explanation for the student to solve. Provide the answer as a hint. Ensure all the mathematical equations are LaTeX-formatted.
Human: Answer the question based on the following context: Playing with Constructions
195
8.3 Constructing Squares and Rectangles
Now, let us start constructing squ

In [43]:
# Recorded run: generated with mistral.
# Read the next question interactively and echo it.
query_text= input("User: ")
print(query_text)

Is 24256 divisible by 8?


In [44]:
# Recorded run: generated with mistral.
# Run the RAG pipeline on the question read above and print the result
# (answer text followed by the source chunk ids).
response = query_rag(query_text)
print(response)

System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the documents in the database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title and provide explanation from the beginning. Format the responses into step-by-step explanations.You should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student.Finally, provide a question that covers all the topics from the explanation for the student to solve. Provide the answer as a hint. Ensure all the mathematical equations are LaTeX-formatted.
Human: Answer the question based on the following context: Prime Time 
125
Divisibility by 8
Interestingly, even checking for divisibility by 8 can be simplified. 


In [48]:
# Recorded run: generated with mistral.
# Read a question, echo it, then run the RAG pipeline and print the result.
query_text= input("User: ")
print(query_text)
response = query_rag(query_text)
print(response)

Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$
System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the documents in the database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$ and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student.Finally, provide a third example without solution that covers all the topics from the explanation

In [54]:
# Recorded run: generated with gemma3:1b (same question as the previous
# mistral run, for comparison).
# Read a question, echo it, then run the RAG pipeline and print the result.
query_text= input("User: ")
print(query_text)
response = query_rag(query_text)
print(response)

Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$
System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the documents in the database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$ and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student.Finally, provide a third example without solution that covers all the topics from the explanation

In [63]:
# Recorded run: generated with gemma3:1b and an updated SI
# ("SI" presumably = system instructions — TODO confirm).
# Read a question, echo it, then run the RAG pipeline and print the result.
query_text= input("User: ")
print(query_text)
response = query_rag(query_text)
print(response)

Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$
System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the data in the vector database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the Can you help me arrange these fractions in descending order: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$ and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader. Include all steps to reach the answer without skipping any. Use standard approaches.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in the student.Finally, p

In [64]:
# Recorded run: generated with gemma3:1b, updated SI 2
# ("SI" presumably = system instructions — TODO confirm).
# Read a question, echo it, then run the RAG pipeline and print the result.
query_text= input("User: ")
print(query_text)
response = query_rag(query_text)
print(response)

Can you help me arrange these fractions in descending order using LCM: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$
System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the data in the vector database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the Can you help me arrange these fractions in descending order using LCM: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$ and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader. Include all steps to reach the answer without skipping any. Use standard approaches.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in th

In [29]:
# Recorded run: generated with gemma3:1b, updated SI 3, chunk size raised to
# 8000 (initially 800) and chunk_overlap kept at 150.
# NOTE(review): split_documents in this notebook uses chunk_size 7000, not
# 8000 — confirm which value produced the output below.
# Read a question, echo it, then run the RAG pipeline and print the result.
query_text= input("User: ")
print(query_text)
response = query_rag(query_text)
print(response)

Can you help me arrange these fractions in descending order using LCM: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$
System: You are a middle school math teacher. Your task is to respond to 6th graders' doubts from their curriculum. You should have an encouraging and polite attitude to the students.You should use the data in the vector database to find the most relevant portions to explain them the concepts.Before starting the response, translate the students' question into a title. Then, solve the Can you help me arrange these fractions in descending order using LCM: $\frac{5}{3}, \frac{2}{5}, \frac{3}{7}$ and format it into step-by-step explanations abiding to pedagogical standards for a 6th grader. Include all steps to reach the answer without skipping any. Use standard approaches.After the explanation for the question, you should give two different examples of different difficulty levels. The first example should be a direct and easy one, and the second should invoke some reasoning in th