In [11]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# langchain import 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
# from langchain_core.documents import Document

# vector store  
from langchain_community.vectorstores import Chroma

# utility import
import pandas as pd
import numpy as np
from typing import List


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
file_path = 'data/material.pdf'
loader = PyPDFLoader(file_path)
documents = loader.load()

In [4]:
for doc in documents:
    print(f"Page content snippet: {doc.page_content[:100]}...")
    print(f"Metadata: {doc.metadata}")

Page content snippet: CHAPTER 
Representing and Manipulating 
Information 
ZA Information Storage 34 
22 Integer Represent...
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260123210259', 'source': 'data/material.pdf', 'total_pages': 110, 'page': 0, 'page_label': '1'}
Page content snippet: 32 Chapter 2 Representing and Manipulating Information 
Mos computers store and process information ...
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260123210259', 'source': 'data/material.pdf', 'total_pages': 110, 'page': 1, 'page_label': '2'}
Page content snippet: Chapter 2. Representing and Manipulating Information 
The computer might not generate the expected r...
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260123210259', 'source': 'data/material.pdf', 'total_pages': 110, 'page': 2, 'page_label': '3'}
Page content snippet: 34 Chapter 2 Representing and Manipulating Information 
Aside How to read this chapter 

## Document spliting

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len ,
    separators=["\n\n","\n","."," ","   ",""]
)
chunks = text_splitter.split_documents(documents)


In [6]:


print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"\nChunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 560 chunks from 110 documents

Chunk example:
Content: CHAPTER 
Representing and Manipulating 
Information 
ZA Information Storage 34 
22 Integer Representations 59 
23 Integer Arithmetic 84 
2.4 Floating ...
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260123210259', 'source': 'data/material.pdf', 'total_pages': 110, 'page': 0, 'page_label': '1'}


## Embedding Models

In [12]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

In [13]:
sample_text = "Integere representation"
embedding = OpenAIEmbeddings()
embedding

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001E92BEA2510>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001E92BEA2E40>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
# vector = embedding.embed_query(sample_text)
# vector

## Init Vector Store

In [15]:
## Create a chromaDb vectore store

persist_dir = './chroma_db'

## Create ChromaDb with opernAIEmbedding 

vectorStore = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_dir,
    collection_name='rag_collection',
)

print(f"Vector store created with {vectorStore._collection.count()} vectors")
print(f"Persisted to : {persist_dir}")


Vector store created with 1120 vectors
Persisted to : ./chroma_db


## Test the similarity search

In [16]:
query = "What is binary "
similar_doc = vectorStore.similarity_search(query,k=3)

In [17]:
similar_doc

[Document(metadata={'page': 1, 'total_pages': 110, 'producer': 'PDFium', 'creator': 'PDFium', 'source': 'data/material.pdf', 'page_label': '2', 'creationdate': 'D:20260123210259'}, page_content='32 Chapter 2 Representing and Manipulating Information \nMos computers store and process information represented as two-valued \nsignals. These lowly binary digits, or bits, form the basis of the digital revo- \nlution. The familiar decimal, or base-10, representation has been in use for over \n1,000 years, having been developed in India, improved by Arab mathematicians in \nthe 12th century, and brought to the West in the 13th century by the Italian mathe-'),
 Document(metadata={'page': 1, 'page_label': '2', 'creator': 'PDFium', 'source': 'data/material.pdf', 'total_pages': 110, 'creationdate': 'D:20260123210259', 'producer': 'PDFium'}, page_content='32 Chapter 2 Representing and Manipulating Information \nMos computers store and process information represented as two-valued \nsignals. These l

In [18]:
results =vectorStore.similarity_search_with_score(query,k=5)
results


[(Document(metadata={'total_pages': 110, 'creationdate': 'D:20260123210259', 'producer': 'PDFium', 'creator': 'PDFium', 'source': 'data/material.pdf', 'page_label': '2', 'page': 1}, page_content='32 Chapter 2 Representing and Manipulating Information \nMos computers store and process information represented as two-valued \nsignals. These lowly binary digits, or bits, form the basis of the digital revo- \nlution. The familiar decimal, or base-10, representation has been in use for over \n1,000 years, having been developed in India, improved by Arab mathematicians in \nthe 12th century, and brought to the West in the 13th century by the Italian mathe-'),
  0.3343328833580017),
 (Document(metadata={'source': 'data/material.pdf', 'creationdate': 'D:20260123210259', 'page_label': '2', 'page': 1, 'creator': 'PDFium', 'total_pages': 110, 'producer': 'PDFium'}, page_content='32 Chapter 2 Representing and Manipulating Information \nMos computers store and process information represented as two-

## Init the LLM

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()

In [22]:
test_res  = llm.invoke("What isbinary ?")

In [23]:
test_res

AIMessage(content='Binary is a number system used in computers, which consists of only two digits: 0 and 1. It is also known as base-2 and is the most fundamental number system in computing, as all data is ultimately represented in binary form at the hardware level. In binary, each digit is called a bit, and groups of 8 bits make up a byte. Binary numbers are used to represent and process data in computer systems, including encoding text, images, and other types of information.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 100, 'prompt_tokens': 11, 'total_tokens': 111, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-D1d2lnMKKUQwmX0QKbxZu9TOgZ3gW', 'service_tier': 'default', 'f

## Rag Chain LCEL

In [24]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough , RunnableParallel
from langchain_core.prompts import ChatPromptTemplate


In [26]:
custom_promt = ChatPromptTemplate.from_template(
    """
You are a teaching assistant. Use the context to help the student understand.

Rules:
- Use ONLY the provided context for factual information.
- Explain in simple terms first, then add more detail.
- If the question is about a concept, structure the answer as:
  1) Short definition
  2) Intuition (plain-English explanation)
  3) Tiny example
  4) Common mistake to avoid
- If the topic involves logic, reasoning, math, algorithms, or problem-solving AND the student asks for practice, include:
  - One short practice problem based ONLY on the context
  - Do NOT include the solution unless the student explicitly asks for it
- If the answer is not contained in the context, say:
  "I don't know based on the provided documents."

CONTEXT:
{context}

STUDENT QUESTION:
{question}

TEACHING ANSWER:


"""
)

In [None]:
retriever = vectorStore.as_retriever()


In [29]:
## Format the output document for the prompt
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)



In [33]:
## Build the chain using lcel
rag_chain = (
    {"context":retriever| format_docs,
     "question":RunnablePassthrough()
    }
    | custom_promt 
    | llm 
    | StrOutputParser()

)



In [34]:
rag_chain

{
  context: VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E92BFFB110>, search_kwargs={})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nYou are a teaching assistant. Use the context to help the student understand.\n\nRules:\n- Use ONLY the provided context for factual information.\n- Explain in simple terms first, then add more detail.\n- If the question is about a concept, structure the answer as:\n  1) Short definition\n  2) Intuition (plain-English explanation)\n  3) Tiny example\n  4) Common mistake to avoid\n- If the topic involves logic, reasoning, math, algorithms, or problem-solving AND the student asks for pract

In [35]:
response = rag_chain.invoke("What is binary ?")

In [36]:
response

"1) Short definition:\nBinary is a number system that uses two digits, 0 and 1, to represent numbers and perform arithmetic operations.\n\n2) Intuition:\nIn binary, instead of using 10 digits like in our familiar decimal system, we only use 0 and 1. Each digit in a binary number represents a power of 2, so it's a way for computers to process and store information using just two values.\n\n3) Tiny example:\nIn decimal, the number 5 is written as 101 in binary because it represents 1*2^2 + 0*2^1 + 1*2^0.\n\n4) Common mistake to avoid:\nOne common mistake is not understanding the position value of each digit in a binary number. Every digit's position represents a power of 2, so getting the positions wrong can lead to incorrect calculations. \n\nPractice problem:\nWhat is the binary representation of the decimal number 9?"