# Langchain documents

In [1]:
from langchain_core.documents import Document
from datetime import datetime, UTC

doc = Document(
    page_content='Hii this is the page content of a document',
    metadata = {
        "source" : "Manually typed",
        "author" : "Sparsh Sahu",
        "page" : 1,
        "date_created" :  datetime.now(UTC),
    }
)
doc

Document(metadata={'source': 'Manually typed', 'author': 'Sparsh Sahu', 'page': 1, 'date_created': datetime.datetime(2025, 11, 29, 14, 49, 13, 162951, tzinfo=datetime.timezone.utc)}, page_content='Hii this is the page content of a document')

# Textfile to Document

In [2]:
import os
os.makedirs("../data/text_files", exist_ok = True)

In [6]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader('../data/text_files/sample.txt')
doc2 = loader.load()
# doc2
# print(doc2[0].metadata)

In [7]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    path = "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True,
)
doc3 = dir_loader.load()
doc3[0].metadata

100%|██████████| 1/1 [00:00<00:00, 995.80it/s]


{'source': '..\\data\\text_files\\sample.txt'}

# Splitting docs

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    
    return split_docs

In [10]:
chunks = split_documents(doc3)
len(chunks)

24

# Embedding and VectorDB

In [11]:
import transformers
import sentence_transformers

print(f"Transformers version: {transformers.__version__}")
print(f"Sentence Transformers version: {sentence_transformers.__version__}")
print(f"Transformers path: {transformers.__file__}")

Transformers version: 4.57.3
Sentence Transformers version: 5.1.2
Transformers path: e:\ROBOTRONIX\Learning Period\venv\Lib\site-packages\transformers\__init__.py


In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [13]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        self.model = SentenceTransformer(self.model_name)
        
    def generate_embeddings(self, text: List[str]):
        embeddings = self.model.encode(text, show_progress_bar=True)
        return embeddings
    
embedding_manager = EmbeddingManager()
embedding_manager

<__main__.EmbeddingManager at 0x14ac0e03da0>

In [14]:
class VectorStore:
    def __init__(self, collection_name: str = 'pdf_docs', persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": "Document embeddings for RAG Pipeline",
                "hnsw:space": "cosine"
            }
        )
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc,embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['context_length'] = len(doc.page_content)
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
            
        # Adding to db 
        self.collection.add(
            ids=ids,
            embeddings=embeddings_list,
            metadatas=metadatas,
            documents=documents_text
        )
        
vector_store = VectorStore()
vector_store
        

<__main__.VectorStore at 0x14ac031d250>

In [15]:
texts = [chunk.page_content for chunk in chunks]
print(len(texts))

embedding = embedding_manager.generate_embeddings(texts)

vector_store.add_documents(chunks, embedding)

24


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


# RAG Retriever

In [16]:
def retrieve(query: str, top_k: int = 5, score_threshold: float = 0.5):
    query_embeddings = embedding_manager.generate_embeddings([query])
    
    # Search in vectorstore
    results = vector_store.collection.query(
        query_embeddings=query_embeddings,
        n_results=top_k,
    )
    return results

results = retrieve(query='What is uv python package?')

Batches: 100%|██████████| 1/1 [00:00<00:00, 78.41it/s]


In [17]:
results['data']

In [22]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.4):
        query_embeddings = self.embedding_manager.generate_embeddings([query])
        
        # Search in vectorstore
        results = self.vector_store.collection.query(
            query_embeddings=query_embeddings,
            n_results=top_k,
        )
        retrieved_docs = []
    
        ids = results['ids'][0]
        documents = results['documents'][0]
        included = results['included'][0]
        metadatas = results['metadatas'][0]
        distances = results['distances'][0]
        
        for i, (doc_id, document, include, metadata, distance) in enumerate(zip(ids, documents, included, metadatas, distances)):
            similarity_score = 1 - distance
            print(similarity_score)
            if similarity_score >= score_threshold:
                retrieved_docs.append({
                    'id': doc_id,
                    'content': document,
                    'included': include,
                    'metadata': metadata,
                    'similarity_score': similarity_score,
                    'distance': distance,
                    'rank': i + 1
                })
                
        
        return retrieved_docs
    
rag_retriever=RAGRetriever(vector_store,embedding_manager)

In [24]:
response = rag_retriever.retrieve(query='What is Transformers and how to install it?')

Batches: 100%|██████████| 1/1 [00:00<00:00, 119.34it/s]

0.5071104764938354
0.419556200504303
0.4033973217010498
0.3981723189353943
0.38645392656326294





# Actual Pipeline

In [26]:
from dotenv import load_dotenv
load_dotenv()

True

In [27]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage

In [47]:
class GroqLLM:
    def __init__(self, model_name: str = 'openai/gpt-oss-20b', api_key: str = None):
        self.model_name = model_name
        self.api_key = api_key
        if not api_key:
            print("No api key found")
            
        self.llm = ChatGroq(
            api_key=self.api_key,
            model=self.model_name,
            temperature=0.1,
            max_tokens=1024,
        )
        
    def generate_response(self, query: str, context: str, max_length: int = 500):
        prompt = PromptTemplate(
            input_variables = ['context', 'question'],
            template="""
            You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.
            Context: {context}.
            Question: {question}.
            Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        )
        formatted_prompt = prompt.format(context=context, question=query)
        
        message = HumanMessage(content=formatted_prompt)
        response = self.llm.invoke([message])
        
        return response
    
    def generate_response_simple(self, query: str, context: str):
        prompt = f"""You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.
            Context: {context}.
            Question: {query}.
            Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        message = HumanMessage(content=prompt)
        response = self.llm.invoke([message])
        
        return response

In [48]:
groq_llm = GroqLLM(api_key=os.environ['GROQ_API_KEY'])

In [49]:
context = rag_retriever.retrieve("What are Large Language Models?")

Batches: 100%|██████████| 1/1 [00:00<00:00, 110.71it/s]

0.7350057363510132
0.5937032699584961
0.5174210667610168
0.4836157560348511
0.46603983640670776





In [50]:
response = groq_llm.generate_response_simple(query="What are Large Language Models?", context=context)
print(response.content)

Large Language Models (LLMs) are advanced AI systems built on deep neural‑network architectures—most commonly the Transformer. They are trained on massive text corpora (books, articles, websites, etc.) and contain billions of parameters, which lets them learn patterns, grammar, and contextual meaning from language.  

Key characteristics:

| Feature | What it means |
|---------|---------------|
| **Transformer‑based** | Uses self‑attention to capture long‑range dependencies between words. |
| **Massive training data** | Learns from diverse, large‑scale text collections. |
| **Large parameter count** | Enables nuanced understanding and generation of text. |
| **Fine‑tuning** | Can be adapted to specific tasks (e.g., translation, summarization, code generation). |
| **Applications** | Chatbots, translation, content creation, question answering, debugging, documentation, etc. |
| **Challenges** | Requires significant compute for training, can inherit biases from training data, needs caref

In [51]:
response = groq_llm.generate_response(query="What are Large Language Models?", context=context)
print(response.content)

Large Language Models (LLMs) are advanced AI systems built on deep neural‑network architectures—most commonly the Transformer. They are trained on massive text corpora (books, articles, websites, etc.) and contain billions of parameters. This training lets them learn patterns, grammar, and contextual meaning, enabling them to:

* **Understand** and generate human‑like text  
* **Answer questions** and provide explanations  
* **Translate languages** and summarize content  
* **Assist with code generation, debugging, and documentation**  

LLMs can be fine‑tuned for specific tasks or domains, making them versatile tools for chatbots, translation systems, content creation, and more. Examples include OpenAI’s ChatGPT, Google Gemini, Anthropic Claude, Meta’s LLaMA, and many open‑source variants.


# Simple RAG


In [60]:
llm = ChatGroq(
    api_key=os.environ['GROQ_API_KEY'],
    model='llama-3.1-8b-instant',
    temperature=0.1,
    max_tokens=1024,
)

def rag_simple(query, retriever, llm, top_k=3):
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    prompt = f"""
        Use the following context to answer the question concisely.
        Context: {context}
        Question: {query}
        Answer: """
    response = llm.invoke([prompt.format(context,query)])
    return response.content

In [53]:
answer=rag_simple("What is attention mechanism?",rag_retriever,llm)
print(answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 14.07it/s]


0.5590577125549316
0.5001290440559387
0.47953134775161743
An attention mechanism lets a model weigh different parts of an input sequence when computing each output element. In Transformers, self‑attention (or encoder‑decoder attention) calculates a weighted sum of value vectors, where the weights are derived from similarity scores between query and key vectors. This allows the model to focus on the most relevant tokens for each prediction, enabling richer contextual representations.


# Advance RAG Pipeline

### 1. RAG With Source

In [61]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    
    if return_context:
        output['context'] = context
    return output

In [62]:
result = rag_advanced('What is Masked self attention mechanism?', rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Batches: 100%|██████████| 1/1 [00:00<00:00, 107.78it/s]


0.6446174383163452
0.4828724265098572
0.46240782737731934
Answer: The Masked Self-Attention Mechanism is a sub-layer in the Transformer architecture that prevents the model from attending to future tokens, maintaining the autoregressive property during generation.
Sources: [{'source': '..\\data\\text_files\\sample.txt', 'page': 'unknown', 'score': 0.6446174383163452, 'preview': "Masked Self-Attention Mechanism: Similar to the encoder's self-attention mechanism but its main purpose is to prevent attending to future tokens to maintain the autoregressive property (no cheating during generation).\nEncoder-Decoder Attention Mechanism: This sub-layer allows the decoder to focus on..."}, {'source': '..\\data\\text_files\\sample.txt', 'page': 'unknown', 'score': 0.4828724265098572, 'preview': 'Feed-Forward Network: The output from the self-attention mechanism is passed through a position-wise feed-forward network.\nLayer Normalization and Residual Connections: Layer normalization and residual 

### 2. Enhanced RAG Advanced Pipeline

In [64]:
import time

In [65]:
class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]

            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("what is attention is all you need", top_k=3, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Batches: 100%|██████████| 1/1 [00:00<00:00, 114.41it/s]

0.4813169240951538
0.382609486579895
0.37102580070495605
Streaming answer:
Use the following context to answer the question concisely.
Context:
Improved ability to capture complex patterns in the data.
Enhanced model capacity without significant increase in computational complexity.
Mathematical Formulation:
Given an input sequence X the self-attention mechanism computes three matrices: queri




es Q, keys K and values V by multiplying X with learned weight matrices 
W
Q
W 
Q
​
 ​, 
W
K
W 
K
​
 ​ and 
W
V
W 
V
​
 .

Q
=
X
W
Q
,
K
=
X
W
K
,
V
=
X
W
V
Q=XW 
Q
​
 ,K=XW 
K
​
 ,V=XW 
V
​
 

The attention scores are computed as:

Attention
(
Q
,
K
,
V
)
=
softmax
(
Q
K
T
d
k
)
Attention(Q,K,V)=softmax( 
d 
k
​
 
​
 
QK 
T
 
​
 )

For multi-head attention, we apply self-attention multiple times:

MultiHead
(
Q
,
K
,
V
)
=
Concat
(
head
1
,
…
,
head
h
)
W
O
MultiHead(Q,K,V)=Concat(head 
1
​
 ,…,head 
h
​
 )W 
O
​
 

where Where each head is computed as:

head
i
=
Attention
(
Q
W
Q
i
,
K
W
K
i
,
V
W
V
i
)
head 
i
​
 =Attention(QW 
Q
i
​
 ,KW 
K
i
​
 ,VW 
V
i
​
 )

Masked Self-Attention Mechanism: Similar to the encoder's self-attention mechanism but its main purpose is to prevent attending to future tokens to maintain the autoregressive property (no cheating during generation).
Encoder-Decoder Attention Mechanism: This sub-layer allows the decoder to focus on relevant parts of the enco