In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
print("Environment Loaded Successfully")
print(f" API Key Found: {'OPENAI_API_KEY' in os.environ}")

if 'OPENAI_API_KEY' in os.environ:
    print(f"Key starts with: {os.environ['OPENAI_API_KEY'][:10]}...")
print("✅ Environment loaded")

Environment Loaded Successfully
 API Key Found: True
Key starts with: sk-proj-3Q...
✅ Environment loaded


# Core Langchain - Document Loaders, Text Splitters, Embedding, Vector Stroe, LLMs, 

In [3]:
# Document Loading
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("llm_fundamentals.pdf")
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"Content Preview: {documents[0].page_content[:50]}...")
print(f"Metadata Preview: {documents[0].metadata}")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 8 documents
Content Preview: @genieincodebottle 
Instagram | GitHub | Medium | ...
Metadata Preview: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-09-02T20:12:32+05:30', 'author': 'Rajesh Srivastava', 'moddate': '2025-09-02T20:12:32+05:30', 'source': 'llm_fundamentals.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len,
    separators = ["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(documents)

print(f"✅ Split {len(documents)} pages into {len(chunks)} chunks")
print(f"\nSample chunk:")
print(chunks[5].page_content)
print(f"\nMetadata: {chunks[5].metadata}")

✅ Split 8 pages into 37 chunks

Sample chunk:
5. Attention → Highlights the most relevant tokens in context 
6. Self-Attention → Each token attends to every other token for context 
7. Cross-Attention → Connect encoder and decoder (in encoder-decoder models) 
8. Multi-Head Attention → Several attention heads capture different patterns in parallel 
9. Feed-Forward Networks → Nonlinear layers that transform representations between 
attention blocks 
10. Residual Connections → Shortcut links that preserve signals and help gradient flow

Metadata: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-09-02T20:12:32+05:30', 'author': 'Rajesh Srivastava', 'moddate': '2025-09-02T20:12:32+05:30', 'source': 'llm_fundamentals.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}


In [5]:
#embedding
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name = "all-MiniLM-L6-v2"
)

test_text = "What is a large language model?"
test_embedding_vector = embeddings.embed_query(test_text)

print(f"Embedding Model loaded successfully.")
print(f"Embedding Dimension: {len(test_embedding_vector)}")
print(f"First 5 values: {test_embedding_vector[:5]}")

Embedding Model loaded successfully.
Embedding Dimension: 384
First 5 values: [0.045804012566804886, -0.08357568085193634, 0.008843314833939075, -0.02673397585749626, -0.05191471427679062]


In [6]:
# vector store
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(
    documents = chunks,
    embedding = embeddings,
    collection_name = "llm_fundamentals_collection"
)

print("✅ Vector store created with Chroma")
print(f"✅ Vector store created with {vectorstore._collection.count()} chunks")

✅ Vector store created with Chroma
✅ Vector store created with 37 chunks


In [7]:
# Test Similarity Search
query = "What is RAG?"
results = vectorstore.similarity_search(query, k=3)

print(f"Query: {query}\n")
for i, doc in enumerate(results, 1):
    print(f"Result {i}:")
    print(f"{doc.page_content[:150]}...")
    print(f"Source: Page {doc.metadata['page']}\n")

Query: What is RAG?

Result 1:
17. ALiBi / Relative Positional Encoding → Alternative to RoPE for long contexts 
18. Linear / Performer Attention → Efficient attention variants for ...
Source: Page 1

Result 2:
evaluation 
4. Human Evaluation → Collect human judgments for accuracy, coherence, and safety 
5. Factuality / Truthfulness Metrics → Specialized eval...
Source: Page 5

Result 3:
@genieincodebottle 
Instagram | GitHub | Medium | YouTube 
How to Be Better Than Most in GenAI 
 
Contents 
 
Core LLM Building Blocks ..................
Source: Page 0



In [8]:
# LLMs
from langchain_openai import ChatOpenAI
import os

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature = 0.7,
    api_key=os.environ["OPENAI_API_KEY"]
)

response = llm.invoke("Explain RAG in simple terms.")

print("LLM Initialized")
print(f"Response: {response.content}")


LLM Initialized
Response: RAG stands for "Retrieval-Augmented Generation." It is a method used in natural language processing that combines two approaches: retrieving information from a database or external source and generating text based on that information.

Here’s how it works in simple terms:

1. **Retrieval**: When you ask a question or give a prompt, the system first searches for relevant information from a large collection of documents or data. This could be anything from articles to books or other text sources.

2. **Augmentation**: After finding relevant pieces of information, the system uses these to help create a more informed and accurate response.

3. **Generation**: Finally, the system generates a response based on the retrieved information, combining it with its own understanding to produce a coherent answer.

In essence, RAG helps improve the quality and relevance of generated text by grounding it in real-world information, rather than relying solely on the model's tra

## Now lets combine all components into a RAG system using RetrievalQA form (simplest method)

In [9]:
from langchain_classic.chains.retrieval_qa.base import RetrievalQA

# RAG QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # stuff = put all context in prompt
    retriever = vectorstore.as_retriever(search_kwargs={"k":3}),
    return_source_documents = True
)

print("✅ RAG QA Chain created successfully.")

✅ RAG QA Chain created successfully.


In [10]:
# Test RAG QA Chain

question = "What is LoRA and how is it used in LLMs?"
result = qa_chain.invoke({"query": question})

print(f"Question: {question}\n")
print(f"Answer:\n{result['result']}\n")
print("="*80)
print(f"Sources ({len(result['source_documents'])} chunks):")
for i, doc in enumerate(result['source_documents'], 1):
    print(f"\n{i}. Page {doc.metadata['page']}:")
    print(f"   {doc.page_content[:100]}...")

Question: What is LoRA and how is it used in LLMs?

Answer:
LoRA (Low-Rank Adaptation) is a method used in the context of fine-tuning large language models (LLMs) by updating only small parts of the model instead of the entire model. It allows for efficient adaptation of pre-trained models to specific tasks or domains without the need for extensive computational resources. By focusing on low-rank updates, LoRA enables fine-tuning on modest hardware, making it feasible to work with large models.

Sources (3 chunks):

1. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...

2. Page 6:
   requirements 
11. Explainability / Interpretability Evaluation → Assess clarity and transparency of ...

3. Page 0:
   @genieincodebottle 
Instagram | GitHub | Medium | YouTube 
How to Be Better Than Most in GenAI 
 
Co...


In [11]:
# MMR avoids duplicate/similar results
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_type="mmr",  # Maximum Marginal Relevance
        search_kwargs={"k": 3, "fetch_k": 10}
    ),
    return_source_documents=True
)

print("✅ RAG QA Chain with MMR created successfully.")
# Test RAG QA Chain with MMR
question = "What is LoRA and how is it used in LLMs?"
result = qa_chain.invoke({"query": question})
print(f"Question: {question}\n")
print(f"Answer:\n{result['result']}\n")
print("="*80)
print(f"Sources ({len(result['source_documents'])} chunks):")
for i, doc in enumerate(result['source_documents'], 1):
    print(f"\n{i}. Page {doc.metadata['page']}:")
    print(f"   {doc.page_content[:100]}...")
    

✅ RAG QA Chain with MMR created successfully.
Question: What is LoRA and how is it used in LLMs?

Answer:
LoRA (Low-Rank Adaptation) is a method used in the fine-tuning of large language models (LLMs). It enables the adaptation of these models by updating only a small number of parameters, rather than the entire model. This approach is particularly useful because it allows for the fine-tuning of large models on modest hardware by reducing the computational resources required. In essence, LoRA adds low-rank matrices to the existing weight matrices of the model, which helps in efficiently capturing the necessary adjustments while keeping the base model intact.

Sources (3 chunks):

1. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...

2. Page 0:
   @genieincodebottle 
Instagram | GitHub | Medium | YouTube 
How to Be Better Than Most in GenAI 
 
Co...

3. Page 1:
   @genieincodebottle 
Core LLM Building Blocks 
1. Transforme

## USing LECL Method, building personal/custom chain

In [12]:
from langchain_core.prompts import ChatPromptTemplate # used for custom prompt
from langchain_core.output_parsers import StrOutputParser # used for custom output parsing
from langchain_core.runnables import RunnablePassthrough


# custom prompt
template = """You are an AI assistant helping users understand LLM fundamentals.
Answer the question based ONLY on the provided context. Cite page numbers when possible.

Context: 
{context}

Question: {question}

Answer: """

prompt = ChatPromptTemplate.from_template(template)

# custom output parser. Helper function for custom output parsing
def format_docs(docs):
    return "\n\n".join([
        f"[Page {doc.metadata['page']}]\n{doc.page_content}"
        for doc in docs
    ])

# build chain using LECL
rag_chain = (
    {"context": vectorstore.as_retriever() | format_docs,  "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()

)
print("✅ Custom RAG Chain created successfully.")

✅ Custom RAG Chain created successfully.


In [13]:
# Use the custom chain
question = "What is attention mechanism?"
answer = rag_chain.invoke(question)

print(f"Question: {question}\n")
print(f"Answer:\n{answer}")

Question: What is attention mechanism?

Answer:
The attention mechanism highlights the most relevant tokens in context, allowing the model to focus on specific parts of the input data that are more important for generating an output. This is essential for understanding relationships within the data and improving the overall performance of the model (Page 1).


## Memory Status 

In [21]:
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory import ConversationBufferMemory

# create conversation memory
memory = ConversationBufferMemory(
    memory_key = "chat_history",
    return_messages = True,
    output_key = "answer"
)

# create conversational retrieval chain
conversational_chain = ConversationalRetrievalChain.from_llm(
    llm = llm,
    retriever = vectorstore.as_retriever(
        search_type = "mmr",
        search_kwargs = {"k":3, "fetch_k":10}
    ),
    memory = memory,
    return_source_documents = True
)
print("✅ Conversational Retrieval Chain created successfully.")

✅ Conversational Retrieval Chain created successfully.


In [22]:
# Test conversational chain with follow-up questions

# First question
result1 = conversational_chain.invoke({"question": "What is LoRA?"})
print("Q1: What is LoRA?")
print(f"A1: {result1['answer']}\n")
print("="*80 + "\n")

# Follow-up question (uses context from previous question!)
result2 = conversational_chain.invoke({"question": "What's the difference between that and QLoRA?"})
print("Q2: What's the difference between that and QLoRA?")
print(f"A2: {result2['answer']}\n")
print("="*80 + "\n")

# Another follow-up
result3 = conversational_chain.invoke({"question": "Can you give me a simple example of how it works?"})
print("Q3: Can you give me a simple example of how it works?")
print(f"A3: {result3['answer']}")

Q1: What is LoRA?
A1: LoRA stands for Low-Rank Adaptation, which is a method used for fine-tuning large models by updating only a small number of parameters. This allows for efficient model training on limited hardware resources.


Q2: What's the difference between that and QLoRA?
A2: LoRA (Low-Rank Adaptation) is a method that enables fine-tuning of large language models by updating only a small number of parameters, while QLoRA is an extension of LoRA that incorporates quantization. QLoRA allows for fine-tuning huge models on modest hardware by reducing the model size through quantization techniques, making it more efficient than standard LoRA.


Q3: Can you give me a simple example of how it works?
A3: I don't know.


## Complete RAG chain using a single Class

In [29]:
from typing import List, Dict

class LangChainRAG:
    """
    Production RAG system powered by LangChain.
    
    Why LangChain?
        - 10x less code than manual implementation
        - Easy to swap components (LLMs, vector stores, embeddings)
        - Built-in features (memory, streaming, error handling)
        - Production-tested by thousands of companies
    """
    
    def __init__(
        self,
        pdf_path: str,
        llm_model: str = "gpt-4o-mini",
        embedding_model: str = "all-MiniLM-L6-v2",
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ):
        """
        Initialize RAG system from a PDF.
        
        Args:
            pdf_path: Path to PDF file
            llm_model: OpenAI model name
            embedding_model: HuggingFace embedding model
            chunk_size: Characters per chunk
            chunk_overlap: Overlap between chunks
        """
        print("Initializing LangChain RAG system...")
        
        # Load documents
        print(f"Loading PDF: {pdf_path}")
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        
        # Split documents
        print(f"Splitting into chunks (size={chunk_size}, overlap={chunk_overlap})")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        self.chunks = text_splitter.split_documents(documents)
        
        # Setup embeddings
        print(f"Loading embedding model: {embedding_model}")
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        
        # Create vector store
        print("Creating vector store...")
        self.vectorstore = Chroma.from_documents(
            documents=self.chunks,
            embedding=self.embeddings,
            collection_name="langchain_rag"
        )
        
        # Setup LLM
        print(f"Initializing LLM: {llm_model}")
        self.llm = ChatOpenAI(
            model=llm_model,
            temperature=0.3,
            api_key=os.environ["OPENAI_API_KEY"]
        )
        
        # Create QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        
        print(f"✅ RAG system ready! ({len(self.chunks)} chunks indexed)\n")
    
    def ask(self, question: str) -> Dict:
        """
        Ask a question and get an answer with sources.
        
        Args:
            question: User's question
            
        Returns:
            Dictionary with answer and source information
        """
        result = self.qa_chain.invoke({"query": question})
        
        return {
            "question": question,
            "answer": result['result'],
            "sources": [
                {
                    "page": doc.metadata.get('page', 'N/A'),
                    "text": doc.page_content[:150] + "..."
                }
                for doc in result['source_documents']
            ]
        }
    
    def ask_multiple(self, questions: List[str]) -> List[Dict]:
        """
        Ask multiple questions at once.
        
        Args:
            questions: List of questions
            
        Returns:
            List of results
        """
        return [self.ask(q) for q in questions]

print("✅ LangChainRAG class defined")

✅ LangChainRAG class defined


In [30]:
# test LangChainRAG class
rag = LangChainRAG(
    pdf_path="llm_fundamentals.pdf",
    llm_model="gpt-4o-mini"
)

Initializing LangChain RAG system...
Loading PDF: llm_fundamentals.pdf
Splitting into chunks (size=500, overlap=50)
Loading embedding model: all-MiniLM-L6-v2
Creating vector store...
Initializing LLM: gpt-4o-mini
✅ RAG system ready! (37 chunks indexed)



In [31]:
# Ask questions
result = rag.ask("What are the main components of transformer architecture?")

print(f"Question: {result['question']}\n")
print(f"Answer:\n{result['answer']}\n")
print("="*80)
print(f"Sources ({len(result['sources'])} chunks):")
for i, source in enumerate(result['sources'], 1):
    print(f"\n{i}. Page {source['page']}:")
    print(f"   {source['text']}")

Question: What are the main components of transformer architecture?

Answer:
The main components of transformer architecture include:

1. **Encoder-Decoder Structure**: The transformer consists of an encoder that processes the input and a decoder that generates the output.
2. **Self-Attention Mechanism**: This allows the model to weigh the importance of different words in a sentence when encoding or decoding.
3. **Feedforward Neural Networks**: These are applied to the output of the self-attention mechanism to further process the information.
4. **Positional Encoding**: This adds information about the position of tokens in the sequence to the embeddings, as transformers do not have a built-in sense of order.
5. **Layer Normalization**: This is used to stabilize and accelerate the training of the model.
6. **Residual Connections**: These connections help in training deep networks by allowing gradients to flow through the network more easily.

These components work together to enable the

In [32]:
# Multiple questions
questions = [
    "What is RLHF?",
    "Explain quantization",
    "What are vector databases?"
]

results = rag.ask_multiple(questions)

for result in results:
    print(f"\n{'='*80}")
    print(f"Q: {result['question']}")
    print(f"A: {result['answer']}")
    print(f"Sources: Pages {[s['page'] for s in result['sources']]}")


Q: What is RLHF?
A: RLHF stands for Reinforcement Learning from Human Feedback. It is a method used to align model outputs with human preferences by incorporating feedback from humans during the training process. This approach helps improve the quality and relevance of the model's responses based on what humans find desirable.
Sources: Pages [3, 2, 7]

Q: Explain quantization
A: Quantization is a process used in machine learning and deep learning to reduce the precision of the numbers used to represent model parameters and activations. This typically involves converting floating-point numbers (which can represent a wide range of values) into lower-precision formats, such as integers. The main goals of quantization are to decrease the model size, reduce memory bandwidth, and improve inference speed, all while attempting to maintain the model's accuracy.

There are different types of quantization, such as post-training quantization, where a pre-trained model is quantized after training,