In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
print("Environment Loaded Successfully")
print(f" API Key Found: {'OPENAI_API_KEY' in os.environ}")

if 'OPENAI_API_KEY' in os.environ:
    print(f"Key starts with: {os.environ['OPENAI_API_KEY'][:10]}...")
print("✅ Environment loaded")

Environment Loaded Successfully
 API Key Found: True
Key starts with: sk-proj-3Q...
✅ Environment loaded


# Core Langchain - Document Loaders, Text Splitters, Embedding, Vector Stroe, LLMs, 

In [3]:
# Document Loading
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("llm_fundamentals.pdf")
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"Content Preview: {documents[0].page_content[:50]}...")
print(f"Metadata Preview: {documents[0].metadata}")

Loaded 8 documents
Content Preview: @genieincodebottle 
Instagram | GitHub | Medium | ...
Metadata Preview: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-09-02T20:12:32+05:30', 'author': 'Rajesh Srivastava', 'moddate': '2025-09-02T20:12:32+05:30', 'source': 'llm_fundamentals.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len,
    separators = ["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(documents)

print(f"✅ Split {len(documents)} pages into {len(chunks)} chunks")
print(f"\nSample chunk:")
print(chunks[5].page_content)
print(f"\nMetadata: {chunks[5].metadata}")

✅ Split 8 pages into 37 chunks

Sample chunk:
5. Attention → Highlights the most relevant tokens in context 
6. Self-Attention → Each token attends to every other token for context 
7. Cross-Attention → Connect encoder and decoder (in encoder-decoder models) 
8. Multi-Head Attention → Several attention heads capture different patterns in parallel 
9. Feed-Forward Networks → Nonlinear layers that transform representations between 
attention blocks 
10. Residual Connections → Shortcut links that preserve signals and help gradient flow

Metadata: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-09-02T20:12:32+05:30', 'author': 'Rajesh Srivastava', 'moddate': '2025-09-02T20:12:32+05:30', 'source': 'llm_fundamentals.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}


In [None]:
#embedding
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name = "all-MiniLM-L6-v2"
)

test_text = "What is a large language model?"
test_embedding_vector = embeddings.embed_query(test_text)

print(f"Embedding Model loaded successfully.")
print(f"Embedding Dimension: {len(test_embedding_vector)}")
print(f"First 5 values: {test_embedding_vector[:5]}")

Embedding Model loaded successfully.
Embedding Dimension: 384
First 5 values: [0.045804012566804886, -0.08357568085193634, 0.008843314833939075, -0.02673397585749626, -0.05191471427679062]


In [9]:
# vector store
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(
    documents = chunks,
    embedding = embeddings,
    collection_name = "llm_fundamentals_collection"
)

print("✅ Vector store created with Chroma")
print(f"✅ Vector store created with {vectorstore._collection.count()} chunks")

✅ Vector store created with Chroma
✅ Vector store created with 111 chunks


In [10]:
# Test Similarity Search
query = "What is RAG?"
results = vectorstore.similarity_search(query, k=3)

print(f"Query: {query}\n")
for i, doc in enumerate(results, 1):
    print(f"Result {i}:")
    print(f"{doc.page_content[:150]}...")
    print(f"Source: Page {doc.metadata['page']}\n")

Query: What is RAG?

Result 1:
17. ALiBi / Relative Positional Encoding → Alternative to RoPE for long contexts 
18. Linear / Performer Attention → Efficient attention variants for ...
Source: Page 1

Result 2:
17. ALiBi / Relative Positional Encoding → Alternative to RoPE for long contexts 
18. Linear / Performer Attention → Efficient attention variants for ...
Source: Page 1

Result 3:
17. ALiBi / Relative Positional Encoding → Alternative to RoPE for long contexts 
18. Linear / Performer Attention → Efficient attention variants for ...
Source: Page 1



In [13]:
# LLMs
from langchain_openai import ChatOpenAI
import os

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature = 0.7,
    api_key=os.environ["OPENAI_API_KEY"]
)

response = llm.invoke("Explain RAG in simple terms.")

print("LLM Initialized")
print(f"Response: {response.content}")


LLM Initialized
Response: RAG stands for "Retrieval-Augmented Generation." It's a method used in natural language processing and artificial intelligence to improve how machines understand and generate text.

Here’s a simple breakdown:

1. **Retrieval**: When a question or prompt is given, the system first searches a large database or set of documents to find relevant information. This is like looking up facts or details in a library before answering a question.

2. **Augmentation**: The system then takes the information it found and uses it to help generate a more accurate and informed response. This step enhances the output by adding context and details that the model may not have remembered on its own.

3. **Generation**: Finally, the system combines the retrieved information with its own language skills to create a coherent and relevant answer.

In summary, RAG works by looking up information and using it to craft better answers, making the responses more accurate and informative.


## Now lets combine all components into a RAG system using RetrievalQA form (simplest method)

In [16]:
from langchain_classic.chains.retrieval_qa.base import RetrievalQA

# RAG QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # stuff = put all context in prompt
    retriever = vectorstore.as_retriever(search_kwargs={"k":3}),
    return_source_documents = True
)

print("✅ RAG QA Chain created successfully.")

✅ RAG QA Chain created successfully.


In [18]:
# Test RAG QA Chain

question = "What is LoRA and how is it used in LLMs?"
result = qa_chain.invoke({"query": question})

print(f"Question: {question}\n")
print(f"Answer:\n{result['result']}\n")
print("="*80)
print(f"Sources ({len(result['source_documents'])} chunks):")
for i, doc in enumerate(result['source_documents'], 1):
    print(f"\n{i}. Page {doc.metadata['page']}:")
    print(f"   {doc.page_content[:100]}...")

Question: What is LoRA and how is it used in LLMs?

Answer:
LoRA (Low-Rank Adaptation) is a method used in fine-tuning large language models (LLMs) by updating only small parts of the model instead of the entire model. It allows for efficient adaptation of these models to specific tasks or datasets, reducing the computational resources required for training. By focusing on low-rank updates, LoRA can maintain performance while significantly lowering the memory and processing power needed, making it feasible to fine-tune large models on more modest hardware.

Sources (3 chunks):

1. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...

2. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...

3. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...


In [21]:
# MMR avoids duplicate/similar results
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_type="mmr",  # Maximum Marginal Relevance
        search_kwargs={"k": 3, "fetch_k": 10}
    ),
    return_source_documents=True
)

print("✅ RAG QA Chain with MMR created successfully.")
# Test RAG QA Chain with MMR
question = "What is LoRA and how is it used in LLMs?"
result = qa_chain.invoke({"query": question})
print(f"Question: {question}\n")
print(f"Answer:\n{result['result']}\n")
print("="*80)
print(f"Sources ({len(result['source_documents'])} chunks):")
for i, doc in enumerate(result['source_documents'], 1):
    print(f"\n{i}. Page {doc.metadata['page']}:")
    print(f"   {doc.page_content[:100]}...")
    

✅ RAG QA Chain with MMR created successfully.
Question: What is LoRA and how is it used in LLMs?

Answer:
LoRA (Low-Rank Adaptation) is a method used in large language models (LLMs) that allows for efficient fine-tuning by updating only small parts of the model while keeping the majority of its parameters frozen. This approach helps to adapt large models to specific tasks without the need for extensive computational resources. LoRA achieves this by introducing low-rank matrices into the model's architecture, which enables it to learn task-specific adaptations efficiently.

Sources (3 chunks):

1. Page 2:
   9. QLoRA → LoRA + quantization, enabling fine-tuning of huge models on modest hardware 
10. PEFT → F...

2. Page 6:
   requirements 
11. Explainability / Interpretability Evaluation → Assess clarity and transparency of ...

3. Page 0:
   @genieincodebottle 
Instagram | GitHub | Medium | YouTube 
How to Be Better Than Most in GenAI 
 
Co...


## USing LECL Method, building personal/custom chain

In [None]:
from langchain_core.prompts import ChatPromptTemplate # used for custom prompt
from langchain_core.output_parsers import StrOutputParser # used for custom output parsing
from langchain_core.runnables import RunnablePassthrough


# custom prompt
template = """You are an AI assistant helping users understand LLM fundamentals.
Answer the question based ONLY on the provided context. Cite page numbers when possible.

Context: 
{context}

Question: {question}

Answer: """

prompt = ChatPromptTemplate.from_template(template)

# custom output parser. Helper function for custom output parsing
def format_docs(docs):
    return "\n\n".join([
        f"[Page {doc.metadata['page']}]\n{doc.page_content}"
        for doc in docs
    ])

# build chain using LECL
rag_chain = (
    {"context": vectorstore.as_retriever() | format_docs,  "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()

)
print("✅ Custom RAG Chain created successfully.")

SyntaxError: invalid syntax (4232659719.py, line 28)

In [27]:
# Use the custom chain
question = "What is attention mechanism?"
answer = rag_chain.invoke(question)

print(f"Question: {question}\n")
print(f"Answer:\n{answer}")

KeyError: "Input to ChatPromptTemplate is missing variables {'question'}.  Expected: ['context', 'question'] Received: ['context', 'Question']\nNote: if you intended {question} to be part of the string and not a variable, please escape it with double curly braces like: '{{question}}'.\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/INVALID_PROMPT_INPUT "