In [2]:

# =============================================================================
# RAG PIPELINE WITH LANGCHAIN
# Building a Retrieval-Augmented Generation System
# =============================================================================

# =============================================================================
# INSTALLATION (Run once per environment)
# =============================================================================

%pip install langchain langchain-community chromadb sentence-transformers \
             pypdf beautifulsoup4 requests python-dotenv langchain-ollama \
             matplotlib numpy youtube-transcript-api pytube \
             "unstructured[all-docs]" pillow


Collecting pi_heif (from unstructured[all-docs])
  Downloading pi_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.5 kB)
Collecting pdf2image (from unstructured[all-docs])
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfminer.six (from unstructured[all-docs])
  Downloading pdfminer_six-20251229-py3-none-any.whl.metadata (4.3 kB)
Collecting unstructured-inference>=1.1.1 (from unstructured[all-docs])
  Downloading unstructured_inference-1.1.2-py3-none-any.whl.metadata (5.5 kB)
Collecting google-cloud-vision (from unstructured[all-docs])
  Downloading google_cloud_vision-3.11.0-py3-none-any.whl.metadata (9.8 kB)
Collecting python-docx>=1.1.2 (from unstructured[all-docs])
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting onnx>=1.17.0 (from unstructured[all-docs])
  Downloading onnx-1.20.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting unstructured.pytessera

In [3]:
# Core utilities
import os
import time
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple
import numpy as np
import matplotlib.pyplot as plt

# Environment & document processing
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    WebBaseLoader,
    TextLoader,
    YoutubeLoader,
    UnstructuredImageLoader,
)
from langchain_core.prompts import PromptTemplate

# Vector database & embeddings
import chromadb
from sentence_transformers import SentenceTransformer

# LLM interfaces
from langchain_ollama.llms import OllamaLLM           # standard LLM
from langchain_ollama.chat_models import ChatOllama  # chat interface
from langchain_ollama.embeddings import OllamaEmbeddings

# Web scraping (you might still use it somewhere else)
import requests
from bs4 import BeautifulSoup

print("‚úì All imports successful")




‚úì All imports successful


In [4]:
# =============================================================================
# 2. CONFIGURATION
# =============================================================================

# Load environment variables
load_dotenv()

# Pipeline configuration
CONFIG = {
    'chunk_size': 1000,
    'chunk_overlap': 200,
    'top_k_retrieval': 5,
    'llm_model': 'llama3.1',
    'collection_name': 'rag_documents',
    'embedding_model': 'all-MiniLM-L6-v2'
}

# Create project directories
Path("documents").mkdir(exist_ok=True)
Path("results").mkdir(exist_ok=True)

print("‚úì Configuration loaded")
print(f"   LLM Model: {CONFIG['llm_model']}")
print(f"   Chunk size: {CONFIG['chunk_size']}, Overlap: {CONFIG['chunk_overlap']}")


‚úì Configuration loaded
   LLM Model: llama3.1
   Chunk size: 1000, Overlap: 200


In [9]:
# =============================================================================
# A. DOCUMENT LOADING & PREPROCESSING
# =============================================================================

class DocumentProcessor:
    """Handles loading and chunking documents from multiple sources."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        self.documents = []   # raw documents (PDF pages, web pages, etc.)
        self.chunks = []      # smaller chunks after splitting
        print(f"‚úì DocumentProcessor initialized (chunk_size={chunk_size}, overlap={chunk_overlap})")

    # ---------- helper ----------
    def _add_type_and_source(self, docs, source: str, doc_type: str):
        """Normalize metadata for loaded documents and store them."""
        for d in docs:
            if d.metadata is None:
                d.metadata = {}
            d.metadata.setdefault("source", source)
            d.metadata["type"] = doc_type
        self.documents.extend(docs)

    # ---------- 1) PDFs ----------
    def load_pdf(self, pdf_path: str):
        """Load and process PDF document."""
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()   # typically one Document per page
        self._add_type_and_source(docs, source=pdf_path, doc_type="pdf")
        print(f"   üìÑ Loaded PDF: {pdf_path} ({len(docs)} pages)")

    # ---------- 2) Text files (e.g., textbook chapters) ----------
    def load_text(self, text_path: str):
        """Load plain text file."""
        loader = TextLoader(text_path, encoding="utf-8")
        docs = loader.load()
        self._add_type_and_source(docs, source=text_path, doc_type="text")
        print(f"   üìò Loaded text file: {text_path} ({len(docs)} document(s))")

    # ---------- 3) Web articles ----------
    def load_web(self, url: str):
        """Scrape and load web content."""
        loader = WebBaseLoader(url)
        docs = loader.load()
        self._add_type_and_source(docs, source=url, doc_type="web")
        print(f"   üåê Loaded web article: {url} ({len(docs)} document(s))")

    # ---------- 4) YouTube videos (transcripts) ----------
    def load_youtube_video(self, url: str):
        """Load YouTube video transcript as document(s)."""
        try:
            loader = YoutubeLoader.from_youtube_url(
                url,
                add_video_info=True,   # adds title, channel, etc. to metadata
            )
            docs = loader.load()
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to load YouTube video: {url}")
            print(f"   Reason: {e}")
            return  # don't crash the whole pipeline

        self._add_type_and_source(docs, source=url, doc_type="video")
        print(f"   üé• Loaded YouTube video transcript: {url} ({len(docs)} document(s))")

    # ---------- 5) Images (OCR via Unstructured) ----------
    def load_image(self, image_path: str):
        """Load image and extract text using UnstructuredImageLoader."""
        loader = UnstructuredImageLoader(image_path)
        docs = loader.load()
        self._add_type_and_source(docs, source=image_path, doc_type="image")
        print(f"   üñºÔ∏è Loaded image: {image_path} ({len(docs)} document(s))")

    # ---------- Chunking ----------
    def chunk_documents(self):
        """Split all loaded documents into chunks."""
        if not self.documents:
            print("‚ö†Ô∏è No documents loaded. Nothing to chunk.")
            self.chunks = []
            return

        self.chunks = self.text_splitter.split_documents(self.documents)

        # Add chunk_id and normalize page info if available
        for idx, chunk in enumerate(self.chunks):
            if chunk.metadata is None:
                chunk.metadata = {}
            chunk.metadata.setdefault("chunk_id", idx)

            # Normalize page number: different loaders may use 'page' or 'page_number'
            if "page" in chunk.metadata:
                page_number = chunk.metadata["page"]
            elif "page_number" in chunk.metadata:
                page_number = chunk.metadata["page_number"]
            else:
                page_number = None
            chunk.metadata["page_number"] = page_number

        print(f"‚úì Created {len(self.chunks)} chunks from {len(self.documents)} documents.")

    def get_chunks_with_metadata(self) -> Tuple[List[str], List[Dict]]:
        """Return separate lists of chunk texts and metadata."""
        texts = [str(c.page_content) for c in self.chunks]
        metadatas = [dict(c.metadata) if c.metadata is not None else {} for c in self.chunks]
        return texts, metadatas

    def print_stats(self):
        """Display document processing statistics."""
        print(f"\nüìä Document Processing Stats:")
        print(f"   Total documents loaded: {len(self.documents)}")
        print(f"   Total chunks created: {len(self.chunks)}")
        if self.chunks:
            avg_len = sum(len(str(c.page_content)) for c in self.chunks) / len(self.chunks)
            print(f"   Average chunk length: {avg_len:.0f} characters")

# Initialize document processor
doc_processor = DocumentProcessor(
    chunk_size=CONFIG['chunk_size'],
    chunk_overlap=CONFIG['chunk_overlap']
)


‚úì DocumentProcessor initialized (chunk_size=1000, overlap=200)


In [10]:
# =============================================================================
# DOCUMENT LOADING SECTION
# =============================================================================
print("\n" + "="*60)
print("LOADING DOCUMENTS")
print("="*60)

pdf_files = [
    "documents/rag_paper.pdf",
    "documents/transformers_paper.pdf",
    "documents/dpr_paper.pdf"
]

web_urls = [
    "https://python.langchain.com/docs/get_started/introduction",
    "https://docs.trychroma.com/usage-guide",
    "https://www.promptingguide.ai/techniques/rag",
]

text_files = [
    "documents/rag_textbook_chapter.txt",
]

youtube_urls = [
    "https://www.youtube.com/watch?v=O5nskjZ_GoI"   # DeepMind transformers video
]

image_files = [
    "documents/diagram_rag_pipeline.png",
    "documents/whiteboard_notes_llm.jpg",
]

# ----- Load all documents -----
for pdf in pdf_files:
    if Path(pdf).exists():
        doc_processor.load_pdf(pdf)
    else:
        print(f"   ‚ö†Ô∏è PDF not found: {pdf}")

for url in web_urls:
    doc_processor.load_web(url)

for txt in text_files:
    if Path(txt).exists():
        doc_processor.load_text(txt)
    else:
        print(f"   ‚ö†Ô∏è Text file not found: {txt}")

for url in youtube_urls:
    doc_processor.load_youtube_video(url)

for img in image_files:
    if Path(img).exists():
        doc_processor.load_image(img)
    else:
        print(f"   ‚ö†Ô∏è Image not found: {img}")

print(f"\n‚úì Total documents loaded: {len(doc_processor.documents)}")

# ----- Chunk Documents -----
doc_processor.chunk_documents()
doc_processor.print_stats()

# ----- Preview -----
if doc_processor.chunks:
    print("\nüìÑ First chunk preview:")
    print(doc_processor.chunks[0].page_content[:300] + "...")



LOADING DOCUMENTS
   üìÑ Loaded PDF: documents/rag_paper.pdf (19 pages)
   üìÑ Loaded PDF: documents/transformers_paper.pdf (15 pages)
   üìÑ Loaded PDF: documents/dpr_paper.pdf (13 pages)
   üåê Loaded web article: https://python.langchain.com/docs/get_started/introduction (1 document(s))
   üåê Loaded web article: https://docs.trychroma.com/usage-guide (1 document(s))
   üåê Loaded web article: https://www.promptingguide.ai/techniques/rag (1 document(s))
   üìò Loaded text file: documents/rag_textbook_chapter.txt (1 document(s))
‚ö†Ô∏è Failed to load YouTube video: https://www.youtube.com/watch?v=O5nskjZ_GoI
   Reason: HTTP Error 400: Bad Request


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

   üñºÔ∏è Loaded image: documents/diagram_rag_pipeline.png (1 document(s))
   üñºÔ∏è Loaded image: documents/whiteboard_notes_llm.jpg (1 document(s))

‚úì Total documents loaded: 53
‚úì Created 237 chunks from 53 documents.

üìä Document Processing Stats:
   Total documents loaded: 53
   Total chunks created: 237
   Average chunk length: 869 characters

üìÑ First chunk preview:
Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis‚Ä†‚Ä°, Ethan Perez‚ãÜ,
Aleksandra Piktus‚Ä†, Fabio Petroni‚Ä†, Vladimir Karpukhin‚Ä†, Naman Goyal‚Ä†, Heinrich K√ºttler‚Ä†,
Mike Lewis‚Ä†, Wen-tau Yih‚Ä†, Tim Rockt√§schel‚Ä†‚Ä°, Sebastian Riedel‚Ä†‚Ä°, Douwe Kiela‚Ä†
‚Ä†Facebook AI Research; ‚Ä°University ...


In [11]:
def run_chunk_experiment(chunk_size, chunk_overlap):
    print("\n" + "="*60)
    print(f"CHUNKING EXPERIMENT: size={chunk_size}, overlap={chunk_overlap}")
    print("="*60)

    processor = DocumentProcessor(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Reuse same document lists
    for pdf in pdf_files:
        if Path(pdf).exists():
            processor.load_pdf(pdf)
    for url in web_urls:
        processor.load_web(url)
    for txt in text_files:
        if Path(txt).exists():
            processor.load_text(txt)
    for url in youtube_urls:
        processor.load_youtube_video(url)
    for img in image_files:
        if Path(img).exists():
            processor.load_image(img)

    processor.chunk_documents()
    processor.print_stats()
    return processor

# Example experiments
exp_small = run_chunk_experiment(500, 100)
exp_large = run_chunk_experiment(1500, 300)



CHUNKING EXPERIMENT: size=500, overlap=100
‚úì DocumentProcessor initialized (chunk_size=500, overlap=100)
   üìÑ Loaded PDF: documents/rag_paper.pdf (19 pages)
   üìÑ Loaded PDF: documents/transformers_paper.pdf (15 pages)
   üìÑ Loaded PDF: documents/dpr_paper.pdf (13 pages)
   üåê Loaded web article: https://python.langchain.com/docs/get_started/introduction (1 document(s))
   üåê Loaded web article: https://docs.trychroma.com/usage-guide (1 document(s))
   üåê Loaded web article: https://www.promptingguide.ai/techniques/rag (1 document(s))
   üìò Loaded text file: documents/rag_textbook_chapter.txt (1 document(s))
‚ö†Ô∏è Failed to load YouTube video: https://www.youtube.com/watch?v=O5nskjZ_GoI
   Reason: HTTP Error 400: Bad Request
   üñºÔ∏è Loaded image: documents/diagram_rag_pipeline.png (1 document(s))
   üñºÔ∏è Loaded image: documents/whiteboard_notes_llm.jpg (1 document(s))
‚úì Created 468 chunks from 53 documents.

üìä Document Processing Stats:
   Total documents 