### RAG Pipeline - from data ingestion to Vector DB

In [5]:
import os
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [10]:
### Read all the pdfs inside the directory

def process_all_pdfs(pdf_directory):
  all_docs=[]
  pdf_dir=Path(pdf_directory)
  
  #find all pdf files recursively
  pdf_files=list(pdf_dir.glob("**/*.pdf"))
  print(f"found {len(pdf_files)} PDF files to process")
  
  for pdf in pdf_files:
    print(f"\nprocessing: {pdf.name}")
    try:
      loader=PyMuPDFLoader(str(pdf))
      documents=loader.load()
      
      # add source info to metadata
      for doc in documents:
        doc.metadata['source_file']=pdf.name
        doc.metadata['file_type']='pdf'
        
      all_docs.extend(documents)
      print(f"‚òëÔ∏è loaded {len(documents)} pages.")
    except Exception as e:
      print(f"‚ùå Error : {e}")
  print(f"\n Total docs loaded : {len(all_docs)}")
  return all_docs

all_pdfs=process_all_pdfs("../data")
        



found 4 PDF files to process

processing: embeddings.pdf
‚òëÔ∏è loaded 27 pages.

processing: NNDT.pdf
‚òëÔ∏è loaded 8 pages.

processing: attention.pdf
‚òëÔ∏è loaded 11 pages.

processing: objectdetection.pdf
‚òëÔ∏è loaded 21 pages.

 Total docs loaded : 67


In [11]:
all_pdfs

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-24T11:50:50-07:00', 'source': '../data/pdf/embeddings.pdf', 'file_path': '../data/pdf/embeddings.pdf', 'total_pages': 27, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-24T11:50:50-07:00', 'trapped': '', 'modDate': "D:20250824115050-07'00'", 'creationDate': "D:20250824115050-07'00'", 'page': 0, 'source_file': 'embeddings.pdf', 'file_type': 'pdf'}, page_content='Speech and Language Processing.\nDaniel Jurafsky & James H. Martin.\nCopyright ¬© 2025.\nAll\nrights reserved.\nDraft of August 24, 2025.\nCHAPTER\n5\nEmbeddings\nËçÉËÄÖÊâÄ‰ª•Âú®È±ºÔºåÂæóÈ±ºËÄåÂøòËçÉNets are for Ô¨Åsh;\nOnce you get the Ô¨Åsh, you can forget the net.\nË®ÄËÄÖÊâÄ‰ª•Âú®ÊÑèÔºåÂæóÊÑèËÄåÂøòË®ÄWords are for meaning;\nOnce you get the meaning, you can forget the words\nÂ∫ÑÂ≠ê(Zhuangzi), Chapter 26\nThe asphalt that Los Angeles is famous for occurs mainly on its 

In [None]:
# Text splitting into chunks

def split_documents(doc,chunk_size=1000,chunk_overlap=200):
  """Split Documents into smaller chunks for better RAG performance"""
  text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    separators=["\n\n","\n"," ",""]
  )
  splitted_docs=text_splitter.split_documents(doc)
  print(f"Splitted {len(doc)} documents into {len(splitted_docs)} chunks.")
  
  #show example of a chunk
  if splitted_docs:
    print(f"\n Example chunk:")
    print(f"Content: {splitted_docs[0].page_content[:200]}")
    print(f"Metadata: {splitted_docs[0].metadata}")
  return splitted_docs




In [14]:
chunks=split_documents(all_pdfs)

Splitted 67 into 379 chunks.

 Example chunk:
Content: Speech and Language Processing.
Daniel Jurafsky & James H. Martin.
Copyright ¬© 2025.
All
rights reserved.
Draft of August 24, 2025.
CHAPTER
5
Embeddings
ËçÉËÄÖÊâÄ‰ª•Âú®È±ºÔºåÂæóÈ±ºËÄåÂøòËçÉNets are for Ô¨Åsh;
Once you get the 
Metadata: {'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-24T11:50:50-07:00', 'source': '../data/pdf/embeddings.pdf', 'file_path': '../data/pdf/embeddings.pdf', 'total_pages': 27, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-24T11:50:50-07:00', 'trapped': '', 'modDate': "D:20250824115050-07'00'", 'creationDate': "D:20250824115050-07'00'", 'page': 0, 'source_file': 'embeddings.pdf', 'file_type': 'pdf'}
