### RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [10]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [11]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")


Found 8 PDF files to process

Processing: cn.pdf
  ✓ Loaded 1 pages

Processing: dsa.pdf
  ✓ Loaded 1 pages

Processing: Shreya_Sharma_Resume.pdf
  ✓ Loaded 1 pages

Processing: Shreya_Sharma_Transcript.pdf
  ✓ Loaded 1 pages

Processing: dbms.pdf
  ✓ Loaded 1 pages

Processing: ml.pdf
  ✓ Loaded 1 pages

Processing: os.pdf
  ✓ Loaded 1 pages

Processing: Siddhant_Kochhar_Resume.pdf
  ✓ Loaded 1 pages

Total documents loaded: 8


In [12]:
all_pdf_documents

[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'cn', 'source': '../data/pdf_files/cn.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'cn.pdf', 'file_type': 'pdf'}, page_content='Computer  Networks  enable  communication  between  devices  through  a  set  of  rules  called  protocols.  Among  \nthese,\n \nTCP\n \n(Transmission\n \nControl\n \nProtocol)\n \nand\n \nUDP\n \n(User\n \nDatagram\n \nProtocol)\n \nare\n \ntwo\n \nof\n \nthe\n \nmost\n \nwidely\n \nused\n \ntransport-layer\n \nprotocols\n \ndefined\n \nin\n \nthe\n \nTCP/IP\n \nmodel.\n \nTCP  is  a  connection-oriented  protocol,  meaning  a  connection  must  be  established  before  data  can  be  \nexchanged.\n \nIt\n \nensures\n \nreliable\n \nand\n \nordered\n \ndelivery\n \nthrough\n \nmechanisms\n \nlike\n \nacknowledgment\n \n(ACK),\n \nretransmission,\n \nflow\n \ncontrol,\n \nand\n \ncongestion\n \ncontrol.\n \nTCP\n \nbrea

In [18]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", "","  ","●"]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:500]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [17]:
chunks=split_documents(all_pdf_documents)
chunks

Split 8 documents into 28 chunks

Example chunk:
Content: Computer  Networks  enable  communication  between  devices  through  a  set  of  rules  called  protocols.  Among  
these,
 
TCP
 
(Transmission
 
Control
 
Protocol)
 
and
 
UDP
 
(User
 
Datagram
 ...
Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'cn', 'source': '../data/pdf_files/cn.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'cn.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'cn', 'source': '../data/pdf_files/cn.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'cn.pdf', 'file_type': 'pdf'}, page_content='Computer  Networks  enable  communication  between  devices  through  a  set  of  rules  called  protocols.  Among  \nthese,\n \nTCP\n \n(Transmission\n \nControl\n \nProtocol)\n \nand\n \nUDP\n \n(User\n \nDatagram\n \nProtocol)\n \nare\n \ntwo\n \nof\n \nthe\n \nmost\n \nwidely\n \nused\n \ntransport-layer\n \nprotocols\n \ndefined\n \nin\n \nthe\n \nTCP/IP\n \nmodel.\n \nTCP  is  a  connection-oriented  protocol,  meaning  a  connection  must  be  established  before  data  can  be  \nexchanged.\n \nIt\n \nensures\n \nreliable\n \nand\n \nordered\n \ndelivery\n \nthrough\n \nmechanisms\n \nlike\n \nacknowledgment\n \n(ACK),\n \nretransmission,\n \nflow\n \ncontrol,\n \nand\n \ncongestion\n \ncontrol.\n \nTCP\n \nbrea

### embedding And vectorStoreDB

In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
