# Data ingestion -> chunking 

In [21]:
import os 
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

### PDF doc cleaning and formating before chunking and during ingestion :

In [22]:
import re

def clean_pdf_text(text):
    # Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)

    # Replace single newline inside paragraphs with space
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Remove standalone page numbers
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


In [23]:
### read all the pdf in the dir 

def process_all_pdfs(pdf_directory):
    """Process all PDF in a Directory"""
    all_Documents=[]
    pdf_dir = Path(pdf_directory)
    pdf_files=list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")
    # a pdf file will contain many documents in it;
    for pdf_file in pdf_files:
        print(f"\nProcessing:{pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents=loader.load()

            for doc in documents:
                doc.page_content = clean_pdf_text(doc.page_content)
                doc.metadata['source_file']=pdf_file.name
                doc.metadata['file_type']='pdf'
            all_Documents.extend(documents)
            print(f"loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error {e}")
    print(f"\nTotal documents loaded : {len(all_Documents)}")
    return all_Documents
print(process_all_pdfs.__doc__)
docs = process_all_pdfs("../data")


Process all PDF in a Directory
Found 2 PDF files to process

Processing:MIT-UG Academic HandBook 2025 -2026 - Revised.pdf
loaded 409 pages

Processing:OSDL manual.pdf
loaded 59 pages

Total documents loaded : 468


In [24]:
print(docs[0].page_content[:800])

(A constituent unit of MAHE, Manipal) MANIPAL INSTITUTE OF TECHNOLOGY MANIPAL 2025-26 B.Tech. 2025 - 26 Academic Programme Hand book


# CHUNKING 

In [None]:
## chunking : splitting file to get smaller chunk for better data retrival
def chunk_docs(documents,chunk_size=750,chunk_overlap =150):
    """Split documents imto smaller chunks """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n","\n"," ",""]
    )
    split_docs=text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    #show example of a chunk
    if split_docs:
        print(f"\nExample Chunk:")
        print(f"Content :{split_docs[0].page_content[:200]}...")
        print(f"metadata:{split_docs[0].metadata}")
    return split_docs


In [30]:
chunks=chunk_docs(docs)

Split 468 documents into 3397 chunks

Example Chunk
Content :(A constituent unit of MAHE, Manipal) MANIPAL INSTITUTE OF TECHNOLOGY MANIPAL 2025-26 B.Tech. 2025 - 26 Academic Programme Hand book...
metadata:{'producer': 'Corel PDF Engine Version 23.5.0.506', 'creator': 'CorelDRAW 2021', 'creationdate': '2025-07-08T11:23:57+05:30', 'source': '..\\data\\pdf_files\\MIT-UG Academic HandBook 2025 -2026 - Revised.pdf', 'file_path': '..\\data\\pdf_files\\MIT-UG Academic HandBook 2025 -2026 - Revised.pdf', 'total_pages': 409, 'format': 'PDF 1.7', 'title': '', 'author': 'Harish Shetty', 'subject': '', 'keywords': '', 'moddate': '2025-07-09T11:29:33+05:30', 'trapped': '', 'modDate': "D:20250709112933+05'30'", 'creationDate': "D:20250708112357+05'30'", 'page': 0, 'source_file': 'MIT-UG Academic HandBook 2025 -2026 - Revised.pdf', 'file_type': 'pdf'}


## Embedding and vector DB

In [31]:
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Tuple,Dict,Any
from sklearn.metrics.pairwise import cosine_similarity



# Embeddings, Frameworks & Python OOP (RAG Context)

---

## Libraries Used 

### numpy 
- Used for numerical computations. 
- Handles vectors and matrices. 
- Embeddings are numerical vectors → NumPy helps manipulate them. 

---

### sentence_transformers 
- Provides pretrained embedding models. 
- Converts text → fixed-size dense vector. 
- Example model: `"all-MiniLM-L6-v2"` 
- Embedding dimension: **384** 

Example: 
```python
model = SentenceTransformer("all-MiniLM-L6-v2")
vector = model.encode("Hello world")


In [37]:
class EmbeddingManager:
    def __init__(self,model_name:str='all-MiniLM-L6-v2'):
        #DOCSTRINGS
        """
        Initialize the Embedding manager
        Args:
            model_name: HuggingFace model name for sentence embeddings
       
        """
        self.model_name=model_name
        self.model=None
        self._load_model()#it is going to load this model using a protected fuc
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding Model {self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Loaded sucessfully .Embeding Dimention :{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading Model{self.model_name}:{e}")
            raise
    def generate_embeddings(self,texts:List[str])->np.ndarray:
        """
        Generate embeddings for a list of texts
        Args:
            texts:List of text Strings to embed
        Returns:
            numpy array of embeddings with shape (len(texts)),embedding_dim)
        """
        if self.model is None:
            raise ValueError("Model Not loaded")
        print(f"Generating embedding for{len(texts)} texts..")
        embeddings= self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with Shape:{embeddings.shape}")
        return embeddings
    def get_embedding_dimension(self)-> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not Loaded")
        return self.model.get_sentence_embedding_dimension()

###initiaize the embeddingmanager 

Embedding_manager=EmbeddingManager()
EmbeddingManager
    

    

Loading embedding Model all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 609.52it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loaded sucessfully .Embeding Dimention :384


__main__.EmbeddingManager

## EmbeddingManager Class – Explanation
---
## Purpose

- Wrapper around a HuggingFace SentenceTransformer model.
- Loads embedding model once.
- Stores it inside object for reuse.
- Designed for clean and scalable ML architecture.

---

## Class Definition

```python
class EmbeddingManager:
