### RAG Pipelines - Data Ingestion to Vector DB Pipelines

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [4]:
### Read all PDFs inside the directory


def process_all_pdfs(pdf_directory):
    """
    Load every PDF found under a directory (searched recursively).

    Args:
        pdf_directory: Path (str or Path) of the folder to scan.
    Returns:
        A flat list of the documents returned by PyMuPDFLoader for every file
        that loaded successfully. Each document gets "source_file" and
        "file_type" added to its metadata. Files that fail to load are
        reported on stdout and skipped; a missing directory yields an empty list.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Report a wrong path explicitly instead of silently finding zero files
    if not pdf_dir.is_dir():
        print(f"PDF directory not found: {pdf_dir}")
        return all_documents

    # Find all PDFs recursively. The extension match is case-insensitive so
    # "report.PDF" is picked up on case-sensitive filesystems, and the result is
    # sorted so the document order is reproducible across runs and machines.
    pdf_files = sorted(
        path
        for path in pdf_dir.rglob("*")
        if path.is_file() and path.name.lower().endswith(".pdf")
    )

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = str(pdf_file)
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole ingestion
            print(f"Error processing {pdf_file}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Ingest every PDF located under the local "data" folder
all_pdf_documents = process_all_pdfs(pdf_directory="data")

Found 2 PDF files to process.

Processing: 61076471_1.pdf
Loaded 17 pages

Processing: 61076471_2.pdf
Loaded 2 pages

Total documents loaded: 19


In [5]:
# Preview only the first couple of pages: displaying the whole list floods the
# cell output and stores the full page text in the saved notebook.
all_pdf_documents[:2]

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-04-12T18:39:30+05:30', 'source': 'data\\pdf\\61076471_1.pdf', 'file_path': 'data\\pdf\\61076471_1.pdf', 'total_pages': 17, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-12T18:39:30+05:30', 'trapped': '', 'modDate': "D:20240412183930+05'30'", 'creationDate': "D:20240412183930+05'30'", 'page': 0, 'source_file': 'data\\pdf\\61076471_1.pdf', 'file_type': 'pdf'}, page_content='___________________________________________________________________________ \n \nLTIMindtree Limited is a subsidiary of Larsen & Toubro Limited \n \n©LTIMindtree | Confidential 2024 \nSpecialist - Service \nSpecialist - Service Design \nSeptember 12, 2022 \nCustomer Success Team (CST) BU \nDATE: April 12, 2024 \nRef: LTIMindtree/HR/EDGE/2024 \n \nEmployee Name : Sonu . \nPS Number          : 61076471 \nCountry                : In

In [10]:
### Text splitting: break the documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not pieces:
        return pieces

    # Preview the first chunk: its leading 200 characters and its metadata
    sample = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {sample.page_content[:200]}...")
    print(f"Metadata: {sample.metadata}")
    return pieces

In [7]:
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks: displaying the whole list floods the cell
# output and stores the full chunk text in the saved notebook.
chunks[:3]

Split 19 documents into 68 chunks

Example chunk:
Content: ___________________________________________________________________________ 
 
LTIMindtree Limited is a subsidiary of Larsen & Toubro Limited 
 
©LTIMindtree | Confidential 2024 
Specialist - Service ...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-04-12T18:39:30+05:30', 'source': 'data\\pdf\\61076471_1.pdf', 'file_path': 'data\\pdf\\61076471_1.pdf', 'total_pages': 17, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-12T18:39:30+05:30', 'trapped': '', 'modDate': "D:20240412183930+05'30'", 'creationDate': "D:20240412183930+05'30'", 'page': 0, 'source_file': 'data\\pdf\\61076471_1.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-04-12T18:39:30+05:30', 'source': 'data\\pdf\\61076471_1.pdf', 'file_path': 'data\\pdf\\61076471_1.pdf', 'total_pages': 17, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-12T18:39:30+05:30', 'trapped': '', 'modDate': "D:20240412183930+05'30'", 'creationDate': "D:20240412183930+05'30'", 'page': 0, 'source_file': 'data\\pdf\\61076471_1.pdf', 'file_type': 'pdf'}, page_content='___________________________________________________________________________ \n \nLTIMindtree Limited is a subsidiary of Larsen & Toubro Limited \n \n©LTIMindtree | Confidential 2024 \nSpecialist - Service \nSpecialist - Service Design \nSeptember 12, 2022 \nCustomer Success Team (CST) BU \nDATE: April 12, 2024 \nRef: LTIMindtree/HR/EDGE/2024 \n \nEmployee Name : Sonu . \nPS Number          : 61076471 \nCountry                : In

### Embedding and VectorStoreDB

In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformers."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the embedding manager with the specified model.
        Args:
            model_name: Hugging Face model name for the sentence transformer.
        Raises:
            Exception: Any error raised while loading the model is re-raised.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        Loads the sentence transformer model named by ``self.model_name``.
        Raises:
            Exception: Any error raised by ``SentenceTransformer`` is logged and re-raised.
        """
        try:
            # Announce the attempt first; success is reported only after the
            # constructor has actually returned.
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded embedding model: {self.model_name}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            # Bare raise keeps the original traceback intact
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generates embeddings for a list of texts.
        Args:
            texts: List of strings to embed.
        Returns:
            A numpy array of embeddings with shape (len(texts), embedding_dimension).
        Raises:
            ValueError: If no model has been loaded.
            Exception: Any error raised by the model's ``encode`` is logged and re-raised.
        """
        # Compare against None explicitly: a loaded model object may define
        # __len__/__bool__, so truthiness is not a reliable "is loaded" check.
        if self.model is None:
            raise ValueError("Model not loaded. Cannot generate embeddings.")
        try:
            print(f"Generating embeddings for {len(texts)} texts...")
            embeddings = self.model.encode(texts, show_progress_bar=True)
            print(f"Generated embeddings with shape: {embeddings.shape}")
            return embeddings
        except Exception as e:
            print(f"Error generating embeddings: {e}")
            raise
        
    # def get_sentence_embedding_dimension(self) -> int:
    #     """ Returns the dimension of the sentence embeddings. """
    #     if not self.model:
    #         raise ValueError("Model not loaded. Cannot get embedding dimension.")
    #     return self.model.get_sentence_embedding_dimension()

### Create the embedding manager (default model: "all-MiniLM-L6-v2")
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embedding_manager


Loaded embedding model: all-MiniLM-L6-v2


'[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:997)' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.
'[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:997)' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/adapter_config.json
Retrying in 1s [Retry 1/5].


Error loading model all-MiniLM-L6-v2: Cannot send a request, as the client has been closed.


RuntimeError: Cannot send a request, as the client has been closed.