###Data ingestion

In [18]:
from langchain_core.documents import Document

In [2]:
doc = Document(
    page_content = "This is the food data",
    metadata = {
        "source" : "food.txt",
        "pages" : 1,
        "author" : "Chef John Doe",
        "date_created" : "2026-0103"
    } 
)

In [3]:
doc

Document(metadata={'source': 'food.txt', 'pages': 1, 'author': 'Chef John Doe', 'date_created': '2026-0103'}, page_content='This is the food data')

In [11]:
##  Simple txt file

import os 
os.makedirs("../data/text_files",exist_ok=True)

In [2]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/recipe1.txt")

In [10]:
loader.load()

[Document(metadata={'source': '../data/text_files/recipe1.txt'}, page_content='Heat oil in a 3.4 L Stainless Steel SautÃ© Pan over medium heat and sear chicken pieces until golden brown. Perform this step in batches if necessary. Season and set aside on a plate.\n\nDeglaze with lemon juice, stir bottom of pan well, then add shallots, garlic, ginger and curry paste. Cook over low heat for 4 to 5 minutes.\n\nAdd zucchini and corn, continue cooking for 2-3 minutes, then stir in coconut milk, tahini, turmeric, honey, coriander and chicken.\n\nBring to the boil, then simmer over medium-low heat for 30 minutes. Adjust seasoning if necessary.\n\nServe the curry with rice, top with cucumbers and green onions, then sprinkle with sesame seeds.')]

In [4]:
# load pdf file 

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_community.document_loaders import DirectoryLoader

pdf_loader = DirectoryLoader("../data/pdf",
                           glob = "**/*.pdf",
                           loader_cls = PyMuPDFLoader,
                           show_progress = False)


pdf_documents = pdf_loader.load()

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [5]:
pdf_documents

[Document(page_content='1\n', metadata={'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 0, 'total_pages': 178, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 14.0 (Windows)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20200213105254+05'30'", 'modDate': "D:20200213105327+05'30'", 'trapped': ''}),
 Document(page_content='', metadata={'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 1, 'total_pages': 178, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 14.0 (Windows)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20200213105254+05'30'", 'modDate': "D:20200213105327+05'30'", 'trapped': ''}),
 Document(page_content='', metadata={'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 2, 'total_pages': 1

Embedding and vectorstore DB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class EmbeddingManager:
    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print("Loading embedding model : ", self.model_name)
            self.model = SentenceTransformer(self.model_name)
        except Exception as e:
            print("error loading model")
            raise 

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not Loaded")
        
        print(f"generating embedding for {len(texts)}")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embedding {embeddings.shape}")
        return embeddings


Initialize the embedding manager

In [8]:
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model :  all-MiniLM-L6-v2


<__main__.EmbeddingManager at 0x20b7f559be0>

Vector Store

In [31]:
class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()

    def _initialize_client(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.Client(Settings(persist_directory = self.persist_directory,
                                                   anonymized_telemetry=False))

            self.collection = self.client.get_or_create_collection(name = self.collection_name, metadata = {"description": "PDF document embedding"}
                                                                   )
            print(f"Vector store initialized : {self.collection_name}")
            print(f"Existing document in collection : {self.collection.count()}")

        except Exception as e:
            print("Error initializing vector store")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Length of documents should match")
        
        ids = []
        metadatas = []
        documents_texts = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #prepare metadata

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_texts.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids = ids,
                metadatas = metadatas,
                documents = documents_texts,
                embeddings = embeddings_list
            )
            print(f"Added {len(documents)} documents to vector store")



        except Exception as e:
            print(f"Error adding documents to vector store at index {i}")
            raise
    

In [16]:
vector_store = VectorStore()
vector_store

Failed to send telemetry event client_start: capture() takes 1 positional argument but 3 were given
Using embedded DuckDB without persistence: data will be transient
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


Vector store initialized : pdf_documents
Existing document in collection : 0


<__main__.VectorStore at 0x20b7f63a660>

In [21]:
chunks = split_documents(pdf_documents)

Split 178 documents into 234 chunks

Example chunk:
Content: 1...
Metadata: {'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 0, 'total_pages': 178, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 14.0 (Windows)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20200213105254+05'30'", 'modDate': "D:20200213105327+05'30'", 'trapped': ''}


In [24]:
chunks

[Document(page_content='1', metadata={'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 0, 'total_pages': 178, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 14.0 (Windows)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20200213105254+05'30'", 'modDate': "D:20200213105327+05'30'", 'trapped': ''}),
 Document(page_content='Garden to Kitchen\nNutrient Rich Recipes from Home Garden Produce \nVigyan Prasar\nAn Autonomous Organisation of the \nDepartment of Science & Technology, Govt. of India', metadata={'source': '..\\data\\pdf\\Recipes Book.pdf', 'file_path': '..\\data\\pdf\\Recipes Book.pdf', 'page': 4, 'total_pages': 178, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 14.0 (Windows)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20200213105254+05'30'", 'modDate': "D:20200213105327+05'30'", 'trappe

In [22]:
texts = [doc.page_content for doc in chunks]

In [23]:
texts

['1',
 'Garden to Kitchen\nNutrient Rich Recipes from Home Garden Produce \nVigyan Prasar\nAn Autonomous Organisation of the \nDepartment of Science & Technology, Govt. of India',
 'Garden to Kitchen\nNutrient Rich Recipes from Home Garden Produce \nRekha Sinha \nHead and Chief Scientist\nDepartment of Home Science\nBirsa Agricultural University, Ranchi\n  \n&\nKinkini Dasgupta Misra\nScientist F \nVigyan Prasar',
 'Published By\nVigyan Prasar (An autonomous organization of the Department of Science & Technology, Government of India), \nA 50, Institutional Area, Sector - 62, Noida - 201309, Uttar Pradesh, India\n(Regd. Office: Technology Bhawan, New Delhi 110016)\nPhones: 0120-2404430-35,  Fax: 91-120-2404437\nE-mail: info@vigyanprasar.gov.in\nWebsite: https://www.vigyanprasar.gov.in\nCopyright: © 2019 by Vigyan Prasar, All rights reserved\nConceptualization: Nakul Parashar, Director, Vigyan Prasar\nContent Development: Rekha Sinha & Kinkini Dasgupta Misra\nTechnical Experts: \nRekha S

In [25]:
#  generate the embeddings

embeddings = embedding_manager.generate_embeddings(texts)

generating embedding for 234


Batches: 100%|██████████| 8/8 [00:01<00:00,  4.93it/s]

Generated embedding (234, 384)





In [26]:
embeddings

array([[-0.04356467,  0.00185986, -0.05116582, ...,  0.0684123 ,
         0.00977213, -0.00619245],
       [-0.05173695,  0.00575788, -0.07455291, ..., -0.00889213,
         0.016043  , -0.03968304],
       [-0.03676435,  0.01526926, -0.06243809, ...,  0.05827042,
        -0.01137348, -0.05676283],
       ...,
       [ 0.00555939,  0.07568446, -0.12203792, ..., -0.05620275,
        -0.09680133, -0.01586703],
       [-0.03108643,  0.02920225, -0.10655692, ..., -0.11562077,
        -0.10493186, -0.04379419],
       [ 0.02035526,  0.06370171, -0.10241883, ..., -0.05219057,
        -0.06658911, -0.02799126]], dtype=float32)

In [32]:
# store in VectorDB

vector_store.add_documents(chunks,embeddings)

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'