###Data ingestion

In [1]:
from langchain_core.documents import Document

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [3]:
doc = Document(
    page_content = "This is the food data",
    metadata = {
        "source" : "food.txt",
        "pages" : 1,
        "author" : "Chef John Doe",
        "date_created" : "2026-0103"
    } 
)

In [4]:
doc

Document(metadata={'source': 'food.txt', 'pages': 1, 'author': 'Chef John Doe', 'date_created': '2026-0103'}, page_content='This is the food data')

In [5]:
##  Simple txt file

import os 
os.makedirs("../data/text_files",exist_ok=True)

In [9]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/recipe1.txt")

In [10]:
loader.load()

[Document(metadata={'source': '../data/text_files/recipe1.txt'}, page_content='Heat oil in a 3.4 L Stainless Steel SautÃ© Pan over medium heat and sear chicken pieces until golden brown. Perform this step in batches if necessary. Season and set aside on a plate.\n\nDeglaze with lemon juice, stir bottom of pan well, then add shallots, garlic, ginger and curry paste. Cook over low heat for 4 to 5 minutes.\n\nAdd zucchini and corn, continue cooking for 2-3 minutes, then stir in coconut milk, tahini, turmeric, honey, coriander and chicken.\n\nBring to the boil, then simmer over medium-low heat for 30 minutes. Adjust seasoning if necessary.\n\nServe the curry with rice, top with cucumbers and green onions, then sprinkle with sesame seeds.')]

Embedding and vectorstore DB

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
  return success


In [9]:
class EmbeddingManager:
    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print("Loading embedding model : ", self.model_name)
            self.model = SentenceTransformer(self.model_name)
        except Exception as e:
            print("error loading model")
            raise 

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not Loaded")
        
        print(f"generating embedding for {len(texts)}")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embedding {embeddings.shape}")
        return embeddings


Initialize the embedding manager

In [10]:
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model :  all-MiniLM-L6-v2


<__main__.EmbeddingManager at 0x2006ba25160>

Vector Store

In [None]:
class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()

    def _initialize_client(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            self.collection = self.client.get_or_create_collection(name = self.collection_name, metadata = {"description": "PDF document embedding"}
                                                                   )
            print(f"Vector store initialized : {self.collection_name}")
            print(f"Existing document in collection : {self.collection.count()}")

        except Exception as e:
            print("Error initializing vector store")
            raise
    