In [2]:
from langchain_core.documents import Document

In [3]:
# Hand-build one Document: the text lives in page_content, and descriptive
# fields about where it came from live in metadata.
doc = Document(
    metadata=dict(
        source="example.txt",
        page=1,
        author="Rajat Machra",
        date_created="2025-11-11",
    ),
    page_content="this is a main text context for RAG",
)
doc

Document(metadata={'source': 'example.txt', 'page': 1, 'author': 'Rajat Machra', 'date_created': '2025-11-11'}, page_content='this is a main text context for RAG')

In [4]:
### make sure the folder that will hold the sample text files exists
import os

# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(name="../data/text_files", exist_ok=True)

In [5]:
# Maps each output path to the text that gets written there. The text is
# spelled out line by line so the leading/trailing spaces are visible.
sample_texts = {
    "../data/text_files/hadoop.txt": (
        " When I first joined Orkut, I was happy. With Orkut, I had a new platform enabling me get \n"
        "to know the people around me, including their thoughts, their views, their purchases, \n"
        "and the places they visited. We were all gaining more knowledge than ever before and \n"
        "felt more connected to the people around us. Uploading pictures helped us share good \n"
        "ideas of places to visit. I was becoming more and more addicted to understanding \n"
        "and expressing sentiments. After a few years, I joined Facebook. And day by day, I was \n"
        "introduced to what became an infinite amount of information from all over world. Next, \n"
        "I started purchasing items online, and I liked it more than shopping offline. I could easily \n"
        "get a lot of information about products, and I could compare prices and features. And I \n"
        "wasnt the only one; millions of people were feeling the same way about the Web.\n"
        " More and more data was flooding in from every corner of the world to the Web. And \n"
        "thanks to all those inventions relate"
    ),
}

# Write every sample to its path; the target folder must already exist.
for filepath, content in sample_texts.items():
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(content)

print("sample text files created")


sample text files created


In [6]:
### text loader: read a single text file through TextLoader

from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="../data/text_files/hadoop.txt", encoding="utf-8")
# load() returns a list of Document objects (see the printed output below).
document = loader.load()
print(document)


[Document(metadata={'source': '../data/text_files/hadoop.txt'}, page_content=' When I first joined Orkut, I was happy. With Orkut, I had a new platform enabling me get \nto know the people around me, including their thoughts, their views, their purchases, \nand the places they visited. We were all gaining more knowledge than ever before and \nfelt more connected to the people around us. Uploading pictures helped us share good \nideas of places to visit. I was becoming more and more addicted to understanding \nand expressing sentiments. After a few years, I joined Facebook. And day by day, I was \nintroduced to what became an infinite amount of information from all over world. Next, \nI started purchasing items online, and I liked it more than shopping offline. I could easily \nget a lot of information about products, and I could compare prices and features. And I \nwasnt the only one; millions of people were feeling the same way about the Web.\n More and more data was flooding in from 

In [7]:
### directory loader: load every matching text file under a folder
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",  # pattern selecting which files are loaded
    loader_cls=TextLoader,  # each matched file is read with TextLoader
    loader_kwargs=dict(encoding="utf-8"),  # forwarded to the loader class
    show_progress=False,
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\hadoop.txt'}, page_content=' When I first joined Orkut, I was happy. With Orkut, I had a new platform enabling me get \nto know the people around me, including their thoughts, their views, their purchases, \nand the places they visited. We were all gaining more knowledge than ever before and \nfelt more connected to the people around us. Uploading pictures helped us share good \nideas of places to visit. I was becoming more and more addicted to understanding \nand expressing sentiments. After a few years, I joined Facebook. And day by day, I was \nintroduced to what became an infinite amount of information from all over world. Next, \nI started purchasing items online, and I liked it more than shopping offline. I could easily \nget a lot of information about products, and I could compare prices and features. And I \nwasnt the only one; millions of people were feeling the same way about the Web.\n More and more data was flooding in fr

In [13]:
### directory loader for PDFs: one Document per page (see output metadata 'page')

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# NOTE: this rebinds dir_loader, replacing the text-file loader defined earlier.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",  # pattern selecting which files are loaded
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', 'author': 'artemenovs', 'subject': '', 'keywords': '', 'moddate': '2020-01-22T18:20:50+03:00', 'trapped': '', 'modDate': "D:20200122182050+03'00'", 'creationDate': "D:20200122182050+03'00'", 'page': 0}, page_content=''),
 Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', '

The working code ends here.
TODO: delete everything below this point later.
.
.
.
.
.
.
.
.
.
.
.
.
v

.

.

.

In [24]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer ###embiding by hugging_face
import chromadb
from chromadb.config import Settings
import uuid ### for vectordb id 
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
class EmbeddingManager:
    """Wraps a SentenceTransformer model and exposes a single embed call.

    The model is loaded eagerly in ``__init__`` so that a bad model name
    fails when the manager is constructed rather than on first use.

    Attributes:
        model_name: Name handed to ``SentenceTransformer`` when loading.
        model: The loaded model, or ``None`` until ``_load_model`` succeeds.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading
                is re-raised by ``_load_model``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model named by ``self.model_name``.

        Prints progress and the embedding dimension reported by the model.
        Any failure is printed and then re-raised unchanged.
        """
        try:
            print (f"loading embedding model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully.\nembedding diamension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model{self.model_name}:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: The strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``; presumably
            an array with one row per input text -- TODO confirm.

        Raises:
            ValueError: If no model has been loaded (``self.model`` is None).
        """
        # Compare against None explicitly: an object that defines __len__ or
        # __bool__ can be falsy even though it is a perfectly usable model,
        # and a plain truthiness test would reject it by mistake.
        if self.model is None:
            raise ValueError("Model not loaded")
        print(f"Generating embedding for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape:{embeddings.shape}")
        return embeddings
    


### build the embedding manager with its default model name;
### constructing it loads the model right away
embedding_manager = EmbeddingManager()
embedding_manager

loading embedding model:all-MiniLM-L6-v2
Model Loaded successfully.
embedding diamension:384


<__main__.EmbeddingManager at 0x27f0c8baad0>

In [26]:
###vector storage
class VectorStore:
    """Persistent ChromaDB collection of document texts, metadata and embeddings.

    The client and collection are created eagerly in ``__init__``.

    Attributes:
        collection_name: Name of the collection to open or create.
        persist_directory: Folder handed to ``chromadb.PersistentClient``.
        client: The ChromaDB client, or ``None`` before initialization.
        collection: The opened collection, or ``None`` before initialization.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Remember the settings and open (or create) the collection.

        Args:
            collection_name: Name of the collection to open or create.
            persist_directory: Folder where the store is persisted; it is
                created if missing.

        Raises:
            Exception: Any error from ``_initialize_store`` is re-raised.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist folder, the client and the collection.

        Prints the collection name and its current item count. Any failure
        is printed and then re-raised unchanged.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embedding for RAG"}
            )
            print(f"vector store initialized.\nCollection:{self.collection_name}")
            print(f"existing document in colletion:{self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store:{e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Each document gets a fresh id of the form ``doc_<8 hex chars>_<index>``
        and a copy of its metadata extended with ``doc_index`` and
        ``content_length``. The documents' own metadata is not modified.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order. An
                ndarray or any nested sequence of numbers is accepted.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by ``collection.add`` is printed and
                re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("number of documents must match number of embeddings")

        # Nothing to store: return before calling collection.add, which is
        # understood to reject empty id lists during input validation.
        if len(documents) == 0:
            print("no documents to add to vector store")
            return

        print(f"adding{len(documents)}document to vector store...")

        ### prepare data for chromadb
        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix keeps ids distinct across calls; the index keeps
            # them distinct within one call.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy so the caller's metadata dict is left untouched.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # np.asarray lets plain Python lists through as well as ndarray
            # rows; .tolist() then yields plain Python numbers.
            embedding_list.append(np.asarray(embedding).tolist())

        ### add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text
            )

        except Exception as e:
            print(f"error adding documents to vector store:{e}")
            raise
# Open (or create) the default collection in the default persist directory.
vectorstore = VectorStore()
vectorstore


vector store initialized.
Collection:pdf_documents
existing document in colletion:0


<__main__.VectorStore at 0x27f0f6865d0>