### DATA INGESTION

In [2]:
## Document structure

from langchain_core.documents import Document

In [3]:
# Hand-built example of a LangChain Document: the text goes in `page_content`
# and free-form descriptive fields go in `metadata`.
doc = Document(
    metadata=dict(source="example.txt", pages=1, date="01-02-2026"),
    page_content="hello, this is a basic example of documents",
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'date': '01-02-2026'}, page_content='hello, this is a basic example of documents')

In [4]:
## simple txt file

import os
# Create the folder that will hold the sample text files; exist_ok=True means
# re-running this cell does not fail when the folder is already there.
os.makedirs("../data/text_files",exist_ok=True)

In [5]:
sample_text = {
    "../data/text_files/python_intro.txt" : ''' The Premier League is a professional association football league in England and 
    the highest level of the English football league system. Contested by 20 clubs, it operates on a system of promotion and
    relegation with the English Football League (EFL). Seasons usually run from August to May, with each team playing 38 matches: 
    two against each other team, one home and one away.[1] Most games are played on weekend afternoons, with occasional weekday evening 
    fixtures.

    The competition was founded as the FA Premier League on 20 February 1992, following the decision of clubs from the First Division
    (the top tier since 1888) to break away from the English Football League. Teams are still promoted and relegated to and from the 
    EFL Championship each season. The Premier League is a corporation managed by a chief executive, with member clubs as shareholders.
    The Premier League takes advantage of a £5 billion domestic television rights deal, with Sky and BT Group broadcasting 128 and 32 games, 
    respectively.[4][5] This will rise to £6.7 billion from 2025 to 2029.[6] In the 2022–2025 cycle, the Premier League earned a record 
    £5.6 billion from international rights.[7] As of 2023–24, Premier League clubs received central payments totalling £2.8 billion, 
    with additional solidarity payments made to relegated EFL clubs.[8]



'''

}

# Write every sample to disk: each key of `sample_text` is the destination
# path and the matching value is the text stored in that file.
for filepath in sample_text:
    content = sample_text[filepath]
    with open(filepath, mode='w', encoding="utf-8") as f:
        f.write(content)

print("sample text created")


sample text created


In [6]:
### textloader

from langchain_community.document_loaders import TextLoader

# Read the sample file back through LangChain's TextLoader and show what
# load() returned.
loader = TextLoader(
    file_path="../data/text_files/python_intro.txt",
    encoding="utf-8",
)
document = loader.load()
print(document)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content=' The Premier League is a professional association football league in England and \n    the highest level of the English football league system. Contested by 20 clubs, it operates on a system of promotion and\n    relegation with the English Football League (EFL). Seasons usually run from August to May, with each team playing 38 matches: \n    two against each other team, one home and one away.[1] Most games are played on weekend afternoons, with occasional weekday evening \n    fixtures.\n\n    The competition was founded as the FA Premier League on 20 February 1992, following the decision of clubs from the First Division\n    (the top tier since 1888) to break away from the English Football League. Teams are still promoted and relegated to and from the \n    EFL Championship each season. The Premier League is a corporation managed by a chief executive, with member clubs as shareholders.\n    The Premie

In [7]:
# Directory loader
from langchain_community.document_loaders import DirectoryLoader

# Load every file matching **/*.txt under the text folder, handing each one
# to TextLoader with an explicit UTF-8 encoding.
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
    show_progress=False,
)

documents = dir_loader.load()
documents


[Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content=' The Premier League is a professional association football league in England and \n    the highest level of the English football league system. Contested by 20 clubs, it operates on a system of promotion and\n    relegation with the English Football League (EFL). Seasons usually run from August to May, with each team playing 38 matches: \n    two against each other team, one home and one away.[1] Most games are played on weekend afternoons, with occasional weekday evening \n    fixtures.\n\n    The competition was founded as the FA Premier League on 20 February 1992, following the decision of clubs from the First Division\n    (the top tier since 1888) to break away from the English Football League. Teams are still promoted and relegated to and from the \n    EFL Championship each season. The Premier League is a corporation managed by a chief executive, with member clubs as shareholders.\n    The Pre

In [8]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Same directory-walking approach as for the text files, but matching
# **/*.pdf and parsing each file with PyMuPDFLoader.
# Note: this rebinds `dir_loader`, replacing the text-file loader object.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)

pdf_doc = dir_loader.load()
pdf_doc

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2026-01-30T10:53:05+05:30', 'source': '..\\data\\pdf\\project description elysian drift.pdf', 'file_path': '..\\data\\pdf\\project description elysian drift.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shiyam Purushothaman', 'subject': '', 'keywords': '', 'moddate': '2026-01-30T10:53:05+05:30', 'trapped': '', 'modDate': "D:20260130105305+05'30'", 'creationDate': "D:20260130105305+05'30'", 'page': 0}, page_content='Department of Information Technology \nProject Description \nTeam Name \n \nElysian Drift \n \nTeam Members with \nRegister Number and \nStudent Name \nSHIYAM PURUSHOTHAMAN (231001193) \nSUDARSUN P (231001218) \nDomain  \n \nHEALTHCARE \n \nTentative Project Title  \n \n                                          ALTHEA \n \nExisting System Overview Current systems are vulnerable to prescription forgery due to \ncentralized storage, with fragmented

### Chunking 


In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks and print a short summary of the result.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    # Length is measured with len(); separators are tried in the listed order.
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    split_docs = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not split_docs:
        return split_docs

    first_chunk = split_docs[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return split_docs

In [14]:
# Chunk the loaded PDF documents using split_documents' default chunk_size and
# chunk_overlap, then display the resulting list.
chunks = split_documents(pdf_doc)
chunks

Split 4 documents into 7 chunks

Example chunk:
Content: Department of Information Technology 
Project Description 
Team Name 
 
Elysian Drift 
 
Team Members with 
Register Number and 
Student Name 
SHIYAM PURUSHOTHAMAN (231001193) 
SUDARSUN P (231001218) ...
Metadata: {'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2026-01-30T10:53:05+05:30', 'source': '..\\data\\pdf\\project description elysian drift.pdf', 'file_path': '..\\data\\pdf\\project description elysian drift.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shiyam Purushothaman', 'subject': '', 'keywords': '', 'moddate': '2026-01-30T10:53:05+05:30', 'trapped': '', 'modDate': "D:20260130105305+05'30'", 'creationDate': "D:20260130105305+05'30'", 'page': 0}


[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2026-01-30T10:53:05+05:30', 'source': '..\\data\\pdf\\project description elysian drift.pdf', 'file_path': '..\\data\\pdf\\project description elysian drift.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shiyam Purushothaman', 'subject': '', 'keywords': '', 'moddate': '2026-01-30T10:53:05+05:30', 'trapped': '', 'modDate': "D:20260130105305+05'30'", 'creationDate': "D:20260130105305+05'30'", 'page': 0}, page_content='Department of Information Technology \nProject Description \nTeam Name \n \nElysian Drift \n \nTeam Members with \nRegister Number and \nStudent Name \nSHIYAM PURUSHOTHAMAN (231001193) \nSUDARSUN P (231001218) \nDomain  \n \nHEALTHCARE \n \nTentative Project Title  \n \n                                          ALTHEA \n \nExisting System Overview Current systems are vulnerable to prescription forgery due to \ncentralized storage, with fragmented

## Embeddings and vector store database

In [9]:
import numpy as np
import chromadb
from chromadb.config import Settings
import uuid
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [10]:
class EmbeddingManager:
    """Wrap a SentenceTransformer model and expose a simple embedding call.

    The model is loaded eagerly in the constructor, so a successfully
    constructed instance always has ``self.model`` set.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Create the manager and load the embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer`` to select the model.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading;
                the error is printed and re-raised unchanged.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``."""
        try:
            print(f'Loading embedding model: {self.model_name}')
            self.model = SentenceTransformer(self.model_name)
            print(f'Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}')
        except Exception as e:
            # Report which model failed, then let the original error propagate.
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)

        Raises:
            ValueError: If no model has been loaded (``self.model`` is None).
        """
        # Compare against None explicitly: truthiness of a model object would
        # go through __bool__/__len__ and could misreport a loaded model as
        # missing.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")

        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
## initialize the Embedding Manager

# No arguments, so the default model_name is used; the constructor loads the
# model immediately (see the "Loading embedding model" output).
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 450.36it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x24f04a06510>

### Vector store


In [11]:
class VectorStore:
    """Thin wrapper around a persistent ChromaDB collection of document chunks."""

    # NOTE(review): the default persist_directory is "./data/vector_store",
    # while the data folders used elsewhere in this notebook are under
    # "../data" — confirm which location is intended.
    def __init__(self, collection_name: str = "pdf_doc", persist_directory: str = "./data/vector_store"):
        """
        Initialize the vector store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store

        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize chromaDB client and collection"""

        try:
            # Make sure the on-disk location exists before opening the client.
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Reuse the collection when it already exists, otherwise create it.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vecotr store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the collection.

        Each entry gets a freshly generated id, so calling this twice with the
        same documents stores them twice.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order; each
                item must provide ``tolist()``.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by the collection's ``add`` is printed
                and re-raised unchanged.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to store: skip the call to the collection entirely.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix plus position within this batch.
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Copy so the caller's metadata dict is not modified.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )

            print(f"Successfully addes {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            # Bind the exception so the message can include it; a bare
            # `except:` left `e` undefined and hid the real error behind a
            # NameError.
            print(f"Error adding documents to vector store: {e}")
            raise



# Defaults are used for both the collection name and the persist directory;
# construction opens the persistent client and gets or creates the collection.
vectorstore = VectorStore()
vectorstore

Vecotr store initialized. Collection: pdf_doc
Existing documents in collection: 0


<__main__.VectorStore at 0x24f2f9846e0>

In [15]:
# Display the chunk list produced by split_documents before embedding it.
chunks


[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2026-01-30T10:53:05+05:30', 'source': '..\\data\\pdf\\project description elysian drift.pdf', 'file_path': '..\\data\\pdf\\project description elysian drift.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shiyam Purushothaman', 'subject': '', 'keywords': '', 'moddate': '2026-01-30T10:53:05+05:30', 'trapped': '', 'modDate': "D:20260130105305+05'30'", 'creationDate': "D:20260130105305+05'30'", 'page': 0}, page_content='Department of Information Technology \nProject Description \nTeam Name \n \nElysian Drift \n \nTeam Members with \nRegister Number and \nStudent Name \nSHIYAM PURUSHOTHAMAN (231001193) \nSUDARSUN P (231001218) \nDomain  \n \nHEALTHCARE \n \nTentative Project Title  \n \n                                          ALTHEA \n \nExisting System Overview Current systems are vulnerable to prescription forgery due to \ncentralized storage, with fragmented

In [19]:
# Pull the raw text out of every chunk; the embedding call takes a list of strings.
texts = [chunk.page_content for chunk in chunks]

# Encode all chunk texts in one call.
embeddings = embedding_manager.generate_embeddings(texts)

# Store each chunk together with its embedding in the vector store.
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.60it/s]

Generated embeddings with shape: (7, 384)
Adding 7 documents to vector store...
Successfully addes 7 documents to vector store
Total documents in collection: 7



