## Getting started 


In [None]:
### make document loader

In [None]:
from langchain_community.document_loaders import PyPDFLoader, csv_loader, json_loader, TextLoader, UnstructuredExcelLoader, UnstructuredWordDocumentLoader, UnstructuredEmailLoader
import os

def document_loader(directory_path: None) -> list[str]:
    """Load documents from a directory using appropriate loaders based on file type."""
    if directory_path is None:
        raise ValueError("directory_path must be provided")

    loaders = {
        '.pdf': PyPDFLoader,
        '.csv': csv_loader.CSVLoader,
        '.json': json_loader.JSONLoader,
        '.txt': TextLoader,
        '.xlsx': UnstructuredExcelLoader,
        '.xls': UnstructuredExcelLoader,
        '.docx': UnstructuredWordDocumentLoader,
        '.eml': UnstructuredEmailLoader,
        
    }

    documents = []

    for root, _, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()
            loader_class = loaders.get(ext)
            if loader_class:
                print(f"Loading file: {file_path} with loader: {loader_class.__name__}")
                loader = loader_class(file_path)
                documents.extend(loader.load_and_split())
                documents.extend(loader.load())
            else:
                print(f"Unsupported file type: {file_path}")

    return documents

loaded_documents = document_loader("./rag_data")
loaded_documents

Loading file: ./rag_data/Release_2649.docx with loader: UnstructuredWordDocumentLoader
Loading file: ./rag_data/Marrapu_Ramlokesh_resume.pdf with loader: PyPDFLoader


[Document(metadata={'source': './rag_data/Release_2649.docx'}, page_content='Date of version: 14.05.2025 Created by: Marrapu.Ramlokesh, Siddharth2.S Confidentiality level: Internal use\n\nChange history\n\nDate Version Created by Description of change DD.MM.YYYY\n\nIntroduction\n\nBusiness/Use Case Requirement Brief\n\nTo identify AirFiber and Fiber\xa0customers at risk of churning and enriching the output with additional actionable insights (indicators of problems each customer might have faced). The aim is to enable proactive measures to retain customers and reduce churn.\n\nBusiness Description of Change\n\nWe are developing a model to predict customer churn probability using multiple data points, including payment history, complaints, outages, and network issues. Customers will be sorted on every billing date based on their likelihood of churn (i.e., probability of churning). This segmentation will allow stakeholders to prioritize interventions and retention strategies.\n\nBusiness

In [3]:
## Embedding maker

In [2]:
loaded_documents[0].page_content

'Datasets available Utility Comments National Family Health Survey (NFHS): National Family Health Survey - 4 : District Yes National Family Health Survey-5 : District Yes National Family Health Survey - 4 & 5: State Yes National Sample Survey Organisation (NSSO): National Sample Survey (NSS) Round 75 -Social Consumption - Health Yes Could we access logitudinal data ? Medical expenditure for treatment per spell Yes Medical and Non-Medical Expenditure Per Hositalised ChildBirth Yes Average Expenditure for Treatment Yes Percentage distribution of persons by coverage of scheme of health expenditure support Yes Average medical expenditure and non-medical expenditure by gender Yes The similar datasets can be found for NSS Round 71 as well. Yes Would be useful to build the longitudinal profile for all the parameters above Rural Health Statistics (RHS): State/UT Wise Number of Sub-Centres, Primary Health Centres (PHC), Community Health Centres (CHC) & Health and Wellness Centres (HWC) Function

In [41]:
from typing import Any, Optional
import numpy as np
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_BASE"] = os.getenv("OPENAI_API_BASE")

# print("KEY:", os.getenv("OPENAI_API_KEY"))
# print("BASE:", os.getenv("OPENAI_API_BASE"))

class EmbeddingMaker:
    """A class to create embeddings for documents."""
    def __init__(
            self,
            backend: str = "sentence_transformers",
            model_name: str = "all-MiniLM-L12-v2", 
            chunk_size: int = 512, 
            chunk_overlap: int = 50,
            batch_size: int = 16,
            openai_key: Optional[str] = None,
            openai_model: Optional[str] = None 
        ):
        self.backend = backend
        self.embed_model = model_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        self.openai_key = openai_key
        self.openai_model = openai_model
        if backend == "sentence_transformers":
            self.embed_model = SentenceTransformer(model_name)
    

    def chunk_documents(self, documents: list[Any]) -> list[Any]:
        """Chunk documents into smaller pieces."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks from {len(documents)} documents.")
        return chunks
    
    def get_embeddings(self, chunks: list[Any]) -> Any:
        """Get the embedding model based on the specified backend."""
        if self.backend == "sentence_transformers":
            return self._get_sentence_transformers_embeddings(chunks)
        elif self.backend == "openai":
            return self._get_openai_embeddings(chunks)
        else:
            raise ValueError(f"Unsupported embedding backend: {self.backend}")
        
    def _get_sentence_transformers_embeddings(self, chunks: list[Any]) -> np.ndarray:
        """Get embeddings using Sentence Transformers."""
        texts= [chunk.page_content for chunk in chunks]
        print(f"Generating embeddings for {len(texts)} chunks using Sentence Transformers.")
        embeddings = self.embed_model.encode(texts, show_progress_bar=True)
        return embeddings
    
    def _get_openai_embeddings(self, chunks: list[Any]) -> np.ndarray:
        """Get embeddings using OpenAI."""
        embedder = OpenAIEmbeddings(
            model=self.openai_model
        )
        texts= [chunk.page_content for chunk in chunks]
        print(f"Generating embeddings for {len(texts)} chunks using OpenAI.")
        all_embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i : i + self.batch_size]
            embeddings = embedder.embed_documents(batch)
            all_embeddings.append(embeddings) 
        return np.vstack(all_embeddings)



In [42]:
EmbedMaker = EmbeddingMaker()

chunked_documents = EmbedMaker.chunk_documents(loaded_documents)
embeddings = EmbedMaker.get_embeddings(chunked_documents)
embeddings

Created 154 chunks from 12 documents.
Generating embeddings for 154 chunks using Sentence Transformers.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

array([[-0.0256778 , -0.0203816 , -0.02863   , ..., -0.00626529,
        -0.01355996,  0.10974622],
       [ 0.00353573, -0.03120735, -0.02974517, ...,  0.04245427,
         0.02439736,  0.02422303],
       [-0.03240658,  0.04424246,  0.01247747, ...,  0.04112267,
         0.06839233,  0.01654245],
       ...,
       [-0.05692467,  0.0077112 , -0.00643335, ...,  0.06484738,
        -0.02324439, -0.04590766],
       [-0.06537127, -0.05666744,  0.00703504, ..., -0.07030727,
        -0.02239653,  0.003152  ],
       [ 0.07542328, -0.03661927,  0.06386682, ..., -0.02732012,
         0.01738566, -0.05577966]], shape=(154, 384), dtype=float32)

## create Vectorstore

In [45]:
import faiss
import os
import pickle
import numpy as np
from typing import Any, Optional
from sentence_transformers import SentenceTransformer


class vectorstore_creator:
    def __init__(
            self,
            persistent_directory: str = "./faiss_vectorstore",
            embed_model: str = "all-MiniLM-L12-v2",
            chunk_size: int = 512,
            chunk_overlap: int = 50,
            batch_size: int = 16):
        self.persistent_directory = persistent_directory
        os.makedirs(self.persistent_directory, exist_ok=True)
        self.embed_model = embed_model
        self.metadata = []
        self.chunk_size = chunk_size
        self.index_type: Optional[str] = "HNSW"
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        self.model = SentenceTransformer(embed_model)
        self.index = None
        self.dimension = None
        
        self.embeddings = embeddings

    def from_documents(self, documents: list[Any]):
        """Create a FAISS index from documents and embed_model."""
        embedder = EmbeddingMaker()
        chunked_documents = embedder.chunk_documents(documents)
        embeddings = embedder.get_embeddings(chunked_documents)
        metadata = [{"text" : chunk.page_content} for chunk in chunked_documents]
        self.add_embeddings(np.array(embeddings).astype(np.float32) , metadata)   
        self.save()

    def add_embeddings(self,embeddings: np.ndarray, metadata: list[Any]):
        """Add embeddings to the FAISS index."""
        if self.index is None:
            self.dimension = embeddings.shape[1]
            print("Dimension of embeddings:", self.dimension)
            self._initialize_index(self.dimension)
        start = len(self.metadata)
        ids = np.arange(start, start + len(embeddings)).astype("int64")
        self.index.add_with_ids(embeddings, ids)
        if metadata:
            self.metadata.extend(metadata)
        print(f"Added {len(embeddings)} embeddings to the index.")
    
    def _initialize_index(self, dimension: int):
        if self.index_type == "HNSW":
            self.internal_index = faiss.IndexHNSWFlat(self.dimension, 32)
            self.internal_index.hnsw.efConstruction = 40
            self.internal_index.hnsw.efSearch = 16
        else:
            self.internal_index = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIDMap(self.internal_index)
        
    def save(self):
        faiss_path = os.path.join(self.persistent_directory, "faiss.index")
        meta_path = os.path.join(self.persistent_directory, "metadata.pkl")
        faiss.write_index(self.index, faiss_path)
        with open(meta_path, "wb") as f:
            pickle.dump(self.metadata, f)
        print(f"[INFO] Saved Faiss index and metadata to {self.persistent_directory}")

    def load(self):
        faiss_path = os.path.join(self.persistent_directory, "faiss.index")
        meta_path = os.path.join(self.persistent_directory, "metadata.pkl")
        self.index = faiss.read_index(faiss_path)
        with open(meta_path, "rb") as f:
            self.metadata = pickle.load(f)
        print(f"[INFO] Loaded Faiss index and metadata from {self.persistent_directory}")
    
    def search(self, query_embedding: np.ndarray, top_k: int = 5):
        """Search the FAISS index for similar embeddings."""
        distances, indices = self.index.search(query_embedding, top_k)
        results = []
        for idx, dist in zip(indices[0], distances[0]):
            metadata = self.metadata[idx] if idx < len(self.metadata) else None
            results.append({"index": idx, "distance": dist,"metadata": metadata})
        return results
        
    def query(self,query: str, top_k: int=5):
        """Get embedding fro the query and search the query"""
        query_embedding = self.model.encode([query]).astype(np.float32)
        results = self.search(query_embedding, top_k)
        return results
        
if __name__ == "__main__":
    vector_store = vectorstore_creator("./faiss_vectorstore")
    metadata = [{"text": doc.page_content} for doc in loaded_documents]
    vector_store.add_embeddings(embeddings, metadata)
    vector_store.save()
    vector_store.load()
    print(vector_store.query("Real-time CCTV-based PPE Detection",top_k=3))



    

Dimension of embeddings: 384
Added 154 embeddings to the index.
[INFO] Saved Faiss index and metadata to ./faiss_vectorstore
[INFO] Loaded Faiss index and metadata from ./faiss_vectorstore
[{'index': np.int64(143), 'distance': np.float32(1.039881), 'metadata': None}, {'index': np.int64(149), 'distance': np.float32(1.039881), 'metadata': None}, {'index': np.int64(87), 'distance': np.float32(1.5778074), 'metadata': None}]


In [44]:
print(vector_store.query("Real-time CCTV-based PPE Detection",top_k=3))

[{'index': np.int64(143), 'distance': np.float32(1.039881), 'metadata': None}, {'index': np.int64(149), 'distance': np.float32(1.039881), 'metadata': None}, {'index': np.int64(87), 'distance': np.float32(1.5778074), 'metadata': None}]


In [46]:
from langchain_groq import ChatGroq

class RAGSearch:
    def __init__(self,persistent_directory : str = "./faiss_vectorstore", embed_model: str = "all-MiniLM-L12-v2", llm_model: str = "gemma2-9b-it"):
        self.vectorstore = vectorstore_creator(persistent_directory, embed_model)

        faiss_path = os.path.join(persistent_directory, "faiss.index")
        meta_path = os.path.join(persistent_directory, "metadata.pkl")

        if not (os.path.exists(faiss_path) and os.path.exists(meta_path)):
            docs = document_loader("./rag_data")
            self.vectorstore.from_documents(docs)
        else:
            self.vectorstore.load()
        groq_api_key = os.getenv("GROQ_API_KEY")
        self.llm = ChatGroq(groq_api_key=groq_api_key, model_name=llm_model)
        print(f"[INFO] Groq LLM initialized: {llm_model}")
    

    def search_and_summarize(self, query: str, top_k: int = 5) -> str:
        results = self.vectorstore.query(query, top_k=top_k)
        texts = [r["metadata"].get("text", "") for r in results if r["metadata"]]
        context = "\n\n".join(texts)
        if not context:
            return "No relevant documents found."
        prompt = f"""Summarize the following context for the query: '{query}'\n\nContext:\n{context}\n\nSummary:"""
        response = self.llm.invoke([prompt])
        return response.content

if __name__ == "__main__":
    rag_search = RAGSearch()
    query = "What is Ppe detection?"
    summary = rag_search.search_and_summarize(query, top_k=3)
    print("Summary:", summary)

[INFO] Loaded Faiss index and metadata from ./faiss_vectorstore


GroqError: The api_key client option must be set either by passing api_key to the client or by setting the GROQ_API_KEY environment variable

In [None]:
https://drive.google.com/file/d/1NTa_RiSHFz2tgglo1MO2Uzlxz3-sMZRY/view?usp=sharing

https://drive.google.com/file/d/1S-Nw1WSp_GngcPFZt354ebS8VPBD-DI9/view?usp=sharing



https://drive.google.com/file/d/1S-Nw1WSp_GngcPFZt354ebS8VPBD-DI9/view?usp=sharing