In [8]:
# Importing dependencies
import os

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader 
from langchain_groq import ChatGroq

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [9]:
# Load settings from the project's .env file before any os.getenv lookups run.
# NOTE(review): presumably this is where EMBEDDING_MODEL, GROQ_API_KEY and
# LLM_MODEL are defined — confirm against the .env file.
from dotenv import load_dotenv
load_dotenv()

True

In [10]:
# Data location, API key and model names used by the rest of the notebook.
# Raw string: "\D", "\R" and "\d" are not valid escape sequences, so a plain
# literal triggers an invalid-escape warning on current Python versions.
# The resulting path value is unchanged.
Data_path = r"D:\Desktop\RAG_Project\data"
# Each of these is None when the corresponding environment variable is unset.
Embedding_model = os.getenv("EMBEDDING_MODEL")
llm_api_key = os.getenv("GROQ_API_KEY")
llm_model = os.getenv("LLM_MODEL")

In [11]:
# Setting the llm
# Fail fast with a readable message when the settings are missing:
# both values come from os.getenv, which yields None for unset variables.
if not llm_api_key or not llm_model:
    raise ValueError(
        "GROQ_API_KEY and LLM_MODEL must be set (e.g. in the .env file) "
        "before creating the LLM."
    )

llm = ChatGroq(
    model_name=llm_model,
    groq_api_key=llm_api_key,
    # Temperature 0 keeps sampling as deterministic as possible while testing
    # the knowledge base; it does not by itself rule out hallucinations.
    temperature=0.0,
)

In [12]:
# Loading our documents

# Maps each supported file extension to (loader class, loader keyword arguments).
loader_map = {
    ".pdf": (PyPDFLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf-8"}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
}

# Defined before the try block so it always exists, even if loading fails early.
documents = []

try:
    for ext, (loader_class, kwargs) in loader_map.items():
        # One DirectoryLoader per extension, searching Data_path recursively.
        temp_loader = DirectoryLoader(
            path=Data_path,
            glob=f"**/*{ext}",
            loader_cls=loader_class,
            loader_kwargs=kwargs,
        )
        documents.extend(temp_loader.load())

except Exception as e:
    # Report the failure without claiming success; anything loaded before the
    # error is kept in `documents`.
    print(f"Error during document loading: {e}")
    print("Ensure all required packages (pypdf, unstructured, docx2txt) are installed")
    print(f"\nLoading stopped early: only {len(documents)} documents were loaded before the error.")

else:
    # Printed only when every loader finished without raising.
    print(f"\nSuccessfully loaded a total of {len(documents)} initial documents (pages/files).")


Successfully loaded a total of 31 initial documents (pages/files).


In [15]:
# Chunking: split every loaded document into overlapping text pieces

splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunk = text_splitter.split_documents(documents)
chunk_count = len(chunk)
print(f"Total chunks created from all file: {chunk_count}")

Total chunks created from all file: 240


In [None]:
# Vectorization and store creation
# Embedding_model is only the model *name* read from the environment.
# FAISS.from_documents needs an embeddings object, so wrap the name in
# HuggingFaceEmbeddings instead of passing the string itself.
if not Embedding_model:
    raise ValueError("EMBEDDING_MODEL must be set (e.g. in the .env file).")

embedding_model = HuggingFaceEmbeddings(model_name=Embedding_model)
vectore_store = FAISS.from_documents(chunk, embedding_model)
print("FAISS Vector Store created successfully from all documents!")

In [None]:
# Testing: run one sample query against the vector store and show the hits
test_query = "Which Diet is good for PCOS"
# k=2 -> return only the two best-matching chunks
retriever = vectore_store.as_retriever(search_kwargs={"k": 2})

retrieved_docs = retriever.invoke(test_query)

print(f"Verification : Retrieved from documnets for {test_query}")

for i, doc in enumerate(retrieved_docs, start=1):
    # metadata is a mapping, so look the key up with .get(); calling it like a
    # function raises TypeError. Falls back to 'Unknown' when no source is set.
    source_path = doc.metadata.get('source', 'Unknown')
    print(f"Document {i} Source: {os.path.basename(source_path)}")
    print(f"Content Snippet: {doc.page_content[:200]}")