In [36]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq


In [37]:
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

# Initialize LLM
llm = ChatGroq(groq_api_key=groq_api_key, model="Gemma2-9b-It")
llm.invoke(input="What is the capital of France?")
# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")






In [40]:
import os
import fitz  # PyMuPDF
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings

# HuggingFace embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Updated function to load PDFs using PyMuPDF (fitz)
def load_pdfs_from_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            print(f"Loading {file_path} with PyMuPDF...")
            try:
                doc = fitz.open(file_path)
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        all_docs.append(Document(page_content=text, metadata={
                            "source": filename,
                            "page": page_num
                        }))
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
    return all_docs

# FAISS index persistence directory
PERSIST_DIR = "/home/rishulgupta/bigdata/faiss_index"

if not os.path.exists(PERSIST_DIR):
    os.makedirs(PERSIST_DIR)

print("Creating or loading FAISS vectorstore...")

index_path = os.path.join(PERSIST_DIR, "faiss_index")
metadata_path = os.path.join(PERSIST_DIR, "faiss_index.pkl")

if os.path.exists(index_path) and os.path.exists(metadata_path):
    vectorstore = FAISS.load_local(PERSIST_DIR, embeddings)
    print("Loaded existing FAISS vectorstore")
else:
    documents = load_pdfs_from_folder("/home/rishulgupta/bigdata/HR_policies")
    vectorstore = FAISS.from_documents(documents, embeddings)
    vectorstore.save_local(PERSIST_DIR)
    print("Created and saved new FAISS vectorstore")

# Perform similarity search
results = vectorstore.similarity_search("Streamlit")
for i, res in enumerate(results, 1):
    print(f"\nResult {i} (File: {res.metadata['source']} Page: {res.metadata['page']}):\n{res.page_content[:500]}...\n")


Creating or loading FAISS vectorstore...
Loading /home/rishulgupta/bigdata/HR_policies/Standard_Operating_Procedure_Technical_Vulnerability_Management_Standard_V.4.7.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Seperation_Policy.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/code-of-conduct-for-prevention-of-insider-trading-v1.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Data_Loss_Prevention_PolicyV1.1.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Investment-Policy-Final.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Acceptable_Use_Policy_V_2.2_29th_Oct_2024.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Disciplinary_Policy.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/Workforce_Redressal_Commitee_Guidelines_Employee.pdf with PyMuPDF...
Loading /home/rishulgupta/bigdata/HR_policies/UAE_policies_24V.1.pdf with PyMuPDF...
Loading /home/rishulgupta/big

In [44]:
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough




# retriever: expects question string, returns list of documents
retriever = RunnableLambda(lambda question: vectorstore.similarity_search(question, k=1))

# Define prompt template with proper message template
message = """
Answer all questions in detail, using the context provided. If the question is not answerable, say "I don't know".
Question:
{question}

Context:
{context}
"""

# Create prompt from human message template
prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(message)
])

# Compose the RAG chain
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

# Example query
query = "what are leave policy?"

response = rag_chain.invoke(query)
print(response.content)


I don't know. 

While the context mentions "LEAVE & ATTENDANCE POLICY", it doesn't provide any details about what a leave policy entails.  

