In [1]:
##This code snippet is importing various libraries and modules in Python for different functionalities related
## to natural language processing (NLP) and machine learning tasks. Here is a breakdown of what each import statement is doing:



import os
from uuid import uuid4
import time
from pinecone import Pinecone
import docx
from typing import List
from langchain_pinecone import PineconeVectorStore

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.retrievers import BM25Retriever, MultiVectorRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer
from langchain.evaluation.qa import QAEvalChain
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dotenv import load_dotenv
from pinecone import ServerlessSpec
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
load_dotenv()
PINECONE_KEY = os.getenv("PINECONE_APIKEY")
PINECONE_ENV = os.getenv("PINECONE_ENVIRONMENT")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
##This code snippet is performing the following tasks:
# Load a PDF and validate the page count
file_path = "W:\\Assigment\\2023050195.pdf"
loader = PyPDFLoader(file_path)
pages = loader.load()

if len(pages) < 200:
    raise ValueError("The PDF must have at least 200 pages.")

#Using semantic chunking with recursive text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(pages)


# Hugging Face embedding
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [3]:
pinecone = Pinecone(api_key=PINECONE_KEY)
index_name = "rag-index-new"

if not pinecone.has_index(index_name):
    pinecone.create_index(index_name, dimension=384, metric="cosine",spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  ))
index = pinecone.Index(index_name)


In [4]:
chunk_texts = [chunk.page_content for chunk in chunks]
docs=[]
uuids=[]


In [5]:
for chunks in chunk_texts:
    uuids.append(str(uuid4()))
    doc=Document( page_content=chunks,metadata={"source":"Constitution of India"})
    docs.append(doc)


In [6]:
from langchain.vectorstores import Pinecone as LC_Pinecone


In [7]:
docssearch=PineconeVectorStore(index=index,embedding=embedding_model)
#docsearch = LC_Pinecone.from_documents(documents=chunks, embedding=embedding_model, index_name=index_name)

In [8]:
docssearch.add_documents(documents=docs,ids=uuids)

['0b0111a0-de83-47d7-a994-e39ede63dd23',
 '4fbd2e1d-2b8c-454e-be7f-def7c6b08bcb',
 '5b753139-5eeb-4bfe-a4e0-b5e71c163a25',
 '512686fa-5d7c-46d7-9dc5-c8692ea02220',
 '8a2e87e1-64fe-4d3b-b7d7-402bfcb5824e',
 'fd802aaa-426c-48cf-ad1d-621620337b33',
 '784979a4-93b1-4e71-92f8-3addfbd25369',
 'a3b7609c-713f-406f-a15c-23349a01c57e',
 'a1cea0eb-83c7-4dc7-9860-93f76f193ac4',
 'ef67f262-dbff-49fb-8a8e-52dc7b2aae66',
 'f588c409-fe5c-4bc5-bf28-4f9c3f6d3555',
 '7c412bae-60ee-4f78-b911-239b954d916a',
 'a1945579-33ad-414d-843e-708cd9692b9d',
 '5fdb7986-b67c-4759-b868-5839c4ebefc1',
 '15ce6867-43c6-42b8-9f04-2fb084cb1b9d',
 'd0d32945-e4ed-4e49-8b4c-948d7b75a30b',
 '08e45dd0-37b0-4a03-9603-6c86f4a86f45',
 'd9df6c2c-b35d-47cd-ac3e-dc684b2de2b6',
 '0293f976-eba0-46af-9743-94c441ae2301',
 'd19cc3fd-886a-4461-938e-288a0f4f1545',
 '0576ce19-24db-4fef-b42d-98f2b839f252',
 '810f162a-901a-4bbe-854a-0e1549ef8a79',
 '13416d49-f965-4307-bf38-92243a0fc112',
 '384cd48f-ffa3-44cd-8770-813068f5e7b1',
 '5fc391b5-d684-

In [14]:
from langchain.schema import Document

chunks = [Document(page_content=chunk) for chunk in chunks]


In [15]:
from langchain.vectorstores import FAISS

faiss_flat = FAISS.from_documents(chunks, embedding_model)
faiss_ivf = FAISS.from_documents(chunks, embedding_model)  # IVF only by clustering before indexing (simulated)


In [16]:
retriever_pinecone = docssearch.as_retriever()
retriever_flat=faiss_flat.as_retriever()
retriever_ivf=faiss_ivf.as_retriever()

In [None]:
##This code snippet is measuring the time taken to retrieve documents related to the query 
##"What are the fundamental rights in the Constitution of India?" using different retriever methods.


query="What are the fundamental rights in the Constitution of India?"
def measure_time(retriever):
    start = time.time()
    docs = retriever.invoke(query)
    return time.time() - start, docs

pine_time, pine_docs = measure_time(retriever_pinecone)
flat_time, flat_docs = measure_time(retriever_flat)
ivf_time, ivf_docs = measure_time(retriever_ivf)

print(f"Pinecone (HNSW): {pine_time:.3f}s | Flat: {flat_time:.3f}s | IVF: {ivf_time:.3f}s")



Pinecone (HNSW): 1.433s | Flat: 0.049s | IVF: 0.001s


In [18]:
print(pine_docs)

[Document(id='b240d17c-733e-4808-8f67-b096d73bffc3', metadata={'source': 'Constitution of India'}, page_content='THE CONSTITUTION OF  INDIA \n(Part III.—Fundamental Rights) \n19\nRight  to Constitutional Remedies \n32. Remedies for enforcement of rights conferred by this Part.—(1) \nThe right to move the Supreme Court by appropriate proceedings for the \nenforcement of the rights conferred by this Part is guaranteed. \n(2) The Supreme Court shall have power to issue directions or orders or \nwrits, including writs in the nature of habeas corpus, mandamus, prohibition,'), Document(id='c2dbe75b-b06f-45a7-86f9-7899e6ac68a5', metadata={'source': 'Constitution of India'}, page_content='THE CONSTITUTION OF  INDIA \n(Part III.—Fundamental Rights) \n19\nRight  to Constitutional Remedies \n32. Remedies for enforcement of rights conferred by this Part.—(1) \nThe right to move the Supreme Court by appropriate proceedings for the \nenforcement of the rights conferred by this Part is guaranteed. \n

In [19]:
print(flat_docs)

[Document(id='42e75f17-91f5-43cd-bd3a-de6fc62bd50b', metadata={}, page_content='r'), Document(id='f8e81f58-3503-4364-bf99-af2521e35acd', metadata={}, page_content='r'), Document(id='82cd5553-accd-4263-9237-3ff7044824d2', metadata={}, page_content='r'), Document(id='3f752235-77dc-46b5-a15a-44b228856003', metadata={}, page_content='r')]


In [None]:
##This code snippet defines a function `compute_accuracy` that calculates the similarity scores between a query and a list of documents using a pre-trained SentenceTransformer model. 
#Here's a breakdown of what the code does:
def compute_accuracy(query, docs):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_emb = model.encode(query)
    doc_embs = model.encode([doc.page_content for doc in docs])
    print("Calculating Similarity Scores")
    sims = cosine_similarity([query_emb], doc_embs)[0]
    return float(np.max(sims))

print("Accuracy Scores:")
print("Pinecone:", compute_accuracy(query, pine_docs))
print("Flat:", compute_accuracy(query, flat_docs))
print("IVF:", compute_accuracy(query, ivf_docs))


Accuracy Scores:
Calculating Similarity Scores
Pinecone: 0.7402980327606201
Calculating Similarity Scores
Flat: 0.14819082617759705
Calculating Similarity Scores
IVF: 0.14819082617759705


In [24]:
# BM25 reranker example
bm25 = BM25Retriever.from_documents(chunks)
bm25.k = 5
bm25_docs = bm25.get_relevant_documents(query)


In [25]:
prompt_template=PromptTemplate(
    input_variables=["context", "question"],
    template="You are an assistant. Use the following context to answer the question.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
)

In [26]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [29]:
import os
os.environ['GEMENI_APIKEY']=os.getenv("GEMENI_APIKEY")

In [32]:
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash',google_api_key=os.environ["GEMENI_APIKEY"])

rag_chain = (
    {"context": retriever_pinecone | format_docs, "question": RunnablePassthrough()} | prompt_template | model | StrOutputParser()
)


In [33]:
result=rag_chain.invoke("What are the fundamental duties.")

In [34]:
result

'Based on the provided text, the fundamental duties of every citizen of India are:\n\n(a) To abide by the Constitution and respect its ideals and institutions, the National Flag and the National Anthem.\n(b) To cherish and follow the noble ideals which inspired our national struggle for freedom.\n(c) To uphold and protect the sovereignty, unity and integrity of India.\n(d) To defend the country and render national service when called upon to do so.'

In [35]:
doc=docx.Document()
doc.add_heading("Rag chain results",0)
doc.add_paragraph(result)
doc.save("Ragresults.docx")

In [36]:
result1=rag_chain.invoke("What is the procedure to amend the Constitution of India?")

In [43]:
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE

# Load an existing DOCX file
doc1 = Document("Ragresults.docx")

# Add a new heading
doc1.add_heading("New RAG Output Section", level=1)

# Add a new paragraph with custom formatting
result_text = str(result1)  # Make sure result is a string

# Add paragraph
para = doc1.add_paragraph()
run = para.add_run(result_text)





# Save (overwrite the same file)
doc1.save("Ragresults.docx")
