In [1]:
!pip install -Uq langchain langchain-openai

In [None]:
import os
from langchain_openai import AzureChatOpenAI
# Non-secret Azure OpenAI connection settings. The model clients in this notebook
# are constructed without explicit endpoint/version arguments, so they rely on these.
os.environ["OPENAI_API_VERSION"] = "2024-12-01-preview"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://ai-agents-sept-cohort-resource.cognitiveservices.azure.com/"

# Secrets are never hard-coded: a value already exported in the environment is kept,
# otherwise it is requested interactively without being echoed or saved in the notebook.
from getpass import getpass

for _secret_name in ("AZURE_OPENAI_API_KEY", "PINECONE_API_KEY"):
  if not os.environ.get(_secret_name):
    os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [4]:
# Chat model shared by the plain RAG call and both agents in this notebook.
# No endpoint, API version or key is passed here, so they must already be in the environment.
llm = AzureChatOpenAI(deployment_name="gpt-4.1", temperature=1, top_p=0.8)

In [5]:
# Smoke test: send a single message to confirm the deployment and credentials work.
llm.invoke('hello')

In [6]:
! wget https://vidvattarecordings.blob.core.windows.net/pdfs/0132667023-HDFC-Life-Group-Unit-Linked-Future-Secure-Plan.pdf
! wget https://vidvattarecordings.blob.core.windows.net/pdfs/HDFC-GROUP-GRATUITY-BROCHURE.pdf
! wget https://vidvattarecordings.blob.core.windows.net/pdfs/HDFC-LIFE-Group-Variable-Employee-Benefit-Plan.pdf
! wget https://vidvattarecordings.blob.core.windows.net/pdfs/HDFC-Life-Group-Term-Insurance-Plan.pdf



In [7]:
# List the working directory to check that the PDFs were downloaded.
! ls

In [26]:
! pip install -qU langchain-community langchain-pymupdf4llm langchain-pinecone 

In [13]:
import os
# Keep only real PDF files: test the extension (case-insensitively) rather than a
# substring, so names like "report.pdf.1" or "notes.pdf.txt" are excluded.
# Sorted because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(file for file in os.listdir() if file.lower().endswith('.pdf'))
pdf_files

In [14]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
def pdf_document_loader(file_path):
  """Load one PDF with PyMuPDF4LLMLoader and return the documents it produces.

  The number of documents loaded from the file is printed as a progress indicator.
  """
  documents = PyMuPDF4LLMLoader(file_path).load()
  print(len(documents))
  return documents

# Parse every discovered PDF and gather the resulting documents in one flat list.
parsed_docs = []
for pdf_file in pdf_files:
  parsed_docs += pdf_document_loader(pdf_file)

In [15]:
# Total number of documents loaded across all PDFs.
len(parsed_docs)

In [30]:
# Preview only the first couple of documents; displaying the whole list floods the cell output.
parsed_docs[:2]

In [None]:
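## Recursive character splitting (currently disabled: parsed_docs is indexed as loaded, without further chunking).
## To enable it, uncomment the lines below and index docs_splits instead of parsed_docs.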

# from langchain_text_splitters import RecursiveCharacterTextSplitter

# splitter_recursive = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )
# docs_splits = splitter_recursive.split_documents(parsed_docs)
# print(f"Chunks with RecursiveCharacterTextSplitter: {len(docs_splits)}")

In [23]:
from langchain_openai import AzureOpenAIEmbeddings

# Embedding model used by the vector store for both indexing and querying.
embeddings = AzureOpenAIEmbeddings(model="text-embedding-3-large")

In [25]:
# Embed a sample string and show the length of the returned vector.
# NOTE(review): presumably this must equal the dimension of the Pinecone index — confirm.
len(embeddings.embed_query('hello'))

In [28]:
# Pinecone vector db

from pinecone import Pinecone

# Create the Pinecone client from the key held in the environment.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Handle to the index used by this notebook (nothing here creates it, so it is assumed to exist).
index = pc.Index("policy-agenticrag")

In [29]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index as a LangChain vector store, passing `embeddings` as its embedding model.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [34]:
from uuid import uuid4

from collections import Counter
from uuid import NAMESPACE_URL, uuid5

# Derive every ID from the document's source file and its position within that file.
# Random IDs would add a second copy of each vector whenever this cell is re-run;
# stable IDs let a re-run write to the same records instead.
_docs_seen_per_file = Counter()
uuids = []
for doc in parsed_docs:
  _source_path = doc.metadata.get('file_path', '')
  uuids.append(str(uuid5(NAMESPACE_URL, f"{_source_path}#{_docs_seen_per_file[_source_path]}")))
  _docs_seen_per_file[_source_path] += 1

vector_store.add_documents(documents=parsed_docs, ids=uuids)

In [42]:
# Retriever limited to 5 results, filtered by a relevance-score threshold of 0.4.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(k=5, score_threshold=0.4),
)

# Try it on a sample question.
retrieved_docs = retriever.invoke("what is the age eligibility for term insurance")

In [44]:
# For each hit, print its source file, its text, and a line of 100 asterisks as a divider.
for doc in retrieved_docs:
  print(doc.metadata['file_path'], doc.page_content, '*' * 100, sep='\n')

In [61]:
def get_chunks(query: str) -> str:
  """Search the insurance-policy vector store and return the passages most relevant to the query.

  Args:
    query: Natural-language question or search phrase.

  Returns:
    The text of at most 5 passages that pass the 0.4 relevance-score threshold,
    joined by single spaces; an empty string when nothing qualifies.
  """
  retriever = vector_store.as_retriever(
      search_type="similarity_score_threshold",
      search_kwargs={"k": 5, "score_threshold": 0.4},
  )
  retrieved_docs = retriever.invoke(query)
  return ' '.join(doc.page_content for doc in retrieved_docs)




In [49]:
user_query = 'what is the age eligibility for term insurance and who should be contacted for Grievance'

# Plain (non-agent) RAG: fetch the context first, then send question and context in one prompt.
chunk_text = get_chunks(user_query)
prompt = f'user_query : {user_query}, context: {chunk_text}, answer user_query based on chunk text'
response = llm.invoke(prompt)

In [50]:
# Show only the text content of the model's reply.
print(response.content)

In [58]:
from langchain.agents import create_agent

# Build an agent around the chat model with get_chunks as its only tool.
# The system prompt restricts it to insurance-policy questions and tells it to
# answer 'not supported' for anything else.
agent = create_agent(llm, 
                     tools=[get_chunks],
                     system_prompt="""you are a agentic rag agent, 
                     your goal is answer queries related to insurance policy document, 
                     if user asks anythin beyond this, you say 'not supported'""")

In [53]:
# Run the agent on the question held in user_query and display what it returns.
response = agent.invoke({'messages': user_query})
response


In [59]:
user_query = 'what is the age eligibility for term insurance and who should be contacted for Grievance'

# Stream the agent run and print each item as soon as it is emitted.
for messages in agent.stream({'messages': user_query}):
  print(messages)

In [64]:
from langchain.agents import create_agent

def get_chunks_with_metadata(query: str, filename: str = "") -> str:
  """Retrieve insurance-policy passages relevant to the query, optionally restricted to one policy file.

  Args:
    query: Natural-language question or search phrase.
    filename: Policy file name as returned by get_filenames_of_policies. Pass an
      empty string (what that tool returns for an unknown policy) to search
      across all policy documents.

  Returns:
    The text of at most 5 passages that pass the 0.4 relevance-score threshold,
    joined by single spaces; an empty string when nothing qualifies.
  """
  retriever = vector_store.as_retriever(
      search_type="similarity_score_threshold",
      search_kwargs={"k": 5, "score_threshold": 0.4},
  )
  # An empty filename would filter on file_path == '', which no indexed document is
  # expected to have, so the search is only constrained when a file name was resolved.
  # NOTE(review): assumes the stored 'file_path' metadata equals the bare file name — confirm.
  if filename:
    retrieved_docs = retriever.invoke(query, filter={"file_path": filename})
  else:
    retrieved_docs = retriever.invoke(query)
  return ' '.join(doc.page_content for doc in retrieved_docs)

def get_filenames_of_policies(query: str) -> str:
  """Get the filename for the given policy, the query can take any 1 value from below, use NA if it does not match any list here

  term insurance
  variable employee benefit
  group gratuity brochure
  future secure plan
  NA

  Matching ignores letter case and leading, trailing or repeated whitespace.
  Returns the PDF file name of the policy, or an empty string when the query
  is NA or does not match any policy in the list.
  """
  policy_file_references = {'future secure plan': '0132667023-HDFC-Life-Group-Unit-Linked-Future-Secure-Plan.pdf',
   'group gratuity brochure': 'HDFC-GROUP-GRATUITY-BROCHURE.pdf',
   'term insurance': 'HDFC-Life-Group-Term-Insurance-Plan.pdf',
   'variable employee benefit': 'HDFC-LIFE-Group-Variable-Employee-Benefit-Plan.pdf'}
  # The argument is written by the LLM, so variations such as "Term Insurance " must
  # still resolve: collapse whitespace and lower-case before the lookup.
  normalized_query = ' '.join(str(query or '').split()).lower()
  return policy_file_references.get(normalized_query, '')

# Second agent: it is given two tools, one that maps a policy name to its PDF file
# name and one that retrieves passages filtered by that file name. The system prompt
# tells the agent to call the file-name tool first and to refuse off-topic questions.
agentic_rag_metadata = create_agent(llm, 
                     tools=[get_filenames_of_policies, get_chunks_with_metadata],
                     system_prompt="""you are a agentic rag agent, 
                     your goal is answer queries related to insurance policy document, 
                     if user asks anythin beyond this, you say 'not supported'
                     Tool call rules:
                     first use get_filenames_of_policies to idenityf policy document file name which can be used in  get_chunks_with_metadata
                     """)

In [67]:
# Ask the metadata-aware agent a question that does not name a specific policy.
agentic_rag_metadata.invoke(
  {'messages': ['what is the age eligibility and who should be contacted for Grievance']}
)