In [1]:
from typing import TypedDict, List , Annotated
from langgraph.graph import StateGraph
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_tavily import TavilySearch
from langchain_core.messages import ToolMessage,HumanMessage,AIMessage,SystemMessage
from langchain_core.tools import tool
import os
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.types import Command, interrupt
from dotenv import load_dotenv

load_dotenv()

True

In [2]:

def document_load_embed_store(filepath: str) -> str:
    """
    Loads a PDF document from a given file path using the PyMuPDFLoader.
    Splits the loaded pages into smaller chunks, embeds them using a sentence transformer model,
    and stores the resulting vector representations in a persistent Chroma vector database.


    Args:
        filepath (str): The path to the PDF file. Surrounding whitespace and
            single or double quotes around the path are ignored.

    Returns:
         str: Confirmation message indicating successful storage of embeddings

    Raises:
        FileNotFoundError: If the specified file does not exist.
        ValueError: If the file path is not a PDF.
        RuntimeError: If embedding or storing the chunks fails.
    """
    print("document_loader tool is called")
    # Paths often arrive wrapped in whitespace and/or quotes, e.g. ' "file.pdf" '
    # or '"file.pdf"' nested in another pair. Strip whitespace first, then both
    # quote characters in a single pass so the order of the quotes does not matter.
    clean_path = filepath.strip().strip("\"'").strip()

    if not os.path.exists(clean_path):
        raise FileNotFoundError(f"File not found: {clean_path}")

    if not clean_path.lower().endswith(".pdf"):
        raise ValueError("Only PDF files are supported for loading.")

    loader = PyMuPDFLoader(clean_path)
    documents = loader.load()

    # Prefer paragraph breaks, but fall back to progressively finer separators:
    # with "\n\n" as the only separator, a page without blank lines could not be
    # split at all and would end up as a single chunk larger than chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=500,
        chunk_overlap=50,
    )
    chunks = text_splitter.split_documents(documents)

    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma(
            embedding_function=embeddings,
            persist_directory="chroma_index"
        )
        vectorstore.add_documents(chunks)
        # persist() is reported as deprecated by the installed wrapper and may be
        # missing from newer ones (TODO confirm against the installed version);
        # flush explicitly only when the method is actually available.
        persist = getattr(vectorstore, "persist", None)
        if callable(persist):
            persist()
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f"Failed to embed and store documents: {e}") from e

    return "Documents have been embedded and stored in the Chroma vector database."


In [3]:
@tool
def retrieve_similar_documents(query: str) -> List[str]:
    """
    Loads the persisted Chroma vector database from the "chroma_index" directory
    and performs a similarity search based on a natural language query.

    Args:
        query (str): The user's search query to find similar documents.

    Returns:
        List[str]: A list of the page content strings of the 4 most relevant documents.
    """
    # NOTE: this docstring is what @tool exposes as the tool description, so it
    # must only describe arguments that the signature really accepts.
    print("retrieve_similar_documents tool is called")
    # Same embedding model and directory as used when the documents were stored.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory="chroma_index"
    )

    results = vectorstore.similarity_search(query, k=4)
    return [doc.page_content for doc in results]


In [4]:
@tool
def websearch(query: str) -> List[str]:
    """
    Perform a web search using TavilySearch tool.
    
    Args:
        query (str): The search query.
        
    Returns:
        List[str]: A list of content from the search results.
    """
    print("websearch tool is called")

    # Client configured for at most five hits on the "general" topic.
    tavily = TavilySearch(max_results=5, topic="general")
    response = tavily.invoke({"query": query})

    # Keep only the text body of every hit, in the order they were returned.
    return [hit["content"] for hit in response["results"]]



In [5]:
# Checkpointer handed to the compiled graph so state is saved between invocations.
memory = MemorySaver()

# Run configuration shared by every graph call: all turns use thread "1".
config = dict(configurable=dict(thread_id="1"))

In [6]:
class State(TypedDict):
    """Graph state shared by every node: just the running message list."""

    # Updates to this key are combined through the add_messages reducer
    # instead of overwriting the previous value.
    messages: Annotated[list, add_messages]

In [7]:
# Every callable the model may invoke: document ingestion, retrieval, web search.
tools = [
    document_load_embed_store,
    retrieve_similar_documents,
    websearch,
]

In [8]:
# Gemini chat model with the tool list bound to it, so replies can carry tool calls.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash").bind_tools(tools)

In [9]:
def chatmodel(state: State):
    """Run the tool-enabled LLM on the conversation so far.

    Args:
        state: Graph state whose "messages" entry holds the conversation history.

    Returns:
        dict: A partial state update containing only the model's new reply.
        The "messages" key of State is declared with the add_messages reducer,
        which merges this update into the stored history, so the existing
        history does not need to be (and should not be) echoed back.
    """
    output = llm.invoke(state["messages"])
    return {"messages": [output]}

In [10]:
# Graph under construction, using State as its schema.
graph_builder = StateGraph(State)

# Prebuilt node that runs the tools from the shared tool list.
tool_node = ToolNode(tools=tools)

In [11]:
# Nodes: the LLM step and the tool-execution step.
graph_builder.add_node("chatbot", chatmodel)
graph_builder.add_node("tools", tool_node)

# Every run starts at the chatbot.
graph_builder.add_edge(START, "chatbot")

# After the chatbot speaks, tools_condition picks the next hop
# (presumably the "tools" node when the reply has tool calls, otherwise the end).
graph_builder.add_conditional_edges("chatbot", tools_condition)

# Tool results always go back to the chatbot for the next step.
graph_builder.add_edge("tools", "chatbot")

# Compile with the checkpointer so state is saved per thread.
graph = graph_builder.compile(checkpointer=memory)

In [12]:
# System prompt text, kept verbatim: it tells the model when to call each tool
# (document upload, retrieval, web search) and how to sequence repeated calls.
SYSTEM_PROMPT = """
Document Upload Procedure: When a user asks to upload a document, you must perform these steps in sequence:
* First, call the document_load_embed_store tool to load the file.
when asked question related to the document, you must first call the retrieve_similar_documents tool to fetch relevant content.                   
* For any document-related questions, always base your answers solely on the retrieved content from the documents, not your own knowledge.
* If the user asks a question that is not related to the document, you can use the websearch tool to find relevant information.
 When handling user queries that require retrieving information from multiple documents:

If a single tool  must be used multiple times, you must:

Call the tool sequentially, passing only one query at a time

Wait for the response from each call before proceeding to the next
When multiple documents are uploaded and the user asks a question that may relate to more than one of them, you must:

Invoke the retriever tool separately for each document by issuing the query one at a time.

Wait for each retriever tool response to complete before sending the next query.

Accumulate the context from each document response.

Once all responses are received, synthesize a unified, accurate answer for the user's original query based on the combined context.

Accumulate and merge context from all results before generating a final response

If multiple tools are required to gather information , you may call them in parallel where appropriate to optimize for time.
"""

# Message object carrying the prompt into the conversation.
msg = SystemMessage(content=SYSTEM_PROMPT)

In [13]:
# Console chat loop: keeps reading user messages until the user types "end"
# (case-insensitive, surrounding whitespace ignored).
user_input = input("Enter your message to llm :")

while user_input.strip().lower() != "end":
    conver = HumanMessage(content=user_input)
    print("USER :", user_input)

    # The graph is compiled with a checkpointer, so this thread's history is
    # already stored between turns. Send the system prompt only when the thread
    # has no messages yet; later turns just add the new human message instead of
    # feeding the system prompt into the history again on every turn.
    has_history = bool(graph.get_state(config).values.get("messages"))
    turn_messages = [conver] if has_history else [msg, conver]

    response = graph.invoke({"messages": turn_messages}, config=config)
    # The last message in the returned state is the model's final answer for this turn.
    print("AI :", response['messages'][-1].content)
    user_input = input("YOU :")

USER : Hi Iam Dayakar
AI : Hi Dayakar! How can I help you today?
USER : i want to upload a document and ask questions about it
AI : Okay, I can help you with that. Please provide the file path of the document you would like to upload.
USER : "C:\Users\anush\OneDrive\Desktop\practice\lang\own_project\pdfs\project.pdf"
document_loader tool is called


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  vectorstore = Chroma(
  vectorstore.persist()


AI : Great! The document has been uploaded, embedded, and stored. What questions do you have about the document?
USER : what does the document content is about
retrieve_similar_documents tool is called
AI : Based on the retrieved content, the document appears to be a project report titled "PERFORMANCE EVALUATION OF R-290 REFRIGERANT WITH GRAPHENE NANO PARTICLES." It includes acknowledgements, a declaration, table of contents, and lists of tables and figures. The project was carried out by Saiteja Bojja, Dayakar Gone, Karthikeya Kola, and Mahesh Budala under the guidance of Dr. M Shailaja at JNTUH University College of Engineering Sultanpur.
USER : yeah great tell me the process they followed
retrieve_similar_documents tool is called
AI : Based on the retrieved information, the project involved the "PERFORMANCE EVALUATION OF R-290 REFRIGERANT WITH GRAPHENE NANO PARTICLES". The document includes a description of the VCR (Vapor Compression Refrigeration) system, including the compression 

In [14]:
# Show every message held in the state returned by the last graph invocation.
for message in response["messages"]:
    message.pretty_print()



Document Upload Procedure: When a user asks to upload a document, you must perform these steps in sequence:
* First, call the document_load_embed_store tool to load the file.
when asked question related to the document, you must first call the retrieve_similar_documents tool to fetch relevant content.                   
* For any document-related questions, always base your answers solely on the retrieved content from the documents, not your own knowledge.
* If the user asks a question that is not related to the document, you can use the websearch tool to find relevant information.
 When handling user queries that require retrieving information from multiple documents:

If a single tool  must be used multiple times, you must:

Call the tool sequentially, passing only one query at a time

Wait for the response from each call before proceeding to the next
When multiple documents are uploaded and the user asks a question that may relate to more than one of them, you must:

Invoke the re