Importing necessary libraries

In [1]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.tools.retriever import create_retriever_tool
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.agents import AgentExecutor, Tool, create_tool_calling_agent
from langchain_core.prompts import PromptTemplate
import time

  from tqdm.autonotebook import tqdm


Setting Environment Variables from .env File

In [2]:
# Load environment variables from a .env file
load_dotenv()

# Set the GROQ_API_KEY environment variable to the value retrieved from .env file
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

# Set the COHERE_API_KEY environment variable to the value retrieved from .env file
os.environ["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY")

# Set the PINECONE_API_KEY environment variable to the value retrieved from .env file
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")

# Enable Langsmith tracking by setting the LANGCHAIN_TRACING_V2 environment variable to "true"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Set the LANGCHAIN_API_KEY environment variable to the value retrieved from .env file
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")


Set up a Wikipedia API wrapper with customized parameters

In [3]:
# Define the Wikipedia tool
wiki_tool = WikipediaQueryRun(
    api_wrapper=WikipediaAPIWrapper(
        top_k_results=1,
        load_all_available_meta=False,
        doc_content_chars_max=500
    )
)

Define a tool object for accessing Wikipedia information

In [4]:
wiki_tool= Tool(
        name='Wikipedia',
        description='look up things in wikipedia for knowing about food recipes, cooking instructions and their history',
        func=wiki_tool.invoke
    )

Setting Up Path to 'data' Directory Located in Parent Directory

In [5]:
# Get the current working directory
current_path = Path().resolve()

# Get the parent directory of the current working directory
parent_path = current_path.parent

# Construct the path to the 'data' directory located in the parent directory
data_path = parent_path / 'data'

Iterate through files in the specified data path

In [6]:
pdf_files = []  # Initialize an empty list to store PDF files.
for file in data_path.iterdir():  # Iterate through each file in the data path.
    if file.is_file() and file.name.endswith('.pdf'):  # Check if the file is a regular file and has a .pdf extension.
        pdf_files.append(file)  # If the file meets the criteria, append it to the list of PDF files.

In [7]:
pdf_files

[WindowsPath('E:/Learnings/College_project/data/BHM-401T.pdf')]

Loading PDF Documents into a List

In [8]:
docs = []  # Initialize an empty list to store loaded documents.

for filepath in pdf_files:  # Iterate through each filepath in the list of PDF files.
    loader = PyPDFLoader(filepath)  # Create a PyPDFLoader object for the current filepath.
    docs.append(loader.load())  # Load the document using the PyPDFLoader and append it to the list of documents.

Splitting Loaded PDF Documents into Chunks


In [9]:
documents = []  # Initialize an empty list to store the chunks of documents.

# Initialize a text splitter with specified chunk size and overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

for doc in docs:  # Iterate through each loaded document in the docs list.
    splitted_docs = text_splitter.split_documents(doc)  # Split the document into chunks using the text splitter.
    documents.append(splitted_docs)  # Append the list of chunks to the documents list.

In [10]:
len(documents)

1

Initializing the Embedding Model


In [11]:
embedding_model= CohereEmbeddings()

Extracting Page Contents from Document Chunks


In [12]:
contents = []  # Initialize an empty list to store page contents of documents.

for document in documents:  # Iterate through each document chunk in the documents list.
    page_content = []  # Initialize an empty list to store page contents of the current document.
    
    # Iterate through each page in the current document chunk.
    for page in range(0, len(document)):
        page_content.append(document[page].page_content)  # Append the page content to the page_content list.
    
    contents.append(page_content)  # Append the page_content list to the contents list.

Initializing Pinecone 


In [151]:
pc = Pinecone()

Configuring Pinecone Index Settings


In [152]:
# Define the name of the Pinecone index.
index_name = "rasoiguru"  
# Retrieve the cloud provider from environment variables or default to 'aws'.
cloud = os.environ.get('PINECONE_CLOUD') or 'aws' 
# Retrieve the region from environment variables or default to 'us-east-1'.
region = os.environ.get('PINECONE_REGION') or 'us-east-1'  
# Create a serverless specification object with the specified cloud and region.
spec = ServerlessSpec(cloud=cloud, region=region) 

Checking and Creating Pinecone Index


In [153]:
if index_name not in pc.list_indexes().names():  # Check if the specified index name does not exist.
    # Create a new index with specified parameters.
    pc.create_index(
        index_name,  # Specify the name of the index to be created.
        dimension=4096,  # Specify the dimension of the vector embeddings.
        metric='cosine',  # Specify the distance metric for similarity search.
        spec=spec  # Specify the serverless specification for deployment.
    )

Accessing and Describing Pinecone Index


In [154]:
index = pc.Index(index_name)  # Access the Pinecone index with the specified index name.

# Wait briefly for connection before describing index stats.
time.sleep(1)

index.describe_index_stats()  # Describe the statistics of the Pinecone index.

{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'nsBHM-401T': {'vector_count': 245}},
 'total_vector_count': 245}

Generating Namespace Names for PDF Files

In [155]:

# Generate namespace names for each PDF file.
ns = ["ns" + path.stem for path in pdf_files]

Creating or Retrieving Vector Stores


In [156]:
vectorstores = []  # Initialize an empty list to store vector stores.

if index.describe_index_stats()['total_vector_count'] == 0:  # Check if no vectors are present in the index.
    # Create vector stores from document contents.
    for namespace, content in zip(ns, contents):  # Iterate through namespaces and corresponding document contents.
        vectorstore = PineconeVectorStore.from_texts(
            texts=content,  # Provide the document contents as texts.
            index_name=index_name,  # Specify the name of the Pinecone index.
            embedding=embedding_model,  # Specify the embedding model for generating vectors.
            namespace=namespace  # Specify the namespace for the vector store.
        )
        vectorstores.append(vectorstore)  # Append the created vector store to the list.

else:
    # Retrieve vector stores from an existing index.
    for namespace in ns:  # Iterate through namespaces.
        vectorstore = PineconeVectorStore.from_existing_index(
            index_name,  # Specify the name of the Pinecone index.
            embedding_model,  # Specify the embedding model used in the index.
            namespace=namespace  # Specify the namespace for the vector store.
        )
        vectorstores.append(vectorstore)  # Append the retrieved vector store to the list.

In [157]:
vectorstores

[<langchain_pinecone.vectorstores.PineconeVectorStore at 0x239a029b7f0>]

Converting Vector Stores to Retrievers


In [158]:
retrievers = []  # Initialize an empty list to store retrievers.

for vectorstore in vectorstores:  # Iterate through each vector store.
    retriever = vectorstore.as_retriever()  # Convert the vector store to a retriever.
    retrievers.append(retriever)  # Append the retriever to the list.

In [159]:
retrievers

[VectorStoreRetriever(tags=['PineconeVectorStore', 'CohereEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x00000239A029B7F0>)]

Creating Search Tools


In [160]:
tools = []  # Initialize an empty list to store search tools.

# Lists containing tool names and descriptions
tools_name = [
    'BHM-401T_pdf_search',
    # 'Book1_pdf_search',
    # 'Professional_Cooking_pdf_search',
    # 'USU-Student-Cookbook-FINAL-1_pdf_search'
]

tools_desc = [
    "Indian food cooking and heritage related information use this tool",
    # "For information related to any ingredient use this tool",
    # "For professional cooking techniques, sanitization and safety in kitchen and food presentation tips use this tool",
    # "For specific information about quick recipes for students and seasonal grocery shopping use this tool"
]

# Create search tools for each retriever and append them to the tools list
for name, desc, retv in zip(tools_name, tools_desc, retrievers):
    pdf_tool = create_retriever_tool(retv, name, desc, document_prompt="Search the query")  # Create a search tool for each retriever
    tools.append(pdf_tool)  # Append the search tool to the tools list

# Append the Wikipedia tool to the tools list
tools.append(wiki_tool)  # Append the Wikipedia tool to the tools list

In [193]:
len(tools)

2

Initializing Large Language Model

In [194]:
llm = ChatGroq(model="mixtral-8x7b-32768")

Setting System Instructions

In [195]:
system_instruction= """
You are a helpful cooking assistant named Rasoiguru.\
Greet the user\
Answer the following questions as best you can in terms of a passionate and helpful  professional cooking assistant\
"""

Prompt Format Definition for Agent Responses


In [196]:
format= """
Use the following format:

Use the chat history which will be provided to you for understanding the context of the most recent conversation incase user query is not clearly defined\
Question: the input question you must answer\
Thought: you should always think about what to do\
Action: the action to take, should be one of the provided tools\
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
## You need to always give an final answer

Remember to answer as a compansionate professional cooking assistant when giving your final answer.
"""

Prompt Template Configuration


In [197]:
prefix = f"""You have access to the following tools:
Tools:
{tools}
Instruction:
{system_instruction}.
"""

suffix = """Begin!Now answer the question
{intermediate_steps}
Chat history:
{chat_history}
Question: {input}
{agent_scratchpad}
## In case the user query is not about food cooking, grocery shopping and history of food the  reply\
then reply I do not know the answer to your question.
"""

prompt = PromptTemplate(
    input_variables=["input", "chat_history", "intermediate_steps", "agent_scratchpad"],
    template= prefix + format + suffix
)

In [203]:
prompt.template

"You have access to the following tools:\nTools:\n[Tool(name='BHM-401T_pdf_search', description='Indian food cooking and heritage related information use this tool', args_schema=<class 'langchain_core.tools.RetrieverInput'>, func=functools.partial(<function _get_relevant_documents at 0x00000239A23A41F0>, retriever=VectorStoreRetriever(tags=['PineconeVectorStore', 'CohereEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x00000239A029B7F0>), document_prompt='Search the query', document_separator='\\n\\n'), coroutine=functools.partial(<function _aget_relevant_documents at 0x00000239A23A43A0>, retriever=VectorStoreRetriever(tags=['PineconeVectorStore', 'CohereEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x00000239A029B7F0>), document_prompt='Search the query', document_separator='\\n\\n')), Tool(name='Wikipedia', description='look up things in wikipedia for knowing about food recipes, cooking instructions a

Agent Creation with Tool Integration


In [199]:
agent= create_tool_calling_agent(llm,tools= tools,  prompt= prompt)

Memory Integration for Agent Interaction


In [200]:
memory= ConversationBufferWindowMemory(k=3,return_messages=True, memory_key= "chat_history")

Agent Executor Initialization

In [201]:
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, memory=memory)


Agent Executor: Task Invocation 


In [202]:
result = agent_executor.invoke({"input": "classification of food based on vargas"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `BHM-401T_pdf_search` with `{'query': 'classification of food based on vargas'}`


[0m

AttributeError: 'str' object has no attribute 'input_variables'

In [171]:
result

{'input': 'how to make sandwich',
 'chat_history': [],
 'output': 'Thought: The user is asking how to make a sandwich. I should provide them with instructions on how to make a sandwich.\n\nAction: I will use the Wikipedia tool to gather more information about sandwiches.\n\nAction Input: sandwich\n\nObservation:\n\nPage: Sandwich\nSummary: A sandwich is a dish typically consisting of meat, cheese or vegetables used as a filling between slices of bread, or placed atop a slice of bread; or, more generally, any dish in which bread serves as a container or wrapper for another food type, and allows it to be a finger food. The sandwich began as a portable, convenient food in the Western world, though over time it has become prevalent worldwide.\nThere has been social media debate over the precise definition of sandwich.\n\nThought: Now that I have more information about sandwiches, I can provide the user with instructions on how to make a sandwich.\n\nFinal Answer: To make a sandwich, you wi

Extracting the Final Answer from a Result String

In [172]:
# Convert the 'output' field from the 'result' dictionary to a string
result = str(result)

# Define the marker indicating the start of the final answer
start_marker = "Final Answer:"

# Find the index where the start_marker appears in the result string
start_index = result.find(start_marker)
print(start_index)  # Print the index of the start_marker (for debugging purposes)

# Check if the start_marker is found in the result string
if start_index != -1:
    # Extract the substring that comes after the start_marker and strip any leading/trailing whitespace
    result = result[start_index + len(start_marker):].strip()
else:
    # If the start_marker is not found, set result to an empty string
    result = ""

# Print the final extracted result
print(result)

958
To make a sandwich, you will need two slices of bread. Spread your desired condiments on one or both slices. Place a filling of your choice, such as meat, cheese, or vegetables, on one slice. Top it off with the second slice of bread. Press down gently, cut the sandwich in half diagonally if desired, and serve. Enjoy your meal!'}
