# Setup

In [1]:
from dotenv import load_dotenv
env = load_dotenv()

In [2]:
from langgraph.graph import StateGraph, END
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_openai import ChatOpenAI
from IPython.display import Image
import IPython



# Tools

In [3]:
from typing import TypedDict, Annotated, Dict, Any
import operator

class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], operator.add]

In [16]:
# Download PDF tool
from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.pydantic_v1 import BaseModel, Field
from bs4 import BeautifulSoup
import tiktoken
import requests

# Search for relevant menu
search_tool = TavilySearchResults()


# Scrape menu data
class ScrapeInput(BaseModel):
    url: str = Field(description="the URL of the menu page")

@tool("scrape_pdf", args_schema=ScrapeInput, return_direct=True)
def scrape_pdf(url: str):
    """Scrape a webpage that may include links to a restaurants current menu and return the links"""
    pdf_links = []
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')
        pdf_links = [link.get('href') for link in links if link.get('href').endswith('.pdf')]
    except Exception as e:
        print(f"failed to scrape {url} ERROR: {e}")
    return pdf_links

@tool("scrape_text", args_schema=ScrapeInput, return_direct=True)
def scrape_text(url: str):
    """Scrape the text directly from a website"""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        # Get text
        text = soup.get_text(separator=' ')
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        print(f"failed to scrape {url} ERROR: {e}")
        return None
    return text

# Encode & Upsert menu data
enc = tiktoken.encoding_for_model("gpt-4o")

class tokenizerInput(BaseModel):
    pdf_url: str = Field(description="The url of the PDF file that must be tokenized")

# Tokenize the pdf 
# @tool("tokenize-pdf", args_schema=tokenizerInput, return_direct=True)
# def tokenize_pdf():
#     pass

'\n# Tokenize the pdf \n@tool("tokenize-pdf", args_schema=tokenizerInput, return_direct=True)\ndef tokenize_pdf():\n    pass\n'

In [17]:
tools = [search_tool, scrape_pdf, scrape_text]

# RAG Model

In [18]:
class Model:
    # Define the model
    def __init__(self, model, tools, system=""):
        self.system = system
        graph = StateGraph(AgentState)
        graph.add_node("llm", self.openai_inference)
        graph.add_node("action", self.take_action)
        graph.add_conditional_edges(
            "llm",
            self.exists_action,
            {True: "action", False: END}
        )
        graph.add_edge("action", "llm")
        graph.set_entry_point("llm")
        self.graph = graph.compile()
        self.tools = {t.name: t for t in tools}
        self.model = model.bind_tools(tools)

    # Check if llm requires action 
    def exists_action(self, state: AgentState):
        result = state['messages'][-1]
        return len(result.tool_calls) > 0
    # Run a tool ordered by the model
    def take_action(self, state: AgentState):
        tool_calls = state['messages'][-1].tool_calls
        results = []
        for t in tool_calls:
            print(f"Calling: {t}")
            result = self.tools[t['name']].invoke(t['args'])
            results.append(ToolMessage(tool_call_id=t['id'], name=t['name'], content=str(result)))
        print("Back to the model!")
        return {'messages': results}
    
    # Invokes the current message chain
    def openai_inference(self, state: AgentState):
        messages = state['messages']
        if self.system:
            messages = [SystemMessage(content=self.system)] + messages
        message = self.model.invoke(messages)
        return {'messages': [message]}

In [21]:
prompt = """You are tasked with finding and storing the text content of a given restaurants Menu. 
The menu may be a PDF file that you will be required to scrape, or it will be text on a menu page. 
ONLY scrape menu data, this includes foods that the restaurant offers and DOES NOT include information about the restaurant itself or the menu iteself.
Use the search tool to find the url of the menu, if you cannot find the pdf url directly use the scrape_pdf tool to scrape webpages for the url.
If the menu isn't stored as a PDF, and is instead stored as text on a webpage, use the scrape_text tool to gather the text from the page. 
ONLY use the scrape_text tool if a pdf_menu is unavailable. DO NOT attempt to scrape blank menus. 
Once you have scraped the menu there are no more tasks to be done. 
"""

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
)

model = Model(llm, tools, system=prompt)

In [22]:
messages = ["CRISP Rochester"]
result = model.graph.invoke({"messages": messages})

Calling: {'name': 'tavily_search_results_json', 'args': {'query': 'CRISP Rochester menu'}, 'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'type': 'tool_call'}
Back to the model!
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Lunch-and-Dinner.pdf'}, 'id': 'call_ZGYGfnNSDHNNjRihcg50cgtc', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/menus/'}, 'id': 'call_sS2RXubzjhk8ZHDPXIc0nMii', 'type': 'tool_call'}
Back to the model!
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Lunch-and-Dinner.pdf'}, 'id': 'call_AyQFbL1S9TDzKvE1WjEIEGE2', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Brunch.pdf'}, 'id': 'call_QObR62V6Mqve6kPA9Se9YNNG', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Desserts.pdf'}, 'id': 'call_qZrmIr4zVhtThcl

In [23]:
result

{'messages': ['CRISP Rochester',
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'function': {'arguments': '{"query":"CRISP Rochester menu"}', 'name': 'tavily_search_results_json'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 327, 'total_tokens': 348}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_400f27fa1f', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-971f9afd-2e4f-40d5-9649-f0a04a2be010-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'CRISP Rochester menu'}, 'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'type': 'tool_call'}], usage_metadata={'input_tokens': 327, 'output_tokens': 21, 'total_tokens': 348}),
  ToolMessage(content="[{'url': 'https://crisprochester.com/menus/', 'content': 'CRISP Rochester menus. Lunch, dinner, cocktails, wine, beer, and brunch menus. Skip to content. 819 S. Clinton Ave. Rochester, NY. 14620 +1 (585) 978-

In [17]:
import pygraphviz
Image(model.graph.get_graph().draw_png())

ModuleNotFoundError: No module named 'pygraphviz'

# Datastore

In [None]:
# Tokenize the pdf 
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4o")


In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone()

pc.create_index(
    name="quickstart",
    dimension=5120, # Lamma 2 embedding dim
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)