# Setup

In [4]:
from dotenv import load_dotenv
env = load_dotenv()

In [5]:
from langgraph.graph import StateGraph, END
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_openai import ChatOpenAI
from IPython.display import Image
import IPython

# Tools

In [6]:
from typing import TypedDict, Annotated, Dict, Any
import operator

class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], operator.add]

In [7]:
# Download PDF tool
from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.pydantic_v1 import BaseModel, Field
from bs4 import BeautifulSoup
import tiktoken
import requests

# Search for relevant menu
search_tool = TavilySearchResults()


# Scrape menu data
class ScrapeInput(BaseModel):
    url: str = Field(description="the URL of the menu page")

@tool("scrape_pdf", args_schema=ScrapeInput, return_direct=True)
def scrape_pdf(url: str):
    """Scrape a webpage that may include links to a restaurants current menu and return the links"""
    pdf_links = []
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')
        pdf_links = [link.get('href') for link in links if link.get('href').endswith('.pdf')]
    except Exception as e:
        print(f"failed to scrape {url} ERROR: {e}")
    return pdf_links

@tool("scrape_text", args_schema=ScrapeInput, return_direct=True)
def scrape_text(url: str):
    """Scrape the text directly from a website"""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        # Get text
        text = soup.get_text(separator=' ')
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        print(f"failed to scrape {url} ERROR: {e}")
        return None
    return text

In [74]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Tokenize the pdf 
class tokenizerInput(BaseModel):
    pdf_url: str = Field(description="The url of the PDF file that must be tokenized")


@tool("tokenize-pdf", args_schema=tokenizerInput, return_direct=True)
def tokenize_pdf():
    """   """
    # Split the input text 
    text_splitter_semantic = SemanticChunker(
        OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
    )

    # Save to vector DB with formatted metada
    pass

In [None]:
# Text Splitters

In [9]:
%pip install PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [89]:
from PyPDF2 import PdfReader

def read_pdf(file_path) -> str:
    output = []
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                output.append(page.extract_text())

        return output
        
    except Exception as e:
        print(f"Error reading PDF file: {e}")

# Replace 'your_pdf_file.pdf' with the path to your PDF file
file = read_pdf('pdf/Mondo_Catering.pdf')

In [24]:
print(file)

['All orders must be placed at least 24 hrs in advance (allow 48 hrs for large pasta orders).\n Upon request, we can supply paper plates, napkins, forks and knives.\nMONDO 10 Main St. Middletown, CT - 860-343-3300  - mondokeith @gmail.comEGGPLANT FRIES   Half Tray. 24.95   Full Tray. 45\nHOUSE or CAESAR SALAD   1/2 Tray. 29.95 Full Tray. 59.95\nSHERRY, TOSCANO or ARUGULA SALAD  1/2 Tray 34.95 Full Tray 68.95\nMISTO PLATTER   Sliced Sopressata, prosciutto, marinated artichokes, \nfresh mozzarella, roasted peppers, olives and polenta croutons. 84.95\n*!/2 trays are priced to feed 10 people, Full trays feed 20\nBAKED ZITI   Half Tray. 29.95   w/Cheese. 34.95    Full Tray. 60    w/Cheese. 68.95\nMONDO MEATBALLS   2.95 per meatball  \nCHICKEN PARM   Half Tray (10 pieces). 45.95   Full Tray (20 pieces). 94.95\nBOLOGNESE   Our own slow cooked Bognese Half Tray. 54.95   Full Tray. 94.95\nBAKED CHICKEN PENNE / OR / VEGETABLE PENNE   \nHalf Tray of chicken. 45.95   Full Tray. 94.95     Half Tray

In [42]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter_recursive = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

In [61]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Split the input text 
text_splitter_semantic = SemanticChunker(
    OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
)

In [67]:
split_text_sem = text_splitter_semantic.create_documents(file)

In [86]:
split_text_sem

[Document(page_content='All orders must be placed at least 24 hrs in advance (allow 48 hrs for large pasta orders). Upon request, we can supply paper plates, napkins, forks and knives.'),
 Document(page_content='MONDO 10 Main St. Middletown, CT - 860-343-3300  - mondokeith @gmail.comEGGPLANT FRIES   Half Tray. 24.95   Full Tray.'),
 Document(page_content='45\nHOUSE or CAESAR SALAD   1/2 Tray. 29.95 Full Tray. 59.95\nSHERRY, TOSCANO or ARUGULA SALAD  1/2 Tray 34.95 Full Tray 68.95\nMISTO PLATTER   Sliced Sopressata, prosciutto, marinated artichokes, \nfresh mozzarella, roasted peppers, olives and polenta croutons. 84.95\n*!/2 trays are priced to feed 10 people, Full trays feed 20\nBAKED ZITI   Half Tray. 29.95   w/Cheese. 34.95    Full Tray. 60    w/Cheese. 68.95\nMONDO MEATBALLS   2.95 per meatball  \nCHICKEN PARM   Half Tray (10 pieces). 45.95   Full Tray (20 pieces). 94.95\nBOLOGNESE   Our own slow cooked Bognese Half Tray. 54.95   Full Tray. 94.95\nBAKED CHICKEN PENNE / OR / VEGETAB

In [69]:
len(split_text_sem)

3

In [70]:
split_text_rec = text_splitter_recursive.create_documents(file)

In [87]:
split_text_rec

[Document(page_content='All orders must be placed at least 24 hrs in advance (allow 48 hrs for large pasta orders).\n Upon request, we can supply paper plates, napkins, forks and knives.'),
 Document(page_content='MONDO 10 Main St. Middletown, CT - 860-343-3300  - mondokeith @gmail.comEGGPLANT FRIES   Half Tray. 24.95   Full Tray. 45\nHOUSE or CAESAR SALAD   1/2 Tray. 29.95 Full Tray. 59.95'),
 Document(page_content='SHERRY, TOSCANO or ARUGULA SALAD  1/2 Tray 34.95 Full Tray 68.95\nMISTO PLATTER   Sliced Sopressata, prosciutto, marinated artichokes,'),
 Document(page_content='fresh mozzarella, roasted peppers, olives and polenta croutons. 84.95\n*!/2 trays are priced to feed 10 people, Full trays feed 20'),
 Document(page_content='BAKED ZITI   Half Tray. 29.95   w/Cheese. 34.95    Full Tray. 60    w/Cheese. 68.95\nMONDO MEATBALLS   2.95 per meatball  \nCHICKEN PARM   Half Tray (10 pieces). 45.95   Full Tray (20 pieces). 94.95'),
 Document(page_content='BOLOGNESE   Our own slow cooked B

In [72]:
len(split_text_rec)

9

In [90]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4o")

enc.encode(file[0])

[2594,
 12528,
 2804,
 413,
 12989,
 540,
 5153,
 220,
 1494,
 57225,
 306,
 14805,
 350,
 10180,
 220,
 3519,
 57225,
 395,
 4410,
 33847,
 12528,
 6294,
 46046,
 2616,
 11,
 581,
 665,
 10498,
 6651,
 36206,
 11,
 12739,
 23741,
 11,
 150664,
 326,
 90362,
 558,
 33899,
 11492,
 220,
 702,
 7956,
 901,
 13,
 75313,
 1347,
 940,
 11,
 24286,
 533,
 220,
 34216,
 12,
 28736,
 12,
 17845,
 15,
 220,
 533,
 20179,
 4718,
 437,
 759,
 159625,
 1136,
 21389,
 99563,
 4769,
 24595,
 9092,
 256,
 40150,
 124083,
 13,
 220,
 1494,
 13,
 4129,
 256,
 11689,
 124083,
 13,
 220,
 2548,
 198,
 161713,
 503,
 13180,
 1759,
 1312,
 58420,
 2416,
 256,
 220,
 16,
 14,
 17,
 124083,
 13,
 220,
 2270,
 13,
 4129,
 11689,
 124083,
 13,
 220,
 4621,
 13,
 4129,
 198,
 15403,
 132354,
 11,
 353,
 183060,
 63114,
 503,
 9232,
 6349,
 75328,
 58420,
 2416,
 220,
 220,
 16,
 14,
 17,
 124083,
 220,
 3020,
 13,
 4129,
 11689,
 124083,
 220,
 4625,
 13,
 4129,
 198,
 44,
 6258,
 46,
 14998,
 1228,
 6147,
 256

In [17]:
tools = [search_tool, scrape_pdf, scrape_text]

# RAG Model

In [18]:
class Model:
    # Define the model
    def __init__(self, model, tools, system=""):
        self.system = system
        graph = StateGraph(AgentState)
        graph.add_node("llm", self.openai_inference)
        graph.add_node("action", self.take_action)
        graph.add_conditional_edges(
            "llm",
            self.exists_action,
            {True: "action", False: END}
        )
        graph.add_edge("action", "llm")
        graph.set_entry_point("llm")
        self.graph = graph.compile()
        self.tools = {t.name: t for t in tools}
        self.model = model.bind_tools(tools)

    # Check if llm requires action 
    def exists_action(self, state: AgentState):
        result = state['messages'][-1]
        return len(result.tool_calls) > 0
    # Run a tool ordered by the model
    def take_action(self, state: AgentState):
        tool_calls = state['messages'][-1].tool_calls
        results = []
        for t in tool_calls:
            print(f"Calling: {t}")
            result = self.tools[t['name']].invoke(t['args'])
            results.append(ToolMessage(tool_call_id=t['id'], name=t['name'], content=str(result)))
        print("Back to the model!")
        return {'messages': results}
    
    # Invokes the current message chain
    def openai_inference(self, state: AgentState):
        messages = state['messages']
        if self.system:
            messages = [SystemMessage(content=self.system)] + messages
        message = self.model.invoke(messages)
        return {'messages': [message]}

In [21]:
prompt = """You are tasked with finding and storing the text content of a given restaurants Menu. 
The menu may be a PDF file that you will be required to scrape, or it will be text on a menu page. 
ONLY scrape menu data, this includes foods that the restaurant offers and DOES NOT include information about the restaurant itself or the menu iteself.
Use the search tool to find the url of the menu, if you cannot find the pdf url directly use the scrape_pdf tool to scrape webpages for the url.
If the menu isn't stored as a PDF, and is instead stored as text on a webpage, use the scrape_text tool to gather the text from the page. 
ONLY use the scrape_text tool if a pdf_menu is unavailable. DO NOT attempt to scrape blank menus. 
Once you have scraped the menu there are no more tasks to be done. 
"""

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
)

model = Model(llm, tools, system=prompt)

In [22]:
messages = ["CRISP Rochester"]
result = model.graph.invoke({"messages": messages})

Calling: {'name': 'tavily_search_results_json', 'args': {'query': 'CRISP Rochester menu'}, 'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'type': 'tool_call'}
Back to the model!
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Lunch-and-Dinner.pdf'}, 'id': 'call_ZGYGfnNSDHNNjRihcg50cgtc', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/menus/'}, 'id': 'call_sS2RXubzjhk8ZHDPXIc0nMii', 'type': 'tool_call'}
Back to the model!
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Lunch-and-Dinner.pdf'}, 'id': 'call_AyQFbL1S9TDzKvE1WjEIEGE2', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Brunch.pdf'}, 'id': 'call_QObR62V6Mqve6kPA9Se9YNNG', 'type': 'tool_call'}
Calling: {'name': 'scrape_pdf', 'args': {'url': 'https://crisprochester.com/wp-content/uploads/Desserts.pdf'}, 'id': 'call_qZrmIr4zVhtThcl

In [23]:
result

{'messages': ['CRISP Rochester',
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'function': {'arguments': '{"query":"CRISP Rochester menu"}', 'name': 'tavily_search_results_json'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 327, 'total_tokens': 348}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_400f27fa1f', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-971f9afd-2e4f-40d5-9649-f0a04a2be010-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'CRISP Rochester menu'}, 'id': 'call_EqleKnWd1aZvVD5j327kYkOM', 'type': 'tool_call'}], usage_metadata={'input_tokens': 327, 'output_tokens': 21, 'total_tokens': 348}),
  ToolMessage(content="[{'url': 'https://crisprochester.com/menus/', 'content': 'CRISP Rochester menus. Lunch, dinner, cocktails, wine, beer, and brunch menus. Skip to content. 819 S. Clinton Ave. Rochester, NY. 14620 +1 (585) 978-

In [17]:
import pygraphviz
Image(model.graph.get_graph().draw_png())

ModuleNotFoundError: No module named 'pygraphviz'

# Datastore

In [None]:
# Tokenize the pdf 
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4o")


In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone()

pc.create_index(
    name="quickstart",
    dimension=5120, # Lamma 2 embedding dim
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)