In [4]:
# Install the third-party packages used by this notebook.
!pip install anthropic
!pip install faiss-cpu
!pip install tiktoken
# transformers is the only package pinned to an exact version here.
!pip install transformers==4.31.0
# NOTE(review): -U may also pull newer dependencies alongside sentence-transformers;
# confirm the transformers pin above still holds after this line runs.
!pip install -U sentence-transformers

Collecting transformers==4.31.0
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Using cached tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Using cached tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

# Loading Data Into Project And Prep

In [41]:
import pandas as pd #library for dataframes
import tiktoken #library to estimate tokens used for each character

import faiss
# FAISS: a library for fast similarity search over dense embedding vectors.
# Any vector database could be used instead; FAISS is used here for demo purposes only.

from anthropic import Anthropic
import numpy as np


In [42]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# Never share your keys publicly and never hard-code them in a notebook cell.
# The key is read from the ANTHROPIC_API_KEY environment variable; if that is
# not set, you are prompted for it (input hidden) so it never lands in the file.
import os
from getpass import getpass

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY") or getpass("Anthropic API key: ")
client_ant = Anthropic(
    api_key=ANTHROPIC_API_KEY,
)

In [44]:
embedding_encoding = "cl100k_base"
max_tokens = 5000

# load & inspect dataset
path = "/content/drive/My Drive/Colab Notebooks/AISC/Reviews.csv" #adjust to your own directory
reviews_df = pd.read_csv(path, index_col=0)

encoding = tiktoken.get_encoding(embedding_encoding) #get token usage estimate
# Keep the first 50 complete rows in file order. No sort by Time is applied,
# so these are not necessarily the most recent reviews.
top_n = 50

reviews_df = reviews_df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
reviews_df = reviews_df.dropna()
# .copy() makes the subset an independent frame, so the column assignments below
# write to it directly instead of to a slice (avoids SettingWithCopyWarning).
reviews_df = reviews_df.iloc[:top_n].copy()
reviews_df["combined"] = (
    "Title: " + reviews_df.Summary.str.strip() + "; Content: " + reviews_df.Text.str.strip()
)

# omit reviews that are too long to embed - OPTIONAL
# NOTE(review): n_tokens is counted with tiktoken's cl100k_base encoding, which is
# not the tokenizer of the SentenceTransformer model used later in this notebook;
# treat it as a rough length estimate only.
reviews_df["n_tokens"] = reviews_df.combined.apply(lambda x: len(encoding.encode(x)))
reviews_df = reviews_df[reviews_df.n_tokens <= max_tokens]

In [45]:
# Sanity check: print (rows, columns), then display the first rows of the prepared frame.
print(reviews_df.shape)
reviews_df.head()

(50, 8)


Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined,n_tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...,64
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...,51
3,1219017600,B000LQOCH0,ABXLMWJIXXAIN,4,"""Delight"" says it all",This is a confection that has been around a fe...,"Title: ""Delight"" says it all; Content: This is...",137
4,1307923200,B000UA0QIQ,A395BORC6FGVXV,2,Cough Medicine,If you are looking for the secret ingredient i...,Title: Cough Medicine; Content: If you are loo...,59
5,1350777600,B006K2ZZ7K,A1UQRSCLF8GW1T,5,Great taffy,Great taffy at a great price. There was a wid...,Title: Great taffy; Content: Great taffy at a ...,50


# Converting Data Into Embeddings And Push Into Vector DB

In [46]:
# Initialize the SentenceTransformer model - embedding model
from sentence_transformers import SentenceTransformer

# Instantiate the embedding model once; every call below reuses this object.
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def get_embedding_st(text):
    """Return the embedding that `sentence_model` produces for `text`."""
    vector = sentence_model.encode(text)
    return vector


# Embed each review's combined title/content string, one row at a time.
# NOTE: despite the column name, these vectors come from the MiniLM
# SentenceTransformer loaded above.
reviews_df['ada_embedding'] = reviews_df.combined.apply(get_embedding_st)

In [47]:
# Preview the frame again, now including the ada_embedding column.
reviews_df.head()

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined,n_tokens,ada_embedding
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...,64,"[-0.049665734, -0.049002055, 0.06873689, -0.00..."
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...,51,"[0.013913844, 0.06850058, 0.01119179, -0.00096..."
3,1219017600,B000LQOCH0,ABXLMWJIXXAIN,4,"""Delight"" says it all",This is a confection that has been around a fe...,"Title: ""Delight"" says it all; Content: This is...",137,"[-0.0743014, 0.0072622425, -0.043986604, 0.074..."
4,1307923200,B000UA0QIQ,A395BORC6FGVXV,2,Cough Medicine,If you are looking for the secret ingredient i...,Title: Cough Medicine; Content: If you are loo...,59,"[-0.013282064, -0.013121256, -0.08531106, 0.00..."
5,1350777600,B006K2ZZ7K,A1UQRSCLF8GW1T,5,Great taffy,Great taffy at a great price. There was a wid...,Title: Great taffy; Content: Great taffy at a ...,50,"[-0.034084786, 0.03781792, 0.108720414, -0.007..."


In [48]:
from sentence_transformers import util

# Store each review's vector as a NumPy array in its own "embedding" column.
reviews_df["embedding"] = reviews_df["ada_embedding"].apply(np.array)

# Stack the per-review vectors into a single float32 matrix (one row per review)
# and scale every row to unit length in place.
embeddings = np.vstack(reviews_df["embedding"].values).astype("float32")
faiss.normalize_L2(embeddings)

# Build a flat (exhaustive) L2 index sized to the vector width and load the
# normalized rows into it. IndexFlatIP is the inner-product alternative.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)


def search_reviews_st(query_text, k=5):
    """Return the `k` reviews most similar to `query_text`.

    The query is embedded with `get_embedding_st` and compared by cosine
    similarity against every vector stored in `reviews_df["embedding"]`.
    NOTE: the FAISS `index` built in this notebook is not consulted here.

    Args:
        query_text: Free-text query to embed.
        k: Maximum number of reviews to return. Values <= 0 give an empty
            result; values above the number of reviews return all of them.

    Returns:
        DataFrame holding the "combined" and "embedding" columns of the
        selected `reviews_df` rows, ordered from most to least similar.
    """
    query_vec = get_embedding_st(query_text)
    query_vec = query_vec.reshape(1, -1)
    faiss.normalize_L2(query_vec)  # scale the query to unit length, in place

    similarities = util.cos_sim(query_vec, np.vstack(reviews_df["embedding"].values)).cpu().numpy()[0]

    # Rank best-first, then truncate. Slicing with `[-k:]` before reversing
    # returned every row for k == 0, because `[-0:]` is the whole array.
    k = max(int(k), 0)
    indices = similarities.argsort()[::-1][:k]

    results = reviews_df.iloc[indices][["combined", "embedding"]]
    return results


# 🔍 Example query
# Runs one search with the default k and prints only the matched review text
# (to_string(index=False) hides the Id values in the printed rows).
query = "This food tastes great and the texture is perfect"
results = search_reviews_st(query)
print(results["combined"].to_string(index=False))

Id
Title: Good Quality Dog Food; Content: I have b...
Title: Wonderful, tasty taffy; Content: This ta...
Title: good; Content: Good oatmeal.  I like the...
Title: Healthy Dog Food; Content: This is a ver...
Title: Very good but next time I won't order th...


# Defining Helper Functions For Agentic RAG

In [65]:
# NOTE: this re-defines search_reviews_st, which an earlier cell already defines;
# whichever definition ran last is the one in effect. Keep them in sync or delete one.
def search_reviews_st(query_text, k=5):
    """Return the `k` reviews most similar to `query_text`.

    The query is embedded with `get_embedding_st` and compared by cosine
    similarity against every vector stored in `reviews_df["embedding"]`.

    Args:
        query_text: Free-text query to embed.
        k: Maximum number of reviews to return. Values <= 0 give an empty
            result; values above the number of reviews return all of them.

    Returns:
        DataFrame holding the "combined" and "embedding" columns of the
        selected `reviews_df` rows, ordered from most to least similar.
    """
    query_vec = get_embedding_st(query_text)
    query_vec = query_vec.reshape(1, -1)
    faiss.normalize_L2(query_vec)  # scale the query to unit length, in place

    similarities = util.cos_sim(query_vec, np.vstack(reviews_df["embedding"].values)).cpu().numpy()[0]

    # Rank best-first, then truncate. Slicing with `[-k:]` before reversing
    # returned every row for k == 0, because `[-0:]` is the whole array.
    k = max(int(k), 0)
    indices = similarities.argsort()[::-1][:k]

    results = reviews_df.iloc[indices][["combined", "embedding"]]
    return results


# Define the tools
# Tool definitions passed as `tools=` to client_ant.messages.create.
# The single tool's name must match the name checked in process_tool_call.
tools = [
    {
        "name": "search_similar_products",
        "description": "Search for products with similar reviews regarding a specific feature",
        # JSON-Schema-style description of the arguments the model must supply.
        "input_schema": {
            "type": "object",
            "properties": {
                "product_name": {
                    "type": "string",
                    "description": "Name of the product extracted from the reviews"
                },
                "feature": {
                    "type": "string",
                    "description": "The specific feature to compare (e.g., texture, flavor)"
                }
            },
            # Both arguments are read unconditionally by process_tool_call.
            "required": ["product_name", "feature"]
        }
    }
]

# Default vocabulary used to decide whether a query concerns food reviews/products.
_DEFAULT_CONTEXT_KEYWORDS = (
    "food", "taste", "flavor", "texture", "spicy", "sweet", "sour",
    "delicious", "restaurant", "meal", "dish", "recipe", "ingredient",
    "snack", "drink", "beverage", "review", "rating", "opinion", "customer",
    "satisfaction", "quality", "price", "value", "packaging", "product",
)


def is_query_in_context(query, keywords=None):
    """Decide whether a query is related to food reviews and products.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (e.g. "rating" matches "operating"). This makes the
    check permissive by design; it is a coarse pre-filter, not a classifier.

    Args:
        query: The user's question. Anything that is not a non-empty string
            is treated as out of context.
        keywords: Optional iterable of keywords to use instead of the built-in
            food/review vocabulary. Empty strings in it are ignored.

    Returns:
        True if any keyword occurs in the query, False otherwise.
    """
    # None, empty strings and non-string inputs cannot match any keyword.
    if not isinstance(query, str) or not query:
        return False

    if keywords is None:
        keywords = _DEFAULT_CONTEXT_KEYWORDS

    # Convert query to lowercase for case-insensitive matching
    query_lower = query.lower()

    return any(keyword and keyword.lower() in query_lower for keyword in keywords)

def process_tool_call(tool_name, tool_input):
    """Run the requested tool and return its output as plain text.

    Args:
        tool_name: Name of the tool to run.
        tool_input: Mapping of tool arguments; for "search_similar_products"
            it must contain "product_name" and "feature".

    Returns:
        The similar-product lines joined by newlines, or the string
        "Tool not found" when `tool_name` is not a known tool.
    """
    # Guard clause: reject anything other than the one supported tool.
    if tool_name != "search_similar_products":
        return "Tool not found"

    search_output = web_search_similar_products(
        tool_input["product_name"],
        tool_input["feature"],
    )
    # The helper returns a list of lines; flatten it into a single string.
    product_lines = search_output["similar_products"]
    return "\n".join(product_lines)

def web_search_similar_products(product_name, feature):
    """Simulate a web search for products with similar reviews.

    No network call is made: the result comes from a fixed in-memory table.
    The category is chosen from keywords in `product_name`; if none match,
    `feature` is checked the same way; otherwise "crunchy snacks" is used.

    Args:
        product_name: Name of the product to find alternatives for.
        feature: The characteristic to compare (e.g. texture, flavor).

    Returns:
        Dict with one key, "similar_products", holding a list of
        "<name>: <description>" strings for the chosen category.
    """
    print(f"Searching for products similar to {product_name} with comparable {feature}...")

    # Mock results - in a real implementation, this would come from the search API
    mock_results = {
        "crunchy snacks": [
            {"name": "PopCorners Chips", "description": "Similar crunch with less oil"},
            {"name": "Harvest Snaps", "description": "Plant-based with satisfying crunch"},
            {"name": "Popcorn Indiana", "description": "Light and airy texture"}
        ],
        "creamy desserts": [
            {"name": "Noosa Yogurt", "description": "Rich and smooth texture"},
            {"name": "Talenti Gelato", "description": "Silky smooth without iciness"},
            {"name": "Halo Top", "description": "Lighter but still creamy"}
        ],
        "spicy sauce": [
            {"name": "Cholula", "description": "Balanced heat with complex flavor"},
            {"name": "Tapatio", "description": "Consistent heat level"},
            {"name": "Sriracha", "description": "Sweet heat with garlic notes"}
        ]
    }

    def _category_for(text):
        """Return the category whose keywords occur in `text`, or None."""
        lowered = str(text or "").lower()
        if any(word in lowered for word in ("dessert", "cream", "yogurt")):
            return "creamy desserts"
        if any(word in lowered for word in ("sauce", "hot")):
            return "spicy sauce"
        return None

    # product_name takes priority, so any name that matched before still maps
    # to the same category; feature is only consulted when the name is silent.
    category = _category_for(product_name) or _category_for(feature) or "crunchy snacks"

    # Format each product as a "<name>: <description>" line
    results = [
        f"{product['name']}: {product['description']}"
        for product in mock_results[category]
    ]

    return {"similar_products": results}


# RAG Query Response

In [73]:
def generate_response_with_rag(query, k=5, model="claude-3-7-sonnet-20250219", max_output_tokens=300):
    """Answer `query` from retrieved reviews, letting the model call tools.

    Steps: reject out-of-scope queries, retrieve the `k` most similar reviews,
    ask the model with those reviews as context, and, if the model requests
    tools, run every requested tool and ask once more with the results.

    Args:
        query: The user's question.
        k: Number of reviews to retrieve as context.
        model: Model name sent with both API requests.
        max_output_tokens: `max_tokens` limit sent with both API requests.

    Returns:
        The text of the first text block in the final model response, or None
        when the query is out of scope or the response holds no text block.
    """

    # Check if query is in context first
    if not is_query_in_context(query):
        print("I'm sorry, but your question appears to be outside the scope of our food review database. Please ask a question related to food reviews or products.")
        return None

    # Retrieve relevant documents
    relevant_docs = search_reviews_st(query, k=k)
    context = "\n---\n".join(relevant_docs["combined"].tolist())

    print('\nRelevant Documents Retrieved From RAG search:')
    print(context)

    # Format the system prompt
    system_prompt = f"""
    You are a helpful assistant that answers user queries based on customer food reviews.

    Use the following context (customer reviews) to answer the question. If the context does not contain the answer, say you don't know.

    Context:
    {context}

    If the user asks about similar products or alternatives with specific characteristics, use the search_similar_products() function.
    """

    # Settings shared by the initial request and the tool follow-up request.
    request_kwargs = {
        "model": model,
        "system": system_prompt,
        "max_tokens": max_output_tokens,
        "tools": tools,
        "temperature": 0.3,
    }

    # Call Claude with tool definition
    message = client_ant.messages.create(
        messages=[{"role": "user", "content": query}],
        **request_kwargs,
    )

    print(f"\nInitial Response:")
    print(f"Stop Reason: {message.stop_reason}")
    print(f"Initial Content: {message.content}")

    # A single response may contain several tool_use blocks; each one needs
    # its own tool_result in the follow-up user message.
    tool_results = []
    if message.stop_reason == "tool_use":
        for tool_use in (block for block in message.content if block.type == "tool_use"):
            tool_name = tool_use.name
            tool_input = tool_use.input

            print(f"\nTool Used: {tool_name}")
            print(f"Tool Input: {tool_input}")

            # Process the tool call
            tool_result = process_tool_call(tool_name, tool_input)
            print(f"Tool Result: {tool_result}")

            tool_results.append(
                {
                    "type": "tool_result",
                    "tool_use_id": tool_use.id,
                    "content": tool_result,
                }
            )

    if tool_results:
        # Create a follow-up message carrying every tool result
        response = client_ant.messages.create(
            messages=[
                {"role": "user", "content": query},
                {"role": "assistant", "content": message.content},
                {"role": "user", "content": tool_results},
            ],
            **request_kwargs,
        )
    else:
        # No tool call, use the original response
        response = message

    # Extract the final text response
    final_response = next(
        (block.text for block in response.content if hasattr(block, "text")),
        None,
    )

    print("------------------------------------")
    print(f"\nFinal Response: {final_response}")
    return final_response

In [69]:
# 🔍 Example usage
# An in-scope question that also asks for alternative products.
user_query = "Are customers happy with the texture of the taffy. Find me products that are desserts with the same texture?"
# NOTE: `model` is not passed to the call below, so this assignment has no effect on it.
model = "claude-3-7-sonnet-20250219"
answer = generate_response_with_rag(user_query)


Relevant Documents Retrieved From RAG search:
Title: Wonderful, tasty taffy; Content: This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!
---
Title: Nice Taffy; Content: I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there was a bit too much red/black licorice-flavored pieces (just not my particular favorites). Between me, my kids, and my husband, this lasted only two weeks! I would recommend this brand of taffy -- it was a delightful treat.
---
Title: Great!  Just as good as the expensive brands!; Content: This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served i

In [74]:
# 🔍 Example usage
# An out-of-scope question: it contains none of the keywords checked by
# is_query_in_context, so the function prints a refusal and returns None.
user_query = "Who won the FIFA world cup"
# NOTE: `model` is not passed to the call below, so this assignment has no effect on it.
model = "claude-3-7-sonnet-20250219"
answer = generate_response_with_rag(user_query)

I'm sorry, but your question appears to be outside the scope of our food review database. Please ask a question related to food reviews or products.
