In [None]:
!pip install tiktoken
!pip install kagglehub
!pip install openai
!pip install faiss-cpu

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m48.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: 

In [None]:
import os  # standard library: used to read OPENAI_API_KEY from the environment

import pandas as pd #library for dataframes
import tiktoken #library to estimate tokens used for each character

import faiss
# a library that allows developers to quickly search for embeddings of
# multimedia documents that are similar to each other. Can use any kind of Vector DB
# only used for demo purposes

from openai import OpenAI
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the dataset path used later
# (under "/content/drive/My Drive/...") can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Never share your keys publicly. The key is read from the OPENAI_API_KEY
# environment variable (populate it from a .env file or your platform's secret
# store). The "XXX" placeholder is only a fallback for demo purposes - do not
# replace it with a real key in a notebook you intend to share.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "XXX"
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
embedding_encoding = "cl100k_base"
max_tokens = 5000  # the maximum for text-embedding-3-small is 8191

# load & inspect dataset
path = "/content/drive/My Drive/Colab Notebooks/AISC/Reviews.csv" #adjust to your own directory
reviews_df = pd.read_csv(path, index_col=0)

encoding = tiktoken.get_encoding(embedding_encoding) # tokenizer used to count tokens per review
top_n = 50 # number of reviews to keep (the LAST top_n rows, in file order)
# NOTE: the rows are never sorted by "Time", so the kept reviews are the last
# rows of the file, not necessarily the most recent ones. Insert
# .sort_values("Time") before .tail(top_n) below if recency matters.

# keep only the columns we need and drop rows with any missing value
reviews_df = reviews_df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
reviews_df = reviews_df.dropna()
# single text field (title + body) that will be embedded
reviews_df["combined"] = (
    "Title: " + reviews_df.Summary.str.strip() + "; Content: " + reviews_df.Text.str.strip()
)

# omit reviews that are too long to embed - OPTIONAL
reviews_df["n_tokens"] = reviews_df.combined.apply(lambda x: len(encoding.encode(x)))
# .copy() gives an independent frame: later cells add columns to reviews_df, and
# doing that on a slice of another frame is what pandas' SettingWithCopyWarning
# guards against.
reviews_df = reviews_df[reviews_df.n_tokens <= max_tokens].tail(top_n).copy()

In [None]:
# Sanity check: print the (rows, columns) shape, then display the first rows
# of the filtered frame as the cell output.
print(reviews_df.shape)
reviews_df.head()

(50, 8)


Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined,n_tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
568405,1350172800,B005OTVL8C,A1UJMVE0LCOI45,5,great flavor boost,If you want to add a hint of Meyer lemon flavo...,Title: great flavor boost; Content: If you wan...,86
568406,1274918400,B000P56I7Y,A2ZKNGL20FBK2I,5,Arrived Fine,The product arrived pretty quickly and was eas...,Title: Arrived Fine; Content: The product arri...,42
568407,1336867200,B0039KE8Y2,A392FNHBJR1IY6,4,great taste,This apple butter has a great taste but the pr...,Title: great taste; Content: This apple butter...,39
568408,1291420800,B0018CLWM4,ANKM1RMQ4RKQ6,5,Premium Edge Dry Cat Food for Kitten,My 6 month old male Tuxedo cat likes Premium E...,Title: Premium Edge Dry Cat Food for Kitten; C...,170
568409,1310515200,B0018CLWM4,AJGOF4W50ZNB4,5,Premium Edge Kitten Food,This is a good food with decent ingredients & ...,Title: Premium Edge Kitten Food; Content: This...,82


In [None]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
# Generate embeddings using OpenAI
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    The text is sent as a one-element batch through the module-level
    ``client``; the ``embedding`` field of the first (and only) item in the
    response is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first response item.
    """
    response = client.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

# Embed each review's combined text (one get_embedding call per row) and store
# the resulting vectors in a new column.
reviews_df["ada_embedding"] = reviews_df["combined"].apply(
    get_embedding, model="text-embedding-3-small"
)

In [None]:
# Preview the first rows again to confirm the "ada_embedding" column was added.
reviews_df.head()

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined,n_tokens,ada_embedding
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
568405,1350172800,B005OTVL8C,A1UJMVE0LCOI45,5,great flavor boost,If you want to add a hint of Meyer lemon flavo...,Title: great flavor boost; Content: If you wan...,86,"[0.025893114507198334, 0.0037613348104059696, ..."
568406,1274918400,B000P56I7Y,A2ZKNGL20FBK2I,5,Arrived Fine,The product arrived pretty quickly and was eas...,Title: Arrived Fine; Content: The product arri...,42,"[-0.004842213820666075, 0.003409644588828087, ..."
568407,1336867200,B0039KE8Y2,A392FNHBJR1IY6,4,great taste,This apple butter has a great taste but the pr...,Title: great taste; Content: This apple butter...,39,"[0.0013334699906408787, -0.025772497057914734,..."
568408,1291420800,B0018CLWM4,ANKM1RMQ4RKQ6,5,Premium Edge Dry Cat Food for Kitten,My 6 month old male Tuxedo cat likes Premium E...,Title: Premium Edge Dry Cat Food for Kitten; C...,170,"[0.014264424331486225, -0.030544856563210487, ..."
568409,1310515200,B0018CLWM4,AJGOF4W50ZNB4,5,Premium Edge Kitten Food,This is a good food with decent ingredients & ...,Title: Premium Edge Kitten Food; Content: This...,82,"[0.0008219669107347727, -0.03714308142662048, ..."


In [None]:
# Turn each stored embedding into a NumPy array, kept in its own column.
reviews_df["embedding"] = reviews_df["ada_embedding"].apply(np.array)

# Step 1: create an exact (flat) L2 index sized to the embedding dimension.
# Because the vectors are scaled to unit length below, ranking by L2 distance
# gives the same order as ranking by cosine similarity.
embedding_dim = reviews_df["embedding"].iloc[0].shape[0]
index = faiss.IndexFlatL2(embedding_dim)

# Step 2: stack the per-row vectors into one float32 matrix, normalize every
# row to unit length in place, and load the matrix into the index.
embeddings = np.vstack(reviews_df["embedding"].tolist()).astype("float32")
faiss.normalize_L2(embeddings)
index.add(embeddings)

# Step 3: Define a search function
def search_reviews(query_text, k=5):
    """Return the reviews whose embeddings are closest to ``query_text``.

    The query is embedded with ``get_embedding``, normalized the same way as
    the indexed vectors, and looked up in the module-level FAISS ``index``.

    Args:
        query_text: Free-text query to embed and search for.
        k: Maximum number of reviews to return. It is capped at the number of
            vectors held by the index.

    Returns:
        A DataFrame with the "combined" and "embedding" columns of the matching
        rows of ``reviews_df``, nearest first. It has fewer than ``k`` rows
        when the index holds fewer vectors, and is empty when nothing can be
        retrieved.
    """
    # Embed the query and shape it as a single-row float32 matrix for the search.
    query_vec = np.array(get_embedding(query_text), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_vec)  # in place; matches the normalization of the indexed vectors

    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k < 1:
        return reviews_df.iloc[[]][["combined", "embedding"]]

    _, indices = index.search(query_vec, k)

    # FAISS fills missing results with the label -1. Passing -1 to .iloc would
    # silently select the LAST review, so drop those entries before the lookup.
    hits = indices[0]
    hits = hits[hits >= 0]
    return reviews_df.iloc[hits][["combined", "embedding"]]


# 🔍 Example query: retrieve the reviews closest to a free-text query (default k)
# and print only their combined "Title/Content" text, without the index column.
query = "This food tastes great and the texture is perfect"
results = search_reviews(query)
print(results["combined"].to_string(index=False))

Id
Title: Great; Content: This is the best brand o...
Title: Will not do without; Content: Great for ...
Title: Great Cafe Latte; Content: This product ...
Title: Exotic food; Content: This is always a g...
Title: Great For Fast Gulasch!; Content: Quick ...


In [None]:
def generate_response_with_rag(query, k=5, model="gpt-3.5-turbo"):
    """Answer ``query`` with a chat model, grounded in retrieved reviews.

    Args:
        query: The user's question.
        k: Number of reviews to retrieve and place in the prompt as context.
        model: Name of the chat completion model to call.

    Returns:
        The message content of the first choice in the chat completion response.
    """
    # Step 1: Retrieve relevant reviews and join them with a visible separator.
    relevant_docs = search_reviews(query, k=k)
    context = "\n---\n".join(relevant_docs["combined"].tolist())

    # Step 2: Format the prompt.
    # It is assembled from adjacent string literals rather than an indented
    # triple-quoted block, so source-code indentation (tabs, leading/trailing
    # blank lines) is not sent to the model as part of the prompt.
    # NOTE(review): review text is untrusted input and is inserted verbatim.
    prompt = (
        "You are a helpful assistant that answers user queries based on customer food reviews.\n"
        "\n"
        "Use the following context (customer reviews) to answer the question. "
        "If the context does not contain the answer, say you don't know.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "Answer:"
    )

    # Step 3: Call OpenAI Chat Completion API
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=300 #set according to your needs
    )

    return response.choices[0].message.content

# 🔍 Example usage: run the full retrieve-then-generate pipeline for one
# question (default k and model) and print the question next to the answer.
user_query = "Are customers happy with the food's texture?"
answer = generate_response_with_rag(user_query)
print("Q:", user_query)
print("A:", answer)


Q: Are customers happy with the food's texture?
A: Based on the customer reviews provided, it seems that customers are generally happy with the food's texture. The first review mentions that the kibble size was reasonable and easy on a sensitive stomach. The second review mentions that the customer loves hearing the kittens crunching on the dry food. The fourth review mentions that the product tasted very good. Overall, it appears that customers are satisfied with the texture of the food.
