# Load and Preprocess Food Reviews Dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import faiss

# Load the dataset
df = pd.read_csv("food_reviews_1k.csv")
 
# Display the first few rows of the dataset
df.head()

Unnamed: 0,ProductId,Score,Summary,Text
0,B001E4KFG0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
# Drop rows with missing 'Text' values
df.dropna(subset=['Text'], inplace=True)
 
# Convert text to lowercase and remove punctuation
df['processed_text'] = df['Text'].str.lower().str.replace('[^\w\s]', '', regex=True)
 
# Display the first few rows to verify preprocessing
df.head()

Unnamed: 0,ProductId,Score,Summary,Text,processed_text
0,B001E4KFG0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...
2,B000LQOCH0,4,"""Delight"" says it all",This is a confection that has been around a fe...,this is a confection that has been around a fe...
3,B000UA0QIQ,2,Cough Medicine,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy,Great taffy at a great price. There was a wid...,great taffy at a great price there was a wide...


# Vectorize Text Data and Build a FAISS Index for Similar Product Retrieval

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the review texts
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['processed_text']).toarray()

Create a FAISS index to enable efficient similarity searches based on the vectorized text data. The index will use L2 distance (Euclidean distance) to measure similarity.

In [4]:
# Build the FAISS index
index = faiss.IndexFlatL2(X.shape[1])
index.add(X)

Implement a function retrieve_similar_products(query, k=5) that takes a query string and an optional parameter k (default is 5), vectorizes the query, searches the FAISS index, and returns the most similar products from the dataset.

In [6]:
# Function to retrieve similar products
def retrieve_similar_products(query, k=5):
    """
    Retrieve the top k most similar products to the given query.
 
    Parameters:
    - query (str): The query string for which similar products are to be retrieved.
    - k (int, optional): The number of similar products to retrieve. Default is 5.
 
    Returns:
    - DataFrame: A DataFrame containing the top k most similar products.
    """
    query_vec = vectorizer.transform([query]).toarray()
    distances, indices = index.search(query_vec, k)
    return df.iloc[indices[0]]

In [7]:
# Test the retriever
query = "great quality ham"
retrieve_similar_products(query)

Unnamed: 0,ProductId,Score,Summary,Text,processed_text
392,B001FA1MCO,5,Ham Base,This is a great ham soup base. I have used it...,this is a great ham soup base i have used it ...
393,B001FA1MCO,1,MSG Ham Base,I haven't used the ham base. It is loaded with...,i havent used the ham base it is loaded with m...
852,B0007NG568,5,"Good oatmeal, I'm on my second bag",I'm happy with the quality of the product and ...,im happy with the quality of the product and t...
184,B001KUUNP6,5,Garbonzo Bean Flour,This is great stuff. Made some really tasty ...,this is great stuff made some really tasty b...
113,B0037LW78C,5,the best tea ever... freah bright clean,this has to be one of the best teas I have eve...,this has to be one of the best teas i have eve...


In [9]:
# Example usage
query = "Best Cheese"
similar_products = retrieve_similar_products(query)
similar_products

Unnamed: 0,ProductId,Score,Summary,Text,processed_text
619,B000G6RYNE,5,Delicious!,"At first I was thinking, ""cheese flavored chip...",at first i was thinking cheese flavored chips ...
747,B000QWXG9O,3,Annie's Homegrown Organic Whole Wheat Shells &...,This product is made by Annie's Inc. in Berkel...,this product is made by annies inc in berkely ...
415,B005WU7V00,5,VERY GOOD! Great taste and easy for a single guy!,I was surprised about how good this was my fam...,i was surprised about how good this was my fam...
610,B000G6RYNE,5,Caution: Kettle Chips are addictive!,These really are amazing chips. Don't be put o...,these really are amazing chips dont be put off...
440,B000G6RYNE,3,"Honey Dijon leaves bad aftertaste, NY Cheddar ...","Honey Dijon flavor: okay flavor, but both my g...",honey dijon flavor okay flavor but both my gir...


# Access the performance of the Product Retrieval Function

Assess the performance of the product retrieval function by evaluating the relevance of the retrieved products based on a user-provided query. This involves implementing a function to display the retrieved products, prompting the user to rate their relevance, and calculating an average relevance score. 

The steps include:

    Implementing a function to retrieve and display similar products.

    Prompting the user to assess the relevance of the retrieved products.

    Calculating and displaying the average relevance score.

In [10]:
def assess_retrieval_function(query, k=5):
    """
    Assess the performance of the product retrieval function by evaluating the relevance of the retrieved products.
 
    Parameters:
    - query (str): The query string for which similar products are to be retrieved.
    - k (int, optional): The number of similar products to retrieve and assess. Default is 5.
 
    Returns:
    - dict: A dictionary containing the query and the average relevance score.
    """
    # Retrieve similar products
    similar_products = retrieve_similar_products(query, k)
    
    # Display the top k products with their summaries and texts
    print(f"Query: {query}\n")
    for i, row in similar_products.iterrows():
        print(f"Product {i+1}:")
        print(f"Summary: {row['Summary']}")
        print(f"Text: {row['processed_text']}\n")
    
    # Initialize list to store relevance scores
    relevance_scores = []
    
    # Prompt user for relevance assessment
    for i in range(k):
        relevance = int(input(f"Relevance for Product {i+1} (1-5): "))
        relevance_scores.append(relevance)
    
    # Calculate average relevance score
    avg_relevance = sum(relevance_scores) / k
    
    # Print average relevance score
    print(f"\nAssessment Results for Query '{query}':")
    print(f"Average Relevance: {avg_relevance:.2f}")
    
    # Return the average relevance score
    return {
        "query": query,
        "average_relevance": avg_relevance
    }

In [11]:
# Example usage
query = "great quality ham"
assessment_results = assess_retrieval_function(query)
print(assessment_results)

Query: great quality ham

Product 393:
Summary: Ham Base
Text: this is a great ham soup base  i have used it in ham and beans and seasoned greenbeans  great flavor

Product 394:
Summary: MSG Ham Base
Text: i havent used the ham base it is loaded with msg i did not realize this when i ordered it and can not return it because it is a food item

Product 853:
Summary: Good oatmeal, I'm on my second bag
Text: im happy with the quality of the product and the price like the other reviewer i would prefer if there was a plastic liner to preserve freshness however i will continue to buy this product regardless as it is quality oatmeal at a good pricebr br edit im on my 4th bag quality continues to be high

Product 185:
Summary: Garbonzo  Bean Flour
Text: this  is great stuff  made some really tasty banana bread  good quality and lowest price in town

Product 114:
Summary: the best tea ever... freah bright clean
Text: this has to be one of the best teas i have ever tasted its clean bright freshbr

Relevance for Product 1 (1-5):  2
Relevance for Product 2 (1-5):  3
Relevance for Product 3 (1-5):  0
Relevance for Product 4 (1-5):  0
Relevance for Product 5 (1-5):  0



Assessment Results for Query 'great quality ham':
Average Relevance: 1.00
{'query': 'great quality ham', 'average_relevance': 1.0}


# Advanced Text Preprocessing and Building a FAISS Index for Similar Product Retrieval

Import necessary libraries and download required NLTK resources: punkt, sporwords and wordnet.

Define and apply a preprocessing function to clean and lemmatize the text.

Vectorize the review texts using TF-IDF with bigrams, and normalize the vectors:

Build a FAISS index using the normalized vectors.

Implement a function to retrieve similar products based on a query.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/labsuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/labsuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/labsuser/nltk_data...


True

In [13]:
# Advanced Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
 
def preprocess_text(text):
    # Tokenize
    words = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic tokens
    words = [word for word in words if word.isalpha() and word not in stop_words]
    # Lemmatize
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
 
# Apply preprocessing to the dataframe
df['processed_text'] = df['processed_text'].apply(preprocess_text)

In [14]:
# Vectorize the review texts with enhanced settings
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=2, max_df=0.8)
X = vectorizer.fit_transform(df['processed_text']).toarray()

In [15]:
# Normalize the vectors
X = normalize(X)

In [16]:
# Build the FAISS index
dimension = X.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(X)

In [17]:
# Function to retrieve similar products
def retrieve_similar_products(query, k=5):
    """
    Retrieve the top k most similar products to the given query.
 
    Parameters:
    - query (str): The query string for which similar products are to be retrieved.
    - k (int, optional): The number of similar products to retrieve. Default is 5.
 
    Returns:
    - DataFrame: A DataFrame containing the top k most similar products.
    """
    query_processed = preprocess_text(query)
    query_vec = vectorizer.transform([query_processed]).toarray()
    query_vec = normalize(query_vec)
    distances, indices = index.search(query_vec, k)
    return df.iloc[indices[0]]

# Test and Assess the improved Product Retrieval Function

In [18]:
# Test the retriever with an example query "great quality ham"
query = "great quality ham"
similar_products = retrieve_similar_products(query)
 
# Display the summary and processed text of the retrieved products
print(similar_products[['Summary', 'processed_text']])

                           Summary  \
392                       Ham Base   
338            Tasty, tasty tasty!   
631         Love the smaller bags!   
393                   MSG Ham Base   
983  Best matcha quality and price   

                                        processed_text  
392  great ham soup base used ham bean seasoned gre...  
338  like lot sesame oil use salad regularly great ...  
631  yo kettle chip addict happy find snack size ba...  
393  havent used ham base loaded msg realize ordere...  
983  previously tried matcha product generally sati...  


In [19]:
# Assess the retrieval function for the query "great quality ham"
query = "great quality ham"
assessment_results = assess_retrieval_function(query)
 
# Print the assessment results
print(assessment_results)

Query: great quality ham

Product 393:
Summary: Ham Base
Text: great ham soup base used ham bean seasoned greenbeans great flavor

Product 339:
Summary: Tasty, tasty tasty!
Text: like lot sesame oil use salad regularly great quality flavor aroma cant beat organic

Product 632:
Summary: Love the smaller bags!
Text: yo kettle chip addict happy find snack size bag lunch great quality much cheaper store definitely buying

Product 394:
Summary: MSG Ham Base
Text: havent used ham base loaded msg realize ordered return food item

Product 984:
Summary: Best matcha quality and price
Text: previously tried matcha product generally satisfied quality price decided give pure matcha green rooibos try price seemed competitive quality pure matchas product surpasses others tried stay fresh consistent texture taste great quality price better others purchased quantity highly recommend pure matcha continue purchase



Relevance for Product 1 (1-5):  5
Relevance for Product 2 (1-5):  0
Relevance for Product 3 (1-5):  0
Relevance for Product 4 (1-5):  3
Relevance for Product 5 (1-5):  0



Assessment Results for Query 'great quality ham':
Average Relevance: 1.60
{'query': 'great quality ham', 'average_relevance': 1.6}


# Generate product descriptions from Multiple Reviews Using OpenAI's GPT-3.5 Turbo

In [20]:
from openai import OpenAI
 
client = OpenAI()

In [21]:
def generate_description_from_reviews(reviews):
    """
    Generate a product description based on multiple reviews.
 
    Parameters:
    - reviews (list): A list of review strings.
 
    Returns:
    - str: A generated product description.
    """
    combined_reviews = " ".join(reviews)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Generate a product description based on the following reviews: {combined_reviews}"}
        ],
        max_tokens=400,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response.choices[0].message.content

In [22]:
# Test the function with a sample review
sample_reviews = ["The cheesiest cheese I have ever found."]
description = generate_description_from_reviews(sample_reviews)
print(description)

Introducing our newest product: Super Cheesy Cheese! Prepare to indulge in the cheesiest cheese you have ever tasted. Perfect for cheese lovers who can't get enough of that rich, gooey goodness. Add a burst of flavor to your favorite dishes with this irresistible cheese. Get ready to elevate your meals to a whole new level of cheesiness with our Super Cheesy Cheese!


# Generate Product Recommendations Using a Retrieval Augmented Generation (RAG) System

Define a function to generate product recommendations using a Retrieval-Augmented Generation (RAG) system. This involves retrieving similar products based on a query, extracting their reviews, and generating a combined product description using OpenAI's GPT-3.5 Turbo model. 

In [23]:
def generate_recommendation(query, k=5):
    """
    Generate a product recommendation based on a query using a RAG system.
 
    Parameters:
    - query (str): The query string for retrieving similar products.
    - k (int, optional): The number of similar products to retrieve. Default is 5.
 
    Returns:
    - str: A generated product description based on the retrieved reviews.
    """
    # Retrieve similar products
    similar_products = retrieve_similar_products(query, k)
    # Extract the reviews of the retrieved products
    reviews = similar_products['processed_text'].tolist()
    # Generate a combined product description from the reviews
    description = generate_description_from_reviews(reviews)
    return description

In [24]:
# Test the RAG system with an example query "high quality ham"
query = "high quality ham"
recommendations = generate_recommendation(query)
print(recommendations)

Introducing our premium Ham and Bean Soup Base, a must-have for every kitchen! Made with seasoned green beans and a flavorful blend of high-quality ingredients, this soup base promises a delicious and hearty meal every time. Our customers rave about the rich flavor and top-notch quality of this product, making it a surefire hit at any party or gathering.

Not only is our Ham and Bean Soup Base perfect for creating a comforting and satisfying soup, but it also serves as a versatile ingredient. Use it to enhance the taste of your favorite dishes, from stews to casseroles. With a hint of chocolate liquor and a touch of lemon juice, this soup base adds a unique twist to your culinary creations.

Customers love the convenience and value of our product, noting that it's a great buy for its quality and taste. Whether you're a seasoned chef or a novice cook, our Ham and Bean Soup Base is sure to impress. So why wait? Elevate your meals with our exceptional soup base and experience the differen

# Evaluate Product Recommendations Using Bleu and Rouge Scores

In [25]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

def calculate_bleu(reference, candidate):
    """
    Calculate BLEU score with smoothing.
 
    Parameters:
    - reference (str): The ground truth description.
    - candidate (str): The generated description.
 
    Returns:
    - float: The BLEU score.
    """
    reference_tokens = nltk.word_tokenize(reference)
    candidate_tokens = nltk.word_tokenize(candidate)
    smoothie = SmoothingFunction().method4  # Using method4 for smoothing
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)

In [26]:
def calculate_rouge(reference, candidate):
    """
    Calculate ROUGE score.
 
    Parameters:
    - reference (str): The ground truth description.
    - candidate (str): The generated description.
 
    Returns:
    - dict: A dictionary containing ROUGE-1, ROUGE-2, and ROUGE-L scores.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

In [27]:
def evaluate_recommendations(recommendations, ground_truths):
    """
    Evaluate the generated recommendations using BLEU and ROUGE scores.
 
    Parameters:
    - recommendations (list): A list of generated product descriptions.
    - ground_truths (list): A list of ground truth product descriptions.
 
    Returns:
    - tuple: A tuple containing the average BLEU score and a dictionary of average ROUGE scores.
    """
    bleu_scores = []
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for rec, gt in zip(recommendations, ground_truths):
        bleu_scores.append(calculate_bleu(gt, rec))
        
        rouge = calculate_rouge(gt, rec)
        rouge_scores['rouge1'].append(rouge['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(rouge['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(rouge['rougeL'].fmeasure)
    
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {key: sum(value) / len(value) for key, value in rouge_scores.items()}
    
    return avg_bleu, avg_rouge

# Evaluate RAG model

In [28]:
# Example ground truth data
ground_truths = [
    "Our premium ham is crafted from the finest cuts, offering a tender and flavorful experience with every bite. Perfect for gourmet sandwiches, festive dinners, and special occasions.",
    "Organic cat food made with high-quality ingredients, ensuring a balanced diet for your feline friends. Rich in essential nutrients and free from artificial additives.",
    "Discover the best dog food formulated with real meat and vegetables. Provides complete nutrition for your dog's health and vitality, without any fillers or preservatives.",
    "Enjoy gluten-free snacks that are both delicious and healthy. Made with natural ingredients, these snacks are perfect for those with dietary restrictions.",
    "Low carb protein bars packed with essential nutrients and great taste. Ideal for on-the-go energy and maintaining a healthy diet without compromising flavor.",
    # Add more ground truth descriptions for other queries as needed...
]
 
# Example queries corresponding to the ground truths
queries = [
    "high quality ham",
    "organic cat food",
    "best dog food",
    "gluten-free snacks",
    "low carb protein bars",
    # Add more queries corresponding to the ground truths...
]

In [29]:
# Test the RAG system and generate recommendations
recommendations = [generate_recommendation(query) for query in queries]
print(recommendations)

["Introducing our Gourmet Soup and Beverage Party Pack, a must-have for any gathering or individual looking to elevate their culinary creations. \n\nCrafted with the finest ingredients, our ham soup base is the secret to creating a rich and flavorful ham and bean soup that will leave your guests craving for more. Paired with seasoned green beans, this soup base promises a delicious and satisfying meal experience.\n\nBut wait, there's more! Our party pack also includes a decadent chocolate liqueur cup that is sure to be a big hit at any celebration. Made with high-quality ingredients, this indulgent treat is a crowd pleaser and adds a touch of luxury to any event.\n\nFor those looking to add a refreshing twist to their beverages, our lemon juice supplement is a game-changer. Made from the freshest lemons, this product guarantees a burst of citrus flavor that will elevate your cocktails and refreshments.\n\nWith a focus on quality and customer satisfaction, our products are carefully cur

In [30]:
# Evaluate the recommendations
avg_bleu, avg_rouge = evaluate_recommendations(recommendations, ground_truths)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE Scores: {avg_rouge}")

Average BLEU Score: 0.01167168312869522
Average ROUGE Scores: {'rouge1': 0.12588040353127553, 'rouge2': 0.047828547393028886, 'rougeL': 0.09173841625691373}
