In [2]:
import pandas as pd
import numpy as np

# Reading the products CSV file

In [4]:
# Load the product catalogue; the bare name on the last line renders the frame as the cell output.
PRODUCTS_CSV = 'products.csv'
df = pd.read_csv(PRODUCTS_CSV)
df

Unnamed: 0,title,brand,categories,description,tags,image-link,site-link
0,Wooden Styling Comb,Eco Living,"['Haircare', 'Brushes & Combs']",A beautiful beech wood comb with rounded teeth...,"['Natural', 'Plastic Free', 'Biodegradable', '...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/woo...
1,Organic Linen Reusable Coffee Filters No 4 – 2...,Marley's Monsters,"['Kitchen', 'Kitchen Essentials']",Reusable coffee filters in the cone style by M...,"['Vegan', 'Natural', 'Plastic Free', 'Biodegra...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/org...
2,Bamboo Toothbrush Soft Bristles – 4 Pack,Bambaw,"['Bathroom', 'Toothbrushes']",A family pack of 4 sustainably sourced bamboo ...,"['Vegan', 'Sustainable']",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bam...
3,Wide Neck Baby Glass Bottle With Sleeve – Seaf...,Hevea,"['Mama & Baby', 'Baby Bottles']","HEVEA Wide Neck Baby Glass Bottle, the first D...","['Vegan', 'Palm Oil Free', 'Plastic Free', 'Re...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/wid...
4,Eye Time – Caffeinated Probiotic Eye Cream,Awake Organics,"['Skincare', 'Eye Creams']",Eye Time is a lightweight eye cream that glide...,"['Vegan', 'Natural', 'Recyclable', 'Handmade',...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/eye...
...,...,...,...,...,...,...,...
2591,Beeswax Wraps – Sandwich & Big Bowl 2 Pack – I...,Honey Bee Good,['Kitchen'],A pack of 2 beeswax wraps made with 100% certi...,"['Natural', 'Plastic Free', 'Biodegradable', '...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bee...
2592,Pure Linen Peg Bag – Stripes,Helen Round,"['For The Home', 'All Laundry', 'Laundry Acces...","This practical peg bag, made from beautifully ...","['Plastic Free', 'Handmade', 'Sustainable', 'M...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/pur...
2593,Lemongrass & Tea Tree Soap Bar – 100g,Wild Sage & Co,"['Bathroom', 'Soap Bars', 'Hand & Body Soap Ba...","Wild Sage & Co lemongrass and tea tree, a fres...","['Vegan', 'Natural', 'Plastic Free', 'Biodegra...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/lem...
2594,Peppa Pig Wash Bar with Rose Water & Aloe Vera...,Good Bubble,"['Mama & Baby', 'Baby Skincare']","A super gently baby hair and body wash bar, su...","['Vegan', 'Natural', 'Plastic Free', 'Handmade...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bab...


# Combine text fields into a single one

In [5]:
import ast # to convert the string lists in the dataframe to actual lists

# Pick one row of the catalogue to use as a worked example below
EXAMPLE_ROW = 27
item = df.iloc[EXAMPLE_ROW]

def combine_product_info(item):
    """
    Combine product information into a single string.

    Args:
        item: A product record indexable by column name (a DataFrame row or
            a plain dict) with the keys 'title', 'brand', 'categories',
            'tags' and 'description'. 'categories' and 'tags' must hold a
            Python list literal stored as a string, e.g. "['Vegan', 'Natural']".

    Returns:
        str: "<title>. Brand: <brand>. Categories: <c1, c2>. Tags: <t1, t2>. <description>"

    Raises:
        ValueError, SyntaxError: Propagated from ast.literal_eval when
            'categories' or 'tags' is not a valid Python literal.
    """
    # Build the comma-separated parts outside the f-string: reusing double
    # quotes inside a double-quoted f-string field is a SyntaxError on
    # Python versions before 3.12.
    categories = ", ".join(ast.literal_eval(item['categories']))
    tags = ", ".join(ast.literal_eval(item['tags']))

    return (f"{item['title']}. "
            f"Brand: {item['brand']}. "
            f"Categories: {categories}. "
            f"Tags: {tags}. "
            f"{item['description']}")

# Show the combined text produced for the example product
example_text = combine_product_info(item)
print(example_text)

Hunter’s Icy Adventure – Sustainable Children’s Book. Brand: Wild Tribe Heroes. Categories: Mama & Baby, Toys & Books, Children's Books. Tags: Plastic Free, Vegan, Made In UK. The Wild Tribe Heroes books are a collection of gentle and engaging true stories about animals that find themselves in trouble when their lives are affected by plastic in the oceans or palm oil deforestation. Follow each one of our heroes to see what happens, how they are saved and what you can do to help them! Children and adults alike will love these books and feel inspired to make positive changes to their own lives and their communities.


In [6]:
# Build the combined text row by row, store it as a new column, then display that column
combined_text = df.apply(combine_product_info, axis="columns")
df['combined_text'] = combined_text
df['combined_text']

0       Wooden Styling Comb. Brand: Eco Living. Catego...
1       Organic Linen Reusable Coffee Filters No 4 – 2...
2       Bamboo Toothbrush Soft Bristles – 4 Pack. Bran...
3       Wide Neck Baby Glass Bottle With Sleeve – Seaf...
4       Eye Time – Caffeinated Probiotic Eye Cream. Br...
                              ...                        
2591    Beeswax Wraps – Sandwich & Big Bowl 2 Pack – I...
2592    Pure Linen Peg Bag – Stripes. Brand: Helen Rou...
2593    Lemongrass & Tea Tree Soap Bar – 100g. Brand: ...
2594    Peppa Pig Wash Bar with Rose Water & Aloe Vera...
2595    Lemongrass Shaving Soap – 100g. Brand: Peace W...
Name: combined_text, Length: 2596, dtype: object

# Embed the Combined Text using Gemini Embeddings API

## Testing Embeddings

In [7]:
from google import genai
from google.genai import types
import time
import os
from dotenv import load_dotenv

# Pull settings (including API_KEY) from the .env file into the environment
load_dotenv()

# Create the GenAI client used for all embedding requests
client = genai.Client(api_key=os.getenv("API_KEY"))

query = "Wooden, natural, plastic free Comb"
first_product_text = df.iloc[0]['combined_text']

# Embed the first product's combined text as a retrieval document
document_config = types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
response = client.models.embed_content(
    model="text-embedding-004",
    contents=[first_product_text],
    config=document_config)
embedding1 = response.embeddings[0].values

# Embed the search query as a retrieval query
query_config = types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
response = client.models.embed_content(
    model="text-embedding-004",
    contents=[query],
    config=query_config)
embedding2 = response.embeddings[0].values

# Score the document against the query with a dot product
similarity = np.dot(embedding1, embedding2)
print(f"Embedded Text: {first_product_text}")
print(f"Query: {query}")
print("Similarity: " + str(similarity))

Embedded Text: Wooden Styling Comb. Brand: Eco Living. Categories: Haircare, Brushes & Combs. Tags: Natural, Plastic Free, Biodegradable, Sustainable. A beautiful beech wood comb with rounded teeth. Natural or wooden bristles are gentle to the hair structure and avoid damage, would suit thick or curly hair.
Query: Wooden, natural, plastic free Comb
Similarity: 0.7327470204778788


## Embedding the Combined Text Column

In [8]:
# Rate limiting: derive the pause between requests from the per-minute request budget
SECONDS_PER_MINUTE = 60
REQUESTS_PER_MINUTE = 1500

# Google's text-embedding-004 allows batching up to 100 documents per request,
# so use the largest batch to keep the number of API calls down
BATCH_SIZE = 100

# Every batch costs exactly one API call, so the batch budget equals the request budget
API_CALLS_PER_BATCH = 1
BATCHES_PER_MINUTE = REQUESTS_PER_MINUTE / API_CALLS_PER_BATCH

# Spread the batches evenly across the minute
DELAY_BETWEEN_BATCHES = SECONDS_PER_MINUTE / BATCHES_PER_MINUTE

print(f"Batch Size: {BATCH_SIZE}")
print(f"Required delay between batches: {DELAY_BETWEEN_BATCHES:.4f} seconds")


# --- Generate Embeddings with Batching and Rate Limiting ---
# Embeds df['combined_text'] in chunks of BATCH_SIZE rows. After the loop,
# embeddings_list holds one entry per DataFrame row, in row order: the
# embedding values on success, or None for every row of a batch that still
# failed after all retries. error_indices records the starting row offset
# of each failed batch.
print("\nStarting embedding generation...")

# Client used for the embedding requests in this cell
client = genai.Client(api_key=os.getenv("API_KEY"))

embeddings_list = []
error_indices = []
retries = 3 # Number of retries for a failed batch

total_batches = int(np.ceil(len(df) / BATCH_SIZE))
print(f"Total rows: {len(df)}, Total batches: {total_batches}")

# Process data in batches
for i in range(0, len(df), BATCH_SIZE):
    batch_texts = df['combined_text'].iloc[i:i + BATCH_SIZE].tolist()
    # Derive the batch number from BATCH_SIZE so it stays right if the size changes
    batch_number = i // BATCH_SIZE + 1

    batch_embeddings = None
    # One initial attempt plus up to `retries` further attempts
    for attempt in range(retries + 1):
        try:
            response = client.models.embed_content(
                model="text-embedding-004",
                contents=batch_texts,
                config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT"))
            batch_embeddings = [embedding.values for embedding in response.embeddings]
            break
        except Exception as exc:
            # Report the failure instead of aborting the whole run; the batch
            # is retried below or recorded as failed once attempts run out.
            print(f"Batch {batch_number} attempt {attempt + 1} failed: {exc}")
            if attempt < retries:
                # Back off 1s, 2s, 4s, ... before the next attempt
                time.sleep(max(DELAY_BETWEEN_BATCHES, 2 ** attempt))

    if batch_embeddings is None or len(batch_embeddings) != len(batch_texts):
        # Keep embeddings_list aligned with the DataFrame rows: one None per failed row
        error_indices.append(i)
        embeddings_list.extend([None] * len(batch_texts))
        print(f"Batch {batch_number} Failed")
    else:
        embeddings_list.extend(batch_embeddings)
        print(f"Batch {batch_number} Complete")

    # --- Rate Limiting Delay ---
    time.sleep(DELAY_BETWEEN_BATCHES)


# --- 3. Add Embeddings to DataFrame ---
print("\nEmbedding generation complete.")

# Report any batches that were recorded as failed
if len(error_indices) > 0:
    print(f"Warning: Errors occurred for batches starting at indices: {error_indices}")
    print("Embeddings for these batches will be None.")

# Only attach the column when there is exactly one embedding entry per row
n_embeddings = len(embeddings_list)
n_rows = len(df)
if n_embeddings != n_rows:
    print(f"Error: Number of generated embeddings ({n_embeddings}) does not match DataFrame length ({n_rows}).")
    print("Embeddings column was not added. Please check errors.")
else:
    df['embeddings'] = embeddings_list
    print("Embeddings column added successfully.")
    # Count the rows whose embedding is missing (None)
    null_embeddings_count = df['embeddings'].isnull().sum()
    if null_embeddings_count > 0:
        print(f"Number of rows with failed embeddings (None): {null_embeddings_count}")


Batch Size: 100
Required delay between batches: 0.0400 seconds

Starting embedding generation...
Total rows: 2596, Total batches: 26
Batch 1 Complete
Batch 2 Complete
Batch 3 Complete
Batch 4 Complete
Batch 5 Complete
Batch 6 Complete
Batch 7 Complete
Batch 8 Complete
Batch 9 Complete
Batch 10 Complete
Batch 11 Complete
Batch 12 Complete
Batch 13 Complete
Batch 14 Complete
Batch 15 Complete
Batch 16 Complete
Batch 17 Complete
Batch 18 Complete
Batch 19 Complete
Batch 20 Complete
Batch 21 Complete
Batch 22 Complete
Batch 23 Complete
Batch 24 Complete
Batch 25 Complete
Batch 26 Complete

Embedding generation complete.
Embeddings column added successfully.


## Testing if the texts are properly embedded

In [10]:
# Display only the embeddings column, as a one-column frame
df.loc[:, ['embeddings']]

Unnamed: 0,embeddings
0,"[-0.05316057, 0.03311698, 0.010515335, -0.0205..."
1,"[-0.022856688, 0.0035480985, 0.024170972, 0.00..."
2,"[0.010721589, 0.0043742913, -0.024230765, -0.0..."
3,"[0.032509547, -0.00430838, 0.0014149724, -0.02..."
4,"[-0.01010536, -0.033673253, -0.009111277, -0.0..."
...,...
2591,"[-0.02776007, 0.010913679, 0.022175308, -0.017..."
2592,"[-0.012673198, 0.024075322, 0.0066878675, 0.03..."
2593,"[0.019729247, -0.001510754, -0.035353225, -0.0..."
2594,"[0.031478737, -0.0063198153, -0.07759639, -0.0..."


In [9]:
# The embedding of row 0 from the batch run should equal the one computed in the test cell
first_batch_embedding = df.loc[0, 'embeddings']
print(embedding1 == first_batch_embedding)

True


## Querying the embedded dataframe

In [18]:
query = "Eco-friendly shampoo alternatives such as shampoo bars, shampoo liquids, shampoo cubes or shampoo powder."

# Embed the search query as a retrieval query
query_config = types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
response = client.models.embed_content(
    model="text-embedding-004",
    contents=[query],
    config=query_config)
embedded_query = response.embeddings[0].values

# Stack the per-row embeddings into one matrix and score every product against the query
embedding_matrix = np.stack(df['embeddings'])
dot_products = np.dot(embedding_matrix, embedded_query)
print(f"Dot products: {len(dot_products)}")

# argsort is ascending, so flip it to put the highest similarity first
idx = np.flip(np.argsort(dot_products))

# Get the top 5 most similar products
top_5_indices = idx[:5]
top_5_products = df.iloc[top_5_indices]
print("Top 5 most similar products:")
for rank, (row_position, product) in enumerate(zip(top_5_indices, top_5_products.itertuples()), start=1):
    score = dot_products[row_position]
    category_names = ', '.join(ast.literal_eval(product.categories))
    tag_names = ', '.join(ast.literal_eval(product.tags))
    print(f"Rank {rank}: {product.title} (Similarity: {score:.4f})")
    print(f"Brand: {product.brand}")
    print(f"Categories: {category_names}")
    print(f"Tags: {tag_names}")
    print(f"Description: {product.description}\n")

Dot products: 2596
Top 5 most similar products:
Rank 1: 30 Shampoo and Body Wash Cubes – Sensitive Scalp (Similarity: 0.6916)
Brand: Hedgerow & Moor
Categories: Haircare, All Shampoo, Shampoo Cubes
Tags: Vegan, Palm Oil Free, Natural, Plastic Free, Biodegradable, Handmade, Made In UK, Sustainable
Description: A unique, sustainable shampoo and body wash that creates an amazing lather and is fragrance free, making it perfect for those with allergies or sensitivities. Plus, the prebiotic blend helps keep your scalp healthy while the chickweed powder, mustard seed oil and passionflower oil leave your hair looking soft, shiny and healthy. From the creators of Beauty Kubes, these plastic free shampoo cubes are the perfect, guilt-free solution to achieving healthy hair and scalp.

Rank 2: 30 Shampoo and Body Wash Cubes – Men (Similarity: 0.6899)
Brand: Hedgerow & Moor
Categories: Haircare, All Shampoo, Shampoo Cubes
Tags: Vegan, Palm Oil Free, Natural, Plastic Free, Biodegradable, Handmade, M

# Writing the embeddings to CSV

In [12]:
import csv
# Write the frame, including the embeddings column, to disk without the index column
EMBEDDINGS_CSV = 'products_with_embeddings.csv'
df.to_csv(EMBEDDINGS_CSV, index=False)

# Calculating unique categories and tags to be used later

In [15]:
# Parse each stringified list, flatten to one value per row, and keep the distinct values
parsed_categories = df['categories'].map(ast.literal_eval)
categories = parsed_categories.explode().unique()

# Same steps for the tags column
parsed_tags = df['tags'].map(ast.literal_eval)
tags = parsed_tags.explode().unique()

print("Unique Categories:")
print(list(categories))
print("Unique Tags:")
print(tags)

Unique Categories:
['Haircare', 'Brushes & Combs', 'Kitchen', 'Kitchen Essentials', 'Bathroom', 'Toothbrushes', 'Mama & Baby', 'Baby Bottles', 'Skincare', 'Eye Creams', 'Soap Bars', 'Facial Cleansing Soap Bars', 'Food & Drink', 'Tea', 'All Shampoo', 'Shampoo Bars', 'Body Oil', 'Deodorants', 'Deodorant Tins', 'For The Home', 'Makeup', 'Complexion', 'Foundations', 'On-The-Go', 'Produce Bags', 'Glitter', 'Glitter Sets', 'Gifts', 'Gift Sets', 'Skincare Accessories', 'All Conditioners', 'Conditioner Liquid', 'Pets Supplies', 'Dog Treats', 'Deodorant Sticks', 'Facial Serums', 'Toys & Books', "Children's Books", 'Suncream', 'Gardening', 'Face Masks', 'Shampoo Liquid', 'Stationery', 'Gift Wrap', 'Eyes', 'Eyeshadows', 'Shampoo Cubes', 'Nuts & Seeds', 'Seeds', 'Dish Cloths & Towels', 'Hair Treatments & Masks', 'Safety Razors', 'Poop Bags', 'Flour & Baking', 'Flours & Baking', 'Loose Tea', 'Kids Toothbrushes', 'Chocolate', 'All Drinking Bottles', 'Glass Bottles', 'Pacifiers & Rattles', 'Nuts', 'H