In [1]:
import pandas as pd
import numpy as np
import os
import openai
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process
# environment, then pass the OpenAI key to the client library.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    # os.getenv returns None when the variable is absent. Stop here with a
    # clear message instead of failing later, when the embedding requests
    # are made.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
openai.api_key = openai_api_key

# imports
import tiktoken

from openai.embeddings_utils import get_embedding
# --- Embedding configuration ---
# Model used to embed each row, and the tokenizer encoding that goes with it.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
# Token budget per row; kept below the 8191-token maximum of
# text-embedding-ada-002.
max_tokens = 8000

# --- Source data ---
input_datapath = "data/BigBazaar.csv"
df = pd.read_csv(filepath_or_buffer=input_datapath)

In [2]:
# Clean the loaded frame.
# Columns read later in this notebook: Name, Brand, Price, DiscountedPrice,
# BreadCrumbs, Category, SubCategory, Quantity, Description, LongDescription.
# Rows with any missing value are dropped, then every column is cast to str
# so the values can be concatenated as text.
df = df.dropna().astype(str)

In [3]:
# Merge every descriptive column into a single "Label: value" text per row,
# with " ; " between consecutive fields.
combined_fields = [
    "Name",
    "Brand",
    "Price",
    "DiscountedPrice",
    "BreadCrumbs",
    "Category",
    "SubCategory",
    "Quantity",
    "Description",
    "LongDescription",
]
# One labelled, whitespace-stripped Series per field.
labelled_parts = [name + ": " + df[name].str.strip() for name in combined_fields]

combined_text = labelled_parts[0]
for part in labelled_parts[1:]:
    combined_text = combined_text + " ; " + part

df["combined"] = combined_text

In [4]:
encoding = tiktoken.get_encoding(embedding_encoding)
top_n = 1000


def _count_tokens(text):
    """Return the number of tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


df["n_tokens"] = df["combined"].apply(_count_tokens)

# Order rows from most to fewest tokens, discard any row over the
# max_tokens budget, and keep the first top_n of what remains.
df = (
    df.sort_values(by="n_tokens", ascending=False)
    .loc[lambda frame: frame["n_tokens"] <= max_tokens]
    .head(top_n)
)
len(df)

1000

In [5]:
def _count_words(text):
    """Return the number of whitespace-separated words in `text`."""
    return len(text.split())


df["n_words"] = df["combined"].apply(_count_words)
# Largest word count among the selected rows.
df["n_words"].max()


429

In [6]:
# Sum of the per-row token counts over all selected rows.
total_tokens = df["n_tokens"].sum()
print(total_tokens)

236168


In [7]:
def _embed(text):
    """Return the embedding of `text` from the configured embedding model."""
    return get_embedding(text, engine=embedding_model)


# One get_embedding call per row; the result is stored next to the source text.
df["embedding"] = df["combined"].apply(_embed)

# Persist the frame, including the new embedding column, as CSV.
df.to_csv("data/BigBazaar_head_1000_vector_db.csv")

In [8]:
# Preview the first five rows, including the token/word counts and embeddings.
df.head(n=5)

Unnamed: 0,Name,Brand,Price,DiscountedPrice,BreadCrumbs,Category,SubCategory,Quantity,Description,LongDescription,combined,n_tokens,n_words,embedding
23946,1 Litre Cordless Electric Kettle 1100W KEK104R...,KORYO,1299.0,699.0,Kitchen Appliances / Home & Kitchen Appliances,Kitchen Appliances,Home & Kitchen Appliances,1 Pcs,,"\n <li class=""MsoNormal"" style=""color:black;ms...",Name: 1 Litre Cordless Electric Kettle 1100W K...,638,168,"[0.027253031730651855, -0.01505531556904316, -..."
26504,GX 3701 750W Mixer Grinder with Nutri-Pro Feat...,Bajaj,5350.0,3999.0,Electronics / Kitchen Appliances / Home & Kitc...,Electronics,Home & Kitchen Appliances,1 Pcs,,"\n <li class=""MsoListParagraph"" style=""color:#...",Name: GX 3701 750W Mixer Grinder with Nutri-Pr...,611,196,"[0.002546899951994419, -0.0013742265291512012,..."
18698,"Mixing Bowl - Oven & Microwave Safe, 900 ml",BOROSIL,410.0,369.0,Cooking and Baking / Bakeware,Cooking and Baking,Bakeware,1 Pcs,,Colour TransparentPattern BowlItem Dimensions ...,"Name: Mixing Bowl - Oven & Microwave Safe, 900...",594,420,"[-0.0077776857651770115, -0.029939765110611916..."
3287,Mouthwash Fresh Tea,Colgate,150.0,99.99,Personal Care / Oral Care,Personal Care,Oral Care,250 ml,,Colgate Plax Fresh Tea mouthwash removes over ...,Name: Mouthwash Fresh Tea ; Brand: Colgate ; P...,585,429,"[0.011543926782906055, 0.002320263534784317, 0..."
397,Glasstop Gas Stove Premia 2 Burner D,PRESTIGE,10195.0,7136.5,Cooking and Baking / Gas stove & Induction,Cooking and Baking,Gas stove & Induction,1 Pcs,,Gas stove type: Manual; Burner material: Brass...,Name: Glasstop Gas Stove Premia 2 Burner D ; B...,583,390,"[0.01492131408303976, -0.005848828703165054, -..."
