importing modules and defining functions... with variables

In [1]:
import pandas as pd
import numpy as np
import os
import openai
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = openai_api_key

# imports
import tiktoken

from openai.embeddings_utils import get_embedding
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
input_datapath = "data/BigBazaar.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath)

let's create embedding and vector db for bigbaazar data

In [9]:
df.head(2)

Unnamed: 0,Name,Brand,Price,DiscountedPrice,BreadCrumbs,Category,SubCategory,Quantity,Description,LongDescription
0,Glass Water Tumbler Set - Acme Elite,Treo,550,299.0,Serveware / Dining & Serving,Serveware,Dining & Serving,6*300 ml,,Design:Made from premium quality Glass
1,Papad Plain,NILGIRIS,85,42.5,Packaged Food / Pickles & Papads,Packaged Food,Pickles & Papads,200 g,Nilgiris plain papad can be enjoyed when you'r...,Nilgiris plain papad can be enjoyed when you'r...


In [10]:
df.columns

Index(['Name', 'Brand', 'Price', 'DiscountedPrice', 'BreadCrumbs', 'Category',
       'SubCategory', 'Quantity', 'Description', 'LongDescription'],
      dtype='object')

In [15]:
# load & inspect dataset
# columns: Brand	Price	DiscountedPrice	BreadCrumbs	Category	SubCategory	Quantity	Description	LongDescription
df = df.dropna()
# convert all columns to string
df = df.astype(str)

In [12]:
len(df)

8219

In [26]:
# add all columns data into one columns
df["combined"] = (
    "Name: " + df.Name.str.strip() +" ; "+ 
    "Brand: " + df.Brand.str.strip() +" ; "+ 
    "Price: " + df.Price.str.strip() +" ; "+
    "DiscountedPrice: " + df.DiscountedPrice.str.strip() +" ; "+
    "BreadCrumbs: " + df.BreadCrumbs.str.strip() +" ; "+
    "Category: " + df.Category.str.strip() +" ; "+
    "SubCategory: " + df.SubCategory.str.strip() +" ; "+
    "Quantity: " + df.Quantity.str.strip() +" ; "+
    "Description: " + df.Description.str.strip() +" ; "+
    "LongDescription: " + df.LongDescription.str.strip()
)

In [27]:
encoding = tiktoken.get_encoding(embedding_encoding)
top_n = 500
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

8000

In [33]:
#calculate avg number of words in df.combined
df["n_words"] = df.combined.apply(lambda x: len(x.split()))
df.n_words.max()


429

In [35]:
# which row have the most words in df.combined using df.n_words.max()
df[df.n_words == df.n_words.max()]

Unnamed: 0,Name,Brand,Price,DiscountedPrice,BreadCrumbs,Category,SubCategory,Quantity,Description,LongDescription,combined,n_tokens,n_words
3287,Mouthwash Fresh Tea,Colgate,150,99.99,Personal Care / Oral Care,Personal Care,Oral Care,250 ml,,Colgate Plax Fresh Tea mouthwash removes over ...,Name: Mouthwash Fresh Tea ; Brand: Colgate ; P...,585,429


In [37]:
total_tokens = df.n_tokens.sum()
print(total_tokens)

941007


In [38]:
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv("data/BigBazaar_vector_db.csv")

In [3]:
import ast
import faiss

In [5]:
df = pd.read_csv("data/BigBazaar_vector_db.csv")

In [9]:
df1 = pd.read_csv("data/BigBazaar_vector_db_1.csv")

In [10]:
df1.embedding[0]

'[-0.03161328658461571, -0.007357346825301647, 0.007983220741152763, 0.0010028962278738618, -0.009408081881701946, -0.020813634619116783, -0.036353949457407, -0.01245755236595869, -0.023703306913375854, -0.014341832138597965, 0.01019375305622816, 0.041147876530885696, -0.03371728956699371, -0.02586057409644127, -0.018776215612888336, 0.042985547333955765, 0.009980689734220505, -0.016019707545638084, 0.009254942648112774, -0.018163656815886497, -0.02990878000855446, -0.01693854294717312, -0.01403555367141962, 0.0053665353916585445, -0.026113586500287056, -0.018616417422890663, 0.02443571202456951, -0.0257407259196043, 0.015553630888462067, 0.03395698592066765, 0.008416005410254002, 0.027884677052497864, -0.005496370606124401, -0.01729808747768402, -0.006664890330284834, 0.0016013048589229584, -0.01668553054332733, -0.002658299170434475, 0.032145943492650986, 0.003180970437824726, 0.019095810130238533, 0.022185230627655983, 0.009974031709134579, 0.027005789801478386, -0.02194553427398204

In [4]:
df = pd.read_csv("data/BigBazaar_vector_db.csv")
# Apply ast.literal_eval to convert the strings back to lists
df['embedding'] = df['embedding'].apply(ast.literal_eval)
vectors = df.embedding.to_list()
vectors = np.array(vectors)
vectors.shape
dim = vectors.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(vectors)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<unknown>, line 1)

In [76]:
df = pd.read_csv("data/BigBazaar_vector_db.csv")

In [80]:
df['embedding'] = df['embedding'].apply(ast.literal_eval)
df['embedding'] = df['embedding'].apply(np.array)

In [81]:
type(df.embedding[0])

numpy.ndarray

In [82]:
#save df to csv
df.to_csv("data/BigBazaar_vector_db.csv",index=False)

In [84]:
df.columns

Index(['Unnamed: 0', 'Name', 'Brand', 'Price', 'DiscountedPrice',
       'BreadCrumbs', 'Category', 'SubCategory', 'Quantity', 'Description',
       'LongDescription', 'combined', 'n_tokens', 'n_words', 'embedding'],
      dtype='object')

In [70]:
#save index as as vector_index.pkl
faiss.write_index(index, "data/vector_index.pkl")

In [None]:
#load or read index
index = faiss.read_index("data/vector_index.pkl")



In [51]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "system", "content": """
        you will first translate the input query into english.
        you will extract data from the translated query in the format of json schema provided.
        json schema:{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "query": {
      "type": "object",
      "properties": {
        "translated_query": {
          "type": "string"
        },
        "items": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "item_name": {
                "type": "string"
              },
              "brand": {
                "type": "string"
              },
              "quantity": {
                "type": "number"
              },
              "unit": {
                "type": "string"
              },
              "Important_words": {
                "type": "array",
                "items": {
                  "type": "string"
                }
              },
              "features": {
                "type": "array",
                "items": {
                  "type": "string"
                }
              },
              "occasion": {
                "type": "array",
                "items": {
                  "type": "string"
                }
              },
              "price_range": {
                "type": "object",
                "properties": {
                  "minimum_price": {
                    "type": "number"
                  },
                  "maximum_price": {
                    "type": "number"
                  }
                },
                "required": ["minimum_price", "maximum_price"]
              }
            },
            "required": ["item_name"]
          }
        }
      },
      "required": ["translated_query", "items"]
    }
  },
  "required": ["query"]
}
        """},
        {"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [52]:
print(get_completion("mujhe mere chote bhai ke liye onelplus ka phone chaiye, jisme 6 gb ram ho aur 128 gb storage ho, aur uska price 20000 se 30000 ke beech ho "))

{
    "query": {
        "translated_query": "I want an OnePlus phone for my younger brother with 6GB RAM and 128GB storage, and its price should be between 20000 and 30000.",
        "items": {
            "brand": "OnePlus",
            "quantity": 1,
            "Important_words": ["phone", "brother"],
            "features": ["6GB RAM", "128GB storage"],
            "occasion": [],
            "price_range": {
                "minimum_price": 20000,
                "maximum_price": 30000
            }
        }
    }
}


In [90]:
search_query = get_completion(input("Enter your search query: "))
print(search_query)
search_vec = get_embedding(search_query, engine=embedding_model)
search_vec = np.array(search_vec).reshape(1, -1)
distances, I = index.search(search_vec, k=5)
I.tolist()
row_indices = I.tolist()[0]
#show only these columns = ['Name', 'Brand', 'Price', 'DiscountedPrice','Category', 'SubCategory', 'Quantity']
# df1 = df[['Name', 'Brand', 'Price', 'DiscountedPrice','Category', 'SubCategory', 'Quantity']]
top_5_results = df[['Name', 'Brand', 'Price', 'DiscountedPrice','Category', 'SubCategory', 'Quantity']].loc[row_indices]
print(top_5_results.to_dict(orient="records"))

{
    "query": {
        "translated_query": "phone and wheat",
        "items": [
            {
                "brand": "phone",
                "quantity": 1,
                "important_words": ["phone"],
                "features": [],
                "occasion": [],
                "price_range": {
                    "minimum_price": 0,
                    "maximum_price": 0
                }
            },
            {
                "brand": "wheat",
                "quantity": 1,
                "important_words": ["wheat"],
                "features": [],
                "occasion": [],
                "price_range": {
                    "minimum_price": 0,
                    "maximum_price": 0
                }
            }
        ]
    }
}
[{'Name': 'Pizza Wheat Bread Pack of 2', 'Brand': 'NILGIRIS', 'Price': '40', 'DiscountedPrice': 40.0, 'Category': 'Bakery, Dairy & Frozen', 'SubCategory': 'Bread & Bakery', 'Quantity': '150 g'}, {'Name': 'Wheat Daliya Small', 'Brand

In [150]:
df.to_csv("data/BigBazaar_vector_db_1.csv")

In [60]:
# convert search_query to json
import json
search_query = json.loads(search_query)
search_query

{'query': {'translated_query': 'I want rice bag, wheat, biscuits, and chillies',
  'items': [{'brand': 'rice bag',
    'quantity': 1,
    'important_words': ['rice'],
    'features': [],
    'occasion': [],
    'price_range': {'minimum_price': 0, 'maximum_price': 0}},
   {'brand': 'wheat',
    'quantity': 1,
    'important_words': ['wheat'],
    'features': [],
    'occasion': [],
    'price_range': {'minimum_price': 0, 'maximum_price': 0}},
   {'brand': 'biscuits',
    'quantity': 1,
    'important_words': ['biscuits'],
    'features': [],
    'occasion': [],
    'price_range': {'minimum_price': 0, 'maximum_price': 0}},
   {'brand': 'chillies',
    'quantity': 1,
    'important_words': ['chillies'],
    'features': [],
    'occasion': [],
    'price_range': {'minimum_price': 0, 'maximum_price': 0}}]}}

In [66]:
search_query["query"]["items"][0]

{'brand': 'rice bag',
 'quantity': 1,
 'important_words': ['rice'],
 'features': [],
 'occasion': [],
 'price_range': {'minimum_price': 0, 'maximum_price': 0}}

In [11]:
index = faiss.read_index("data/vector_index.pkl")

In [12]:
import pickle 
import lzma
with lzma.open("vector_index.xz", "wb") as f:
    pickle.dump(index, f)

In [2]:
import pandas as pd
df = pd.read_csv("data/BigBazaar_vector_db.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Brand,Price,DiscountedPrice,BreadCrumbs,Category,SubCategory,Quantity,Description,LongDescription,combined,n_tokens,n_words,embedding
0,229,Anti Acne Facewash Neem,Himalaya,210,210.0,Skin Care & Beauty / Skin Care,Skin Care & Beauty,Skin Care,200 ml,Himalaya Turmeric Herbals Purifying Neem Face ...,Purifying Neem Face Wash,Name: Anti Acne Facewash Neem ; Brand: Himalay...,118,79,[ 0.01035868 0.01985359 0.00960615 ... -0.02...
1,230,Anti Acne Facewash Neem,Himalaya,170,145.0,Skin Care & Beauty / Skin Care,Skin Care & Beauty,Skin Care,150 ml,Himalaya Turmeric Herbals Purifying Neem Face ...,Purifying Neem Face Wash,Name: Anti Acne Facewash Neem ; Brand: Himalay...,118,79,[ 0.00945115 0.0220004 0.00897402 ... -0.02...
2,231,Anti Acne Facewash Neem,Himalaya,130,117.0,Skin Care & Beauty / Skin Care,Skin Care & Beauty,Skin Care,100 ml,Himalaya Turmeric Herbals Purifying Neem Face ...,Purifying Neem Face Wash,Name: Anti Acne Facewash Neem ; Brand: Himalay...,118,79,[ 0.00863239 0.01953785 0.00575493 ... -0.02...
3,232,Mango Drink,FROOTI,105,105.0,Beverages / Soft Drinks & Juices,Beverages,Soft Drinks & Juices,2.25 L,,Frooti Mango Drink is something that would boo...,Name: Mango Drink ; Brand: FROOTI ; Price: 105...,116,73,[-0.00132543 -0.0154933 0.00209192 ... -0.01...
4,233,Mango Drink,FROOTI,100,100.0,Beverages / Soft Drinks & Juices,Beverages,Soft Drinks & Juices,2 L,,Frooti Mango Drink is something that would boo...,Name: Mango Drink ; Brand: FROOTI ; Price: 100...,114,73,[ 0.0016192 -0.01780541 -0.00098317 ... -0.02...


In [3]:
#drop last 4 columns
df = df.iloc[:, :-4]
df.to_csv("data/BigBazaar_vector_db_git.csv",index=False)