# LOADING THE LIBRARIES

In [1]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.27.1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2

In [10]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.5.1


In [2]:
import chromadb

In [3]:
# PyDrive plus the Colab/OAuth helpers are used in the next cells to authenticate
# and to download the product CSV from Google Drive by file id.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials



In [4]:
# Authenticate this Colab session, then build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): the name `drive` is rebound later by `from google.colab import drive`;
# once that cell has run, this PyDrive client is no longer reachable under this name.
drive = GoogleDrive(gauth)

In [5]:
# Fetch the product-codes file from Google Drive by file id and save it locally as
# 'Product code.csv'. Replace the id with that of the file in your own Google Drive.
downloaded = drive.CreateFile({'id':'1knKLc16AwF39X1RMq0v92dzN008l--pH'})
downloaded.GetContentFile('Product code.csv')
# NOTE(review): the link below carries a different file id from the one used above — confirm which is current.
# https://drive.google.com/file/d/192Wh5hV6Dwz5QcF14q7ItZ97PfcFAoih/view?usp=drive_link

# LOADING THE DATA

Column Descriptors
- StockCode: A code used to identify the product that was purchased
- Description: A brief description of the product that was purchased
- Details: Detailed description of the purchased product

In [6]:
import csv

# Parallel lists in the shape ChromaDB's `add` expects: entry k of each list describes the same product.
documents = []  # product Description text (column 1)
metadatas = []  # {"item_id": StockCode} (column 0)
ids = []        # unique string id per document: "1", "2", ... — like a primary key

# Load Product Code data.
# newline='' is what the csv module asks for when it is handed a file object.
with open('Product code.csv', newline='') as file:
    lines = csv.reader(file)

    # Skip the first row (the column headers); the default keeps an empty file from raising.
    next(lines, None)

    for line in lines:
        # Blank or truncated rows have no Description column to index, so skip them.
        if len(line) < 2:
            continue

        documents.append(line[1])
        metadatas.append({"item_id": line[0]})
        # Number from 1 off the list length instead of a counter named `id`,
        # which shadowed the builtin `id()` for the rest of the notebook.
        ids.append(str(len(ids) + 1))

In [7]:
# Sanity check: number of product rows loaded (the header row is not counted).
len(metadatas)

3839

In [41]:
from google.colab import drive

In [42]:
# Mount Google Drive; the Chroma client below persists under /content/drive.
# `drive` here is the google.colab module imported in the previous cell, not the PyDrive client created earlier.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Instantiate the Chroma client





Create collection for embeddings

Query collection

In [43]:
from chromadb.utils import embedding_functions

# In-memory alternative (data is stored in memory only):
# chroma_client = chromadb.Client()

# Persistent client: data is stored on disk under the given path — here the
# "Colab Notebooks" folder of the mounted Google Drive, so Drive must be mounted first.
chroma_client = chromadb.PersistentClient(path="/content/drive/My Drive/Colab Notebooks/")

In [44]:
# Embedding function backed by the "all-mpnet-base-v2" sentence-transformers model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Use this to delete the collection and start from scratch:
# chroma_client.delete_collection(name="my_collection")

# Open the collection (the vector database), creating it if it does not exist yet,
# and specify the model used to compute the embeddings.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)

In [45]:
# Write all the data to the vector database. ChromaDB converts the text to vector embeddings
# with the collection's embedding function and stores them. This may take a few minutes.
#
# `upsert` rather than `add`: the collection lives in a persistent store, so on every re-run
# the ids "1".."N" are already present. Upsert inserts missing ids and overwrites existing
# ones, which keeps this cell idempotent.
# NOTE(review): the 5000-line truncated output recorded for the original `add` call was
# presumably duplicate-id warnings from such a re-run — confirm.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


# Query the vector database

In [46]:


# Misspelled query ('PATRY BUKNING'): the correctly spelled items are expected among the 5 nearest matches.
results = collection.query(
    query_texts=["PATRY BUKNING"],
    include=['documents', 'distances', 'metadatas'],
    n_results=5,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')



[['RED BABY BUNTING ', 'PARTY BUNTING', 'SPOTTY BUNTING', 'PINK BABY BUNTING', 'ALARM CLOCK BAKELIKE CHOCOLATE']]
[[1.31281578540802, 1.3319528102874756, 1.3514946699142456, 1.3577312231063843, 1.3757050037384033]]
[[{'item_id': '22669'}, {'item_id': '47566'}, {'item_id': '23298'}, {'item_id': '22668'}, {'item_id': '22725'}]]


In [47]:

# Same misspelled query ('PATRY BUKNING'), widened to the 10 nearest matches.
results = collection.query(
    query_texts=["PATRY BUKNING"],
    include=['documents', 'distances', 'metadatas'],
    n_results=10,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['RED BABY BUNTING ', 'PARTY BUNTING', 'SPOTTY BUNTING', 'PINK BABY BUNTING', 'ALARM CLOCK BAKELIKE CHOCOLATE', 'ALARM CLOCK BAKELIKE IVORY', 'PAPER BUNTING PAISLEY PARK', 'CANDY SPOT BUNNY', 'ALARM CLOCK BAKELIKE ORANGE', 'PIN CUSHION BABUSHKA RED']]
[[1.31281578540802, 1.3319528102874756, 1.3514946699142456, 1.3577312231063843, 1.3757050037384033, 1.38956618309021, 1.392085375508959, 1.3989320993423462, 1.4158439636230469, 1.4181098937988281]]
[[{'item_id': '22669'}, {'item_id': '47566'}, {'item_id': '23298'}, {'item_id': '22668'}, {'item_id': '22725'}, {'item_id': '22730'}, {'item_id': '23597'}, {'item_id': '85089'}, {'item_id': '22729'}, {'item_id': '22448'}]]


In [48]:
# Word-variation query ('REGNCY CAKESTND 3 TIER'): 'REGENCY CAKESTAND 3 TIER' is the expected match.
results = collection.query(
    query_texts=["REGNCY CAKESTND 3 TIER"],
    include=['documents', 'distances', 'metadatas'],
    n_results=5,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')



[['REGENCY CAKESTAND 3 TIER', 'SWEETHEART CAKESTAND 3 TIER', '3 TIER CAKE TIN RED AND CREAM', '3 TIER CAKE TIN GREEN AND CREAM', 'CAKE STAND 3 TIER MAGIC GARDEN']]
[[0.5575416684150696, 0.6835066080093384, 0.8116592168807983, 0.8361910581588745, 0.8596285581588745]]
[[{'item_id': '22423'}, {'item_id': '22776'}, {'item_id': '22838'}, {'item_id': '22839'}, {'item_id': '22236'}]]


In [49]:
# Same word-variation query ('REGNCY CAKESTND 3 TIER'), widened to the 10 nearest matches.
results = collection.query(
    query_texts=["REGNCY CAKESTND 3 TIER"],
    include=['documents', 'distances', 'metadatas'],
    n_results=10,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')


[['REGENCY CAKESTAND 3 TIER', 'SWEETHEART CAKESTAND 3 TIER', '3 TIER CAKE TIN RED AND CREAM', '3 TIER CAKE TIN GREEN AND CREAM', 'CAKE STAND 3 TIER MAGIC GARDEN', 'NOVELTY BISCUITS CAKE STAND 3 TIER', 'CAKE STAND LOVEBIRD 2 TIER WHITE', 'CAKE STAND LOVEBIRD 2 TIER PINK', 'CAKE STAND WHITE TWO TIER LACE', 'REGENCY CAKE SLICE']]
[[0.5575416684150696, 0.6835066080093384, 0.8116592168807983, 0.8361910581588745, 0.8596285581588745, 0.9351198673248291, 1.0401690006256104, 1.0519944429397583, 1.0538241863250732, 1.1033684015274048]]
[[{'item_id': '22423'}, {'item_id': '22776'}, {'item_id': '22838'}, {'item_id': '22839'}, {'item_id': '22236'}, {'item_id': '22890'}, {'item_id': '22220'}, {'item_id': '22221'}, {'item_id': '22215'}, {'item_id': '23164'}]]


In [50]:
# Similar-meaning query ('WHITE HANGING HEART'): items with a similar description are expected.
results = collection.query(
    query_texts=["WHITE HANGING HEART"],
    include=['documents', 'distances', 'metadatas'],
    n_results=5,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['HANGING HEART BONHEUR', 'PLACE SETTING WHITE HEART', 'YELLOW FELT HANGING HEART W FLOWER', 'BLUE FELT HANGING HEART W FLOWER', 'CANDLEHOLDER PINK HANGING HEART']]
[[0.6221920251846313, 0.6616895198822021, 0.6793512105941772, 0.699192225933075, 0.7164210081100464]]
[[{'item_id': '23398'}, {'item_id': '22151'}, {'item_id': '35916A'}, {'item_id': '35916B'}, {'item_id': '22804'}]]


In [51]:
# Similar-meaning query ('WHITE HANGING HEART'), 10 nearest matches.
results = collection.query(
    query_texts=["WHITE HANGING HEART"],
    include=['documents', 'distances', 'metadatas'],
    n_results=10,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['HANGING HEART BONHEUR', 'PLACE SETTING WHITE HEART', 'YELLOW FELT HANGING HEART W FLOWER', 'BLUE FELT HANGING HEART W FLOWER', 'CANDLEHOLDER PINK HANGING HEART', 'PINK FELT HANGING HEART W FLOWER', 'BLACK HEART CARD HOLDER', 'IVORY HANGING DECORATION  HEART', 'ENGLISH ROSE SCENTED HANGING HEART', 'YELLOW METAL CHICKEN HEART ']]
[[0.6221920251846313, 0.6616895198822021, 0.6793512105941772, 0.699192225933075, 0.7164210081100464, 0.71767258644104, 0.7392807602882385, 0.7560070753097534, 0.7591907978057861, 0.7709988951683044]]
[[{'item_id': '23398'}, {'item_id': '22151'}, {'item_id': '35916A'}, {'item_id': '35916B'}, {'item_id': '22804'}, {'item_id': '35916C'}, {'item_id': '22188'}, {'item_id': '21385'}, {'item_id': '47574B'}, {'item_id': '84459B'}]]


In [54]:
# Natural-language request for a door mat, 15 nearest matches.
results = collection.query(
    query_texts=["I am looking for a door mat"],
    include=['documents', 'distances', 'metadatas'],
    n_results=15,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['DOORMAT VINTAGE LEAVES DESIGN ', 'DOORMAT MULTICOLOUR STRIPE', 'DOORMAT NEW ENGLAND', 'RECYCLED ACAPULCO MAT BLUE', 'RECYCLED ACAPULCO MAT PINK', 'RECYCLED ACAPULCO MAT RED', 'RECYCLED ACAPULCO MAT GREEN', 'BLACK ENCHANTED FOREST PLACEMAT', 'DOORMAT WELCOME TO OUR HOME', 'KNEELING MAT HOUSEWORK  DESIGN', 'DOORMAT TOPIARY', 'SQUARE FLOOR CUSHION VINTAGE RED', 'RECYCLED ACAPULCO MAT TURQUOISE', 'RECYCLED ACAPULCO MAT LAVENDER', 'DOORMAT WELCOME PUPPIES']]
[[1.0683703422546387, 1.075612187385559, 1.0783156156539917, 1.0889254808425903, 1.090099573135376, 1.1124440431594849, 1.1128441095352173, 1.1211400032043457, 1.1349525451660156, 1.1440937519073486, 1.1624468564987183, 1.1683390140533447, 1.171748161315918, 1.1723747253417969, 1.175325870513916]]
[[{'item_id': '23283'}, {'item_id': '48116'}, {'item_id': '48187'}, {'item_id': '23051'}, {'item_id': '23053'}, {'item_id': '23049'}, {'item_id': '23050'}, {'item_id': '85114A'}, {'item_id': '22692'}, {'item_id': '23302'}, {'item_id': '4812

In [55]:
# Natural-language request for a hand warmer, 15 nearest matches.
results = collection.query(
    query_texts=["Pls suggest a hand warmer"],
    include=['documents', 'distances', 'metadatas'],
    n_results=15,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['HI TEC ALPINE HAND WARMER', 'HAND WARMER BABUSHKA DESIGN', 'LOVE HEART POCKET WARMER', 'HAND WARMER RED POLKA DOT', 'HOT WATER BOTTLE I AM SO POORLY', 'HAND WARMER UNION JACK', 'HAND WARMER SCOTTY DOG DESIGN', 'HAND WARMER RED LOVE HEART', 'TEA TIME OVEN GLOVE', 'HAND WARMER BIRD DESIGN', 'HAND WARMER OWL DESIGN', 'HOT WATER BOTTLE KEEP CALM', 'HOT WATER BOTTLE TEA AND SYMPATHY', 'HOT STUFF HOT WATER BOTTLE', 'HOT WATER BOTTLE BABUSHKA ']]
[[0.7037321329116821, 0.8033459186553955, 0.9284428954124451, 0.937609076499939, 0.9381437301635742, 0.9641802310943604, 0.9826898574829102, 0.9888186454772949, 1.0015605688095093, 1.014425277709961, 1.018113374710083, 1.0559507608413696, 1.0649542808532715, 1.0894415378570557, 1.0988456010818481]]
[[{'item_id': '70007'}, {'item_id': '22834'}, {'item_id': '70006'}, {'item_id': '22632'}, {'item_id': '22835'}, {'item_id': '22633'}, {'item_id': '22866'}, {'item_id': '23439'}, {'item_id': '47559B'}, {'item_id': '22867'}, {'item_id': '22865'}, {'item_i

In [56]:
# Natural-language request for a cosmetic bag, 15 nearest matches.
results = collection.query(
    query_texts=["I want a cosmetic bag"],
    include=['documents', 'distances', 'metadatas'],
    n_results=15,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['PINK GREEN EMBROIDERY COSMETIC BAG', 'BLUE GREEN EMBROIDERY COSMETIC BAG', 'GOLD COSMETIC BAG PINK STAR', 'GIRLS PARTY BAG', 'GOLD COSMETICS BAG WITH BUTTERFLY', 'COSMETIC BAG VINTAGE ROSE PAISLEY', 'RED RETROSPOT SHOPPING BAG', 'VINTAGE BEAD COSMETIC BAG ', 'LARGE STRIPES CHOCOLATE GIFT BAG', 'PARTY FOOD SHOPPER BAG', 'RED RETROSPOT SHOPPER BAG', 'ROSE DU SUD COSMETICS BAG', 'TROPICAL HOLIDAY PURSE ', 'SMALL STRIPES CHOCOLATE GIFT BAG ', 'BOYS PARTY BAG']]
[[0.6450399160385132, 0.6657259464263916, 0.6777951717376709, 0.7088884711265564, 0.7406556606292725, 0.757064163684845, 0.7732295989990234, 0.7738257646560669, 0.7830039858818054, 0.7876776456832886, 0.7895126342773438, 0.7935220003128052, 0.8047601580619812, 0.8083751201629639, 0.8165217638015747]]
[[{'item_id': '47369A'}, {'item_id': '47369B'}, {'item_id': '20861'}, {'item_id': 'DCGSSGIRL'}, {'item_id': '20860'}, {'item_id': '22277'}, {'item_id': '21039'}, {'item_id': '35649'}, {'item_id': '21289'}, {'item_id': '20716'}, {'ite

In [58]:
# Same cosmetic-bag request, narrowed to the 3 nearest matches.
results = collection.query(
    query_texts=["I want a cosmetic bag"],
    include=['documents', 'distances', 'metadatas'],
    n_results=3,
)
# One output line per field: documents, then distances, then metadatas.
print(results['documents'], results['distances'], results['metadatas'], sep='\n')

[['PINK GREEN EMBROIDERY COSMETIC BAG', 'BLUE GREEN EMBROIDERY COSMETIC BAG', 'GOLD COSMETIC BAG PINK STAR']]
[[0.6450399160385132, 0.6657259464263916, 0.6777951717376709]]
[[{'item_id': '47369A'}, {'item_id': '47369B'}, {'item_id': '20861'}]]


In [59]:
import pandas as pd

In [60]:
# Reload the same product CSV as a DataFrame (columns: StockCode, Description, Details).
data1=pd.read_csv('Product code.csv')

In [61]:
# Peek at the first row to confirm the column layout.
data1.head(1)

Unnamed: 0,StockCode,Description,Details
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,The White Hanging Heart T-Light Holder is a ch...


In [62]:
# Plain Python list of the Description column, in row order; this is the input to the embedding step.
data2 = data1["Description"].tolist()


In [63]:
from sentence_transformers import SentenceTransformer

# Load all-mpnet-base-v2 through the sentence-transformers wrapper.
# NOTE(review): `model` is rebound to a transformers AutoModel in the next cell and this
# instance is not used in between — confirm whether this load is still needed.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [64]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of token embeddings along dimension 1.

    model_output: indexable; element 0 holds the token embeddings
        (presumably shaped (batch, seq_len, hidden) — confirm against the model).
    attention_mask: mask tensor; it is given a trailing axis, expanded to the
        token-embedding shape and cast to float, so positions whose mask is 0
        add nothing to either the sum or the count.

    Returns the masked sum divided by the mask total. The divisor is clamped to
    at least 1e-9, so an all-zero mask yields zeros rather than a division by zero.
    """
    token_embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * weights).sum(1)
    denominator = weights.sum(1).clamp(min=1e-9)
    return masked_sum / denominator


# Sentences we want sentence embeddings for: the product descriptions collected in data2.
sentences = data2

# Load the tokenizer and model from the HuggingFace Hub.
# Note: this rebinds `model`, replacing the SentenceTransformer object loaded in the previous cell.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Tokenize all sentences in one call, with padding and truncation enabled, returned as PyTorch tensors.
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings. no_grad: this is inference only, so no gradients are tracked.
# NOTE(review): every sentence goes through the model in a single forward pass; this ran at
# the current size but may need batching for a larger catalogue.
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool the token embeddings into one vector per sentence, taking the attention mask into account.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalize each embedding (p=2 along dim=1).
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# Uncomment to inspect the raw tensor:
#print("Sentence embeddings:")
#print(sentence_embeddings)


In [65]:
# Shape check: the first dimension should equal the number of product descriptions.
sentence_embeddings.shape

torch.Size([3839, 768])

In [66]:
# The row count here must match the number of embeddings above before they are attached as a column.
data1.shape

(3839, 3)

In [67]:
# Attach the embeddings as a new column: one Python list of floats per product row.
data1['Embeddings']=sentence_embeddings.tolist()

In [68]:
# Confirm the new Embeddings column sits alongside the original columns.
data1.head(3)

Unnamed: 0,StockCode,Description,Details,Embeddings
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,The White Hanging Heart T-Light Holder is a ch...,"[0.07632849365472794, -0.09431950002908707, -0..."
1,71053,WHITE METAL LANTERN,A white metal lantern is a decorative accessor...,"[0.05321896821260452, -0.03676013648509979, 0...."
2,84406B,CREAM CUPID HEARTS COAT HANGER,The CREAM CUPID HEARTS COAT HANGER is an adora...,"[0.03787032142281532, -0.055375922471284866, -..."


In [69]:
from google.colab import drive

In [70]:
# Make sure Google Drive is mounted before writing the export; when it already is,
# the call only reports "already mounted" (see the output below).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
# "Colab Notebooks" folder on the mounted Google Drive — the same location the Chroma client persists to.
drive_folder_path = '/content/drive/My Drive/Colab Notebooks/'

In [72]:
# Export the products with their embeddings to the Drive folder defined in the previous cell.
# The DataFrame index is written as well (to_csv default), so the file starts with an unnamed index column.
data1.to_csv(drive_folder_path + "Product_embeddings_Hf.csv")