In [None]:
import gensim
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import chromadb

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dimensionality tag: selects which saved model file is loaded here and is
# reused further down in the inventory collection name.
vector_size = 128
model_path = f"./models/w2v-{vector_size}.model"

# Load the saved Word2Vec model from disk; vectorizeText() reads its word
# vectors through the module-level name `model`.
model = Word2Vec.load(model_path)

In [None]:
def vectorizeText(text):
  """Embed text as the mean of its in-vocabulary Word2Vec token vectors.

  Parameters
  ----------
  text : str, bytes, or iterable of those
    A single document, or a batch of documents. Batch support exists because
    this function is passed to Chroma as an ``embedding_function``.
    NOTE(review): Chroma is expected to call embedding functions with a list
    of documents rather than one string -- confirm against the installed
    chromadb version.

  Returns
  -------
  list
    For a single document: a flat list of ``model.wv.vector_size`` numbers.
    For a batch: a list with one such list per document, in input order.
    A document with no in-vocabulary tokens yields an all-zero vector.
  """
  if not isinstance(text, (str, bytes)):
    # Batch input: embed each document independently.
    return [vectorizeText(document) for document in text]

  tokens = gensim.utils.simple_preprocess(text)
  token_vectors = [model.wv[token] for token in tokens if token in model.wv]

  if not token_vectors:
    # np.mean([]) would produce a 0-d NaN, which list() cannot iterate
    # (TypeError). Fall back to a zero vector of the model's dimensionality.
    return list(np.zeros(model.wv.vector_size, dtype=np.float32))

  return list(np.mean(token_vectors, axis=0))

In [None]:
# Chroma client backed by the ./client directory.
client = chromadb.PersistentClient(path="./client")

# Inventory table, plus the distinct values of its categoryName column;
# the later cells iterate over `categories`.
df = pd.read_csv('./data/inventory.csv')
categories = df.loc[:, 'categoryName'].unique()

# (evaluate `categories` on its own line here to inspect the values)

In [None]:
# Collection holding one document per category name, embedded through
# vectorizeText.
collection = client.get_or_create_collection(
  'categories',
  embedding_function=vectorizeText
)

# Ids are positional: cat0, cat1, ... in the order `categories` lists them.
collection.add(
  documents=[*categories],
  ids=[f'cat{position}' for position, _ in enumerate(categories)]
)

In [None]:
def addDocuments(collection, documents, metadatas, ids, chunk_size=40000):
  """Add documents to a collection in fixed-size batches.

  Parameters
  ----------
  collection
    Object exposing ``add(documents=..., metadatas=..., ids=...)``.
  documents, ids : sequence
    Parallel, sliceable sequences; ``len(ids)`` determines how many batches
    are sent.
  metadatas : sequence or None
    Parallel to ``documents``. ``None`` is forwarded as ``None`` for every
    batch.
  chunk_size : int, default 40000
    Maximum number of items per ``collection.add`` call.
    NOTE(review): the default is presumably sized to stay under a backend
    batch limit -- confirm.

  Raises
  ------
  ValueError
    If ``chunk_size`` is not positive.
  """
  if chunk_size < 1:
    # A negative step would make range() empty and silently add nothing.
    raise ValueError('chunk_size must be a positive integer')

  for start in range(0, len(ids), chunk_size):
    stop = start + chunk_size
    collection.add(
      documents=documents[start:stop],
      metadatas=None if metadatas is None else metadatas[start:stop],
      ids=ids[start:stop]
    )

In [None]:
# Every category is written to the same collection, so resolve it once
# instead of on each loop iteration.
collection = client.get_or_create_collection(
  f'inventory_emb-{vector_size}',
  embedding_function=vectorizeText
)

for category in categories:
  mask = df['categoryName'] == category

  # drop() returns a new frame, so the pop() below never touches `df` and no
  # in-place edit is made on a filtered slice.
  filtered = df[mask].drop(columns=['categoryName'])

  # The title is the document text; all remaining columns become metadata.
  documents = list(filtered.pop('title'))

  # to_dict('records') builds one dict per row while keeping each column's
  # own dtype; iterrows() would first coerce every row to a single dtype.
  metadatas = filtered.to_dict('records')

  # Ids come from the original row index of `df`, so they are unique across
  # categories as long as that index is unique.
  ids = [f'doc{i}' for i in filtered.index]

  addDocuments(collection, documents, metadatas, ids)
  