In [None]:
import gensim
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from chromadb import PersistentClient
import re

In [None]:
import warnings
# Blanket filter: silences every warning category for the rest of the session.
# NOTE(review): this also hides numpy's warning for np.mean over an empty list,
# which the embedding helpers below can trigger when a text has no known token.
warnings.filterwarnings("ignore")

In [None]:
# Embedding width; it selects which saved model file is loaded and is also
# read by later cells (vector size check, per-category collection names).
vector_size = 128
model_path = f'./models/w2v-{vector_size}.model'

# Load the previously saved Word2Vec model from disk.
# assumes the saved model was trained with this vector_size — TODO confirm
model = Word2Vec.load(model_path)

In [None]:
def processString(text):
  """Reduce `text` to its ASCII letters and digits, capped at 48 characters.

  Every character outside A-Z, a-z and 0-9 (spaces, punctuation, accented
  letters, ...) is dropped; the first 48 remaining characters are returned.
  """
  kept_characters = re.findall(r'[A-Za-z0-9]', text)
  return ''.join(kept_characters)[:48]

In [None]:
def generateVectors(docs):
  """Embed each document as the mean of its in-vocabulary Word2Vec token vectors.

  Args:
    docs: Iterable of strings to embed.

  Returns:
    A list holding exactly one embedding per input document, in input order.
    Each embedding is a list of float64 values whose length is the model's
    vector size. A document with no in-vocabulary token gets an all-zero
    vector, so the result always stays aligned with `docs`.
  """
  embeddings = []

  for doc in docs:
    tokens = gensim.utils.simple_preprocess(doc)
    token_vectors = [model.wv[token] for token in tokens if token in model.wv]

    if token_vectors:
      text_vector = np.mean(token_vectors, axis=0)
    else:
      # np.mean over an empty list gives a scalar NaN, and list() of the
      # resulting 0-d array raises TypeError. Skipping the document instead
      # would misalign embeddings with the documents/ids built by the caller,
      # so fall back to a zero vector of the right width.
      text_vector = np.zeros(model.wv.vector_size)

    text_vector = list(np.array(text_vector, dtype=np.float64))
    embeddings.append(text_vector)

  return embeddings

In [None]:
# Chroma client backed by the ./client directory.
client = PersistentClient('./client')

# Inventory table; the cells below read its 'categoryName' and 'title' columns.
df = pd.read_csv("./data/inventory.csv")
# Distinct category names, in order of first appearance.
# NOTE(review): unique() keeps NaN if any row has no category, and the helpers
# below expect strings — confirm the column has no missing values.
categories = df['categoryName'].unique()

# categories  (uncomment to inspect the category list)

In [None]:
# Index of categories: each raw category name is stored as a document, and its
# sanitised form (the prefix used for the per-category collection name) is
# kept alongside it as metadata.
collection = client.get_or_create_collection('categories')

documents = list(categories)
ids = ['cat' + str(position) for position in range(len(documents))]
embeddings = generateVectors(documents)
metadatas = [{ 'collectionName': safe_name } for safe_name in map(processString, documents)]

collection.add(
  ids=ids,
  documents=documents,
  metadatas=metadatas,
  embeddings=embeddings
)

In [30]:
def createEmbeddings(df):
  """Turn the rows of `df` into parallel lists ready for a collection insert.

  For every row the 'title' value becomes the document, the remaining columns
  become its metadata, and the id is 'item' followed by the row's index label.
  The embedding is the mean of the Word2Vec vectors of the title's
  in-vocabulary tokens. Rows whose averaged vector does not have exactly
  `vector_size` elements (e.g. no token is in the vocabulary) are left out of
  all four lists.

  Returns:
    (documents, embeddings, metadatas, ids) — four lists of equal length.
  """
  documents, embeddings, metadatas, ids = [], [], [], []

  for row_label, row in df.iterrows():
    fields = row.to_dict()
    title = fields.pop('title')
    words = gensim.utils.simple_preprocess(title)

    averaged = np.mean([model.wv[word] for word in words if word in model.wv], axis=0)

    # An empty vector list averages to a single NaN (size 1), which fails this
    # size check; such rows are skipped.
    if averaged.size != vector_size:
      continue

    documents.append(title)
    embeddings.append(list(np.array(averaged, dtype=np.float64)))
    metadatas.append(fields)
    ids.append('item' + str(row_label))

  return documents, embeddings, metadatas, ids

In [17]:
def addDocuments(collection, documents, embeddings, metadatas, ids):
  """Add one batch of records to `collection`.

  Args:
    collection: Object exposing `add(documents=, embeddings=, metadatas=, ids=)`.
    documents: Document strings.
    embeddings: One embedding per document.
    metadatas: One metadata dict per document.
    ids: One id per document.

  An empty batch is skipped without calling `collection.add`: createEmbeddings
  can return empty lists when every row of a chunk is dropped, and Chroma
  rejects an add with no ids.
  """
  if len(ids) == 0:
    return

  collection.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
  )

In [18]:
def transformDocuments(collection, df, chunk_size=40000):
  """Embed `df` chunk by chunk and add every chunk to `collection`.

  Args:
    collection: Target collection, passed through to addDocuments.
    df: DataFrame in the shape createEmbeddings expects.
    chunk_size: Maximum number of rows embedded and added per call. Defaults
      to 40000, the value that used to be hard-coded here.
      NOTE(review): presumably chosen to stay under the store's per-call
      batch limit — confirm before raising it.

  Raises:
    ValueError: If `chunk_size` is not positive.
  """
  if chunk_size <= 0:
    # A negative step would make range() empty and silently load nothing.
    raise ValueError(f'chunk_size must be positive, got {chunk_size}')

  for start in range(0, len(df), chunk_size):
    # .iloc makes the positional (not label-based) row slicing explicit.
    chunk = df.iloc[start:start + chunk_size]
    documents, embeddings, metadatas, ids = createEmbeddings(chunk)

    addDocuments(collection, documents, embeddings, metadatas, ids)

In [31]:
# Build one collection per category, named '<sanitised category>-<vector_size>',
# and fill it with the embeddings of that category's rows.
# NOTE(review): two categories that sanitise to the same string share one
# collection — confirm category names stay distinct after processString.
for category in categories:
  mask = df['categoryName'] == category

  # Non-inplace drop: mutating a boolean-mask selection in place is what
  # triggers pandas' SettingWithCopyWarning, and the column is constant within
  # the selection, so it is removed from the per-row metadata.
  filtered = df[mask].drop(columns=['categoryName'])

  # The loop variable keeps the raw category name; only the collection name
  # uses the sanitised form.
  collection = client.get_or_create_collection(f'{processString(category)}-{vector_size}')

  transformDocuments(collection, filtered)