In [10]:
import pandas as pd
import tiktoken
import os
import numpy as np
import torch
from sentence_transformers.util import semantic_search

from dotenv import load_dotenv
load_dotenv()

from utils.embeddings_utils import get_embedding


In [11]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 1000  # the maximum for text-embedding-3-small is 8191

In [12]:
input_datapath = "data/fine_food_reviews_1k.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...


In [13]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]
len(df)


1000

In [14]:
a = get_embedding("Decaf coffee", model=embedding_model)

In [15]:

len(a)

1536

In [16]:
csv_file = 'data/fine_food_reviews_with_embeddings_1k.csv'

# Define the data types for each column
dtypes = {
    "index":int,
    "ProductId": str,
    "UserId": str,
    "Score": int,
    "Summary": str,
    "Combined": str,
    "n_tokens": int,
    "embedding": object # Assuming embedding is stored as a string
}

# Load the CSV file with specified data types
data = pd.read_csv(csv_file, dtype=dtypes)

# Convert string representations of vectors to numeric arrays
data_embaddings = data['embedding'].apply(lambda x: np.array(eval(x),dtype=np.float64), 0)
test =data_embaddings.to_numpy()
test=np.vstack(test).astype(np.float64)
test.shape

  data_embaddings = data['embedding'].apply(lambda x: np.array(eval(x),dtype=np.float64), 0)


(1000, 1536)

In [17]:
query_embedding = torch.FloatTensor(a)
query_embeddings = torch.from_numpy(test).to(torch.float)

In [18]:
hits = semantic_search(query_embedding, query_embeddings, top_k=1)
hits

[[{'corpus_id': 24, 'score': 0.6052955389022827}]]