In [5]:
import os
import time

import pandas as pd
import torch
import tqdm
import numpy as np

from sentence_transformers import SentenceTransformer

In [6]:
# Load the dev queries from a headerless, tab-separated file: the first
# column becomes the index and the remaining column is named "query".
# (Presumably the index holds query ids -- confirm against the data file.)
query_dev = pd.read_csv(
    "./data/top1000/top1000.dev.inputs.tsv",
    sep="\t",
    header=None,
    index_col=0,
    names=["query"],
)

# Prefer the GPU, but fall back to CPU so the notebook still runs under
# "Restart & Run All" on machines without CUDA (a bare `.cuda()` would raise).
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [7]:
def do_embedding(model, series, batch_size=128):
  """Encode every item of `series` in batches and time the whole run.

  Args:
    model: object exposing `encode(list_of_items)` that returns one row per
      item (e.g. a SentenceTransformer).
    series: sliceable collection whose slices provide `.tolist()`
      (e.g. a pandas Series); sliced positionally.
    batch_size: number of items passed to `model.encode` per call; must be
      at least 1.

  Returns:
    A tuple `(embeddings, elapsed_seconds)` where `embeddings` is the
    row-wise stack of all batch outputs. For empty input an empty float32
    array of shape `(0, dim)` is returned, with `dim` taken from
    `model.get_sentence_embedding_dimension()` when the model offers it,
    otherwise 0.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")

  n = len(series)
  # Slice by position explicitly when the container supports it, so an
  # integer-labelled index can never be mistaken for row positions.
  rows = series.iloc if hasattr(series, "iloc") else series

  embeddings = []
  # perf_counter is monotonic, so the measurement is immune to clock changes.
  start = time.perf_counter()
  for a in tqdm.tqdm(range(0, n, batch_size)):
    # Slicing past the end is clamped, so the last batch may be shorter.
    embeddings.append(model.encode(rows[a:a + batch_size].tolist()))
  elapsed = time.perf_counter() - start
  print(f"Time Taken: {elapsed:0.4f} s")

  if not embeddings:
    # np.vstack([]) raises ValueError; return a well-formed empty result.
    dim_fn = getattr(model, "get_sentence_embedding_dimension", None)
    dim = (dim_fn() if callable(dim_fn) else 0) or 0
    return np.empty((0, dim), dtype=np.float32), elapsed
  return np.vstack(embeddings), elapsed

In [8]:
# Embed every dev query and report how long it took.
query_embeddings, query_time = do_embedding(model, query_dev["query"], batch_size=128)

# Sanity check: exactly one embedding row per input query.
assert query_embeddings.shape[0] == len(query_dev), "embedding/query row count mismatch"

print(f"{query_time=}")
print(f"{query_embeddings.shape=}")
print(f"{query_embeddings.dtype=}")

# Persist the embeddings; the row count in the file name is derived from the
# array itself so the name cannot drift from the data it labels.
output = "data/results/baseline"
os.makedirs(output, exist_ok=True)
np.save(
    os.path.join(output, f"query_embeddings_{query_embeddings.shape[0]}.npy"),
    query_embeddings,
)

100%|██████████| 55/55 [00:01<00:00, 48.77it/s]


Time Taken: 1.1299 s
query_time=1.129906415939331
query_embeddings.shape=(6980, 384)
query_embeddings.dtype=dtype('float32')
