In [1]:
from sentence_transformers import SentenceTransformer
import time
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the saved embedding dumps for both models (one JSON file per model).
MINILM_JSON = 'output/embeddings_all-MiniLM-L6-v2.json'
DISTILBERT_JSON = 'output/embeddings_distilbert-base-nli-stsb-mean-tokens.json'

# Each file is opened, parsed, and closed before the next one is touched.
with open(MINILM_JSON, mode='r', encoding='utf-8') as f:
    data_minilm = json.load(f)

with open(DISTILBERT_JSON, mode='r', encoding='utf-8') as f:
    data_distilbert = json.load(f)

In [3]:
# Convert the "embeddings" entry of each loaded payload into a NumPy array,
# MiniLM first and DistilBERT second.
e_minilm, e_distilbert = (
    np.array(payload["embeddings"])
    for payload in (data_minilm, data_distilbert)
)

# Report the shape of each array so the embedding sizes can be compared.
print(f"e_minilm shape: {e_minilm.shape}")
print(f"distilbert shape: {e_distilbert.shape}")

e_minilm shape: (15, 384)
distilbert shape: (15, 768)


In [None]:
# Benchmark: how long does each model take to encode the same batch of sentences?
models = {
    "MiniLM": SentenceTransformer('all-MiniLM-L6-v2'),
    "DistilBERT": SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
}

# 1000 copies of one short sentence: every model receives identical input.
sample = ["This is a sample sentence."] * 1000

for name, model in models.items():
    # Untimed warm-up call so that any one-off cost of the first encode()
    # is not attributed to the measured run.
    # NOTE(review): whether a first-call cost exists depends on the backend/device — confirm.
    model.encode(sample[:1], show_progress_bar=False)

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    embeddings = model.encode(sample, show_progress_bar=True)
    elapsed_time = time.perf_counter() - start_time
    print(f"{name} took {elapsed_time:.4f} seconds to generate embeddings.")


Batches: 100%|██████████| 32/32 [00:09<00:00,  3.52it/s]


MiniLM took 9.1424 seconds to generate embeddings.


Batches: 100%|██████████| 32/32 [00:09<00:00,  3.44it/s]

DistilBERT took 9.3550 seconds to generate embeddings.





In [5]:
# In-memory size of each embedding array, taken from ndarray.nbytes and
# expressed in megabytes (decimal: 1 MB = 1e6 bytes).
BYTES_PER_MB = 1e6

minilm_memory = e_minilm.nbytes / BYTES_PER_MB
distilbert_memory = e_distilbert.nbytes / BYTES_PER_MB

print(f"MiniLM Embedding Memory Usage: {minilm_memory:.2f} MB")
print(f"DistilBERT Embedding Memory Usage: {distilbert_memory:.2f} MB")

MiniLM Embedding Memory Usage: 0.05 MB
DistilBERT Embedding Memory Usage: 0.09 MB


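Summary:
1. DistilBERT embeddings have twice the dimensionality of MiniLM embeddings (768 vs. 384).
2. MiniLM was faster in this benchmark, but only marginally (9.14 s vs. 9.36 s to encode 1,000 sentences); its smaller dimensionality is a plausible contributor, though this single run does not isolate the cause.
3. MiniLM embeddings consume about half the memory (0.05 MB vs. 0.09 MB for the 15 stored embeddings), making them more suitable for larger datasets.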