In [None]:
from FlagEmbedding import BGEM3FlagModel


model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True,
                       cache_dir="C:/Users/User/Documents/Models"
                    ) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [2]:
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
               "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]

embeddings_1 = model.encode(sentences_1, 
                            batch_size=12, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']
embeddings_1

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


array([[-0.03411696, -0.04707823, -0.00089446, ...,  0.04828526,
         0.00755427, -0.0296166 ],
       [-0.01041745, -0.04479254, -0.02429203, ..., -0.00819301,
         0.01504003,  0.01113799]], dtype=float32)

In [3]:
model.encode("hello")

{'dense_vecs': array([-0.03202485,  0.02325124, -0.04159372, ...,  0.01847981,
        -0.03722384,  0.05963244], dtype=float32),
 'lexical_weights': None,
 'colbert_vecs': None}

In [42]:
embeddings_2 = model.encode(sentences_2,
                            batch_size=1, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']

In [21]:
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.6259035  0.34749585]
 [0.3498679  0.6782462 ]]


### On real chunks

In [57]:
import sys
import os

sys.path.append(os.path.abspath(".."))
import connect_db

from importlib import reload

reload(connect_db)
ConnectDB = connect_db.ConnectDB

db = ConnectDB(db_path='../app_storage/metadata/sqlite-poc.db')
cursor = db.connection.cursor()

texts = cursor.execute("SELECT text FROM chunks limit 10").fetchall()
texts = [text[0] for text in texts]
texts[1]

'Multitask learning (Caruana, 1997) is a promising frame-\nwork for improving general performance. However, mul-\ntitask training in NLP is still nascent. Recent work re-\nports modest performance improvements (Yogatama et al.,\n2019) and the two most ambitious efforts to date have\ntrained on a total of 10 and 17 (dataset, objective)\npairs respectively (McCann et al., 2018) (Bowman et al.,\n2018). From a meta-learning perspective, each (dataset,\nobjective) pair is a single training example sampled\nfrom the distribution of datasets and objectives. Current\nML systems need hundreds to thousands of examples to\ninduce functions which generalize well. This suggests that\nmultitask training many need just as many effective training\npairs to realize its promise with current approaches. It will\nbe very difﬁcult to continue to scale the creation of datasets\nand the design of objectives to the degree that may be re-\nquired to brute force our way there with current techniques. This motiv

In [31]:
import time

start = time.time()
vecs = []


for i, text in enumerate(texts):
    print(i)
    vecs.append(model.encode(text, 
                            batch_size=12, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs'])

print((time.time() - start) / 10) 

0
1
2
3
4
5
6
7
8
9
27.88141210079193


In [58]:
import time

start = time.time()
vecs = model.encode(texts, 
                            batch_size=10, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']

print((time.time() - start)) 

66.49296450614929


In [47]:
import time

start = time.time()
vecs = model.encode(texts, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']

print((time.time() - start) / 10) 

AttributeError: module 'torch' has no attribute 'OutofMemoryError'

In [48]:
### With 64 instead

texts = cursor.execute("SELECT text FROM chunks limit 64").fetchall()
texts = [text[0] for text in texts]
start = time.time()
vecs = model.encode(texts, 
                            batch_size=64, 
                            max_length=2048, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']

print((time.time() - start) / 10) 

70.59645750522614


In [49]:
71 / 6.4

11.09375

In [59]:
split_texts = [text[i:i+100] for text in texts for i in range(0, len(text), 100)]
start = time.time()
vecs = model.encode(split_texts, 
                            batch_size=64, 
                            max_length=512, 
                            return_dense=True,
                            return_colbert_vecs=False,
                            return_sparse=False,
                            convert_to_numpy=True)['dense_vecs']

print((time.time() - start)) 

34.16237664222717
