In [1]:
import numpy as np

def consine_sim(matrix_a, matrix_b):
    """Return the cosine similarity of two vectors.

    Computes ``dot(a, b) / (||a|| * ||b||)`` with NumPy.

    Args:
        matrix_a: First vector (any array-like accepted by ``np.dot``).
        matrix_b: Second vector, compatible with ``matrix_a`` for ``np.dot``.

    Returns:
        For 1-D inputs, the similarity as a float. When either input has a
        zero norm the ratio is undefined, so ``0.0`` is returned instead.
    """
    # The dot product is computed first so incompatible shapes still raise,
    # even when one of the inputs is all zeros.
    dot_product = np.dot(matrix_a, matrix_b)
    norm_a = np.linalg.norm(matrix_a)
    norm_b = np.linalg.norm(matrix_b)
    if not norm_a or not norm_b:
        # Return a float (not the int 0) so both paths yield the same type.
        return 0.0
    return dot_product / (norm_a * norm_b)

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model setup: run on CPU and request embeddings with
# normalize_embeddings turned off (consine_sim divides by the norms itself).
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
hf_embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-m3", model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  hf_embedding = HuggingFaceEmbeddings(


In [9]:
def hf_compare(model: HuggingFaceEmbeddings, left_word: str, right_word: str):
    """Embed two texts with ``model`` and print their cosine similarity.

    Args:
        model: Embedding model; its ``embed_documents`` is called once with
            both texts.
        left_word: First text to compare.
        right_word: Second text to compare.

    Returns:
        None. The similarity is only printed.
    """
    # Both texts go through a single embed_documents call.
    embeddings = model.embed_documents([left_word, right_word])
    left_vec, right_vec = embeddings[0], embeddings[1]
    consine = consine_sim(left_vec, right_vec)
    print(f"Similarity between [{left_word}] and [{right_word}] = [{consine}]")

In [11]:
# Cross-lingual check: English "dog" against the Thai word for dog.
hf_compare(hf_embedding, "dog", "หมา")

Similarity between [dog] and [หมา] = [0.8699827953438755]


In [12]:
# Question about a dog against a passage that describes a dog.
hf_compare(hf_embedding, "What the dog saw?", "the dog is on the brige, looking at the river")

Similarity between [What the dog saw?] and [the dog is on the brige, looking at the river] = [0.7056555478007607]


In [10]:
# Same question, but the passage is about a cat instead of a dog
# (recorded similarity is lower: ~0.586 vs ~0.706 for the dog passage).
hf_compare(hf_embedding, "What the dog saw?", "the cat is on the brige, looking at the river")

Similarity between [What the dog saw?] and [the cat is on the brige, looking at the river] = [0.58584010338901]


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the reranker checkpoint and its tokenizer, then put the model in
# evaluation mode (nn.Module.eval() returns the module, so it can be chained).
reranker_model_name = 'BAAI/bge-reranker-v2-m3'
tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name).eval()

def hf_rerank_compare(left_word: str, right_word: str, max_length: int = 8192):
    """Print the reranker's score for a pair of texts.

    The two texts are tokenized together as one pair and passed to the
    module-level ``model``; the resulting logit is printed as-is (no
    sigmoid is applied, so the value can be negative).

    Args:
        left_word: First text of the pair (the query in the notebook's
            examples).
        right_word: Second text of the pair (the candidate passage).
        max_length: Token limit used when truncating the encoded pair.
            Defaults to 8192 instead of the previously hard-coded 8194.
            NOTE(review): 8194 looks like the position-embedding table
            size of the checkpoint, which for XLM-RoBERTa-style models
            includes two reserved offsets, leaving 8192 usable tokens —
            confirm against the model config.

    Returns:
        None. The score is only printed.
    """
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Named `encoded` rather than `input` to avoid shadowing the builtin.
        encoded = tokenizer(
            [[left_word, right_word]],
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=max_length,
        )
        # A single pair yields a single logit; flatten so .item() works.
        scores = model(**encoded, return_dict=True).logits.view(-1).float()
    print(f"ReRank similarity between [{left_word}] and [{right_word}] = {scores.item()}")

In [18]:
# Reranker on the English "dog" / Thai "dog" word pair.
hf_rerank_compare("dog", "หมา")

ReRank similarity between [dog] and [หมา] = 4.895456314086914


In [19]:
# Question about a dog against a passage describing a dog; the printed value
# is a raw logit, so a negative number is possible.
hf_rerank_compare("What the dog saw?", "the dog is on the brige, looking at the river")

ReRank similarity between [What the dog saw?] and [the dog is on the brige, looking at the river] = -2.162883996963501


In [25]:
# Variant of the dog passage with "it was" added before "looking"
# (recorded score: ~-0.74, versus ~-2.16 without it).
hf_rerank_compare("What the dog saw?", "the dog is on the brige, it was looking at the river")

ReRank similarity between [What the dog saw?] and [the dog is on the brige, it was looking at the river] = -0.7420030832290649


In [20]:
# Same question, but the passage is about a cat (recorded score: ~-9.53).
hf_rerank_compare("What the dog saw?", "the cat is on the brige, looking at the river")

ReRank similarity between [What the dog saw?] and [the cat is on the brige, looking at the river] = -9.531991958618164


In [21]:
# Passage mentions the dog but describes its surroundings (walls), not a view.
hf_rerank_compare("What the dog saw?", "the dog was surrounded by 4 sides of wall")

ReRank similarity between [What the dog saw?] and [the dog was surrounded by 4 sides of wall] = -3.717414140701294


In [22]:
# Definition-style question against a long encyclopedic passage.
hf_rerank_compare('what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.')

ReRank similarity between [what is panda?] and [The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.] = 5.265038967132568


In [23]:
# Same question against a very short answer.
hf_rerank_compare('what is panda?', 'Panda is an animal')

ReRank similarity between [what is panda?] and [Panda is an animal] = 5.520259857177734


In [24]:
# Same short answer with different capitalization
# (recorded score barely changes: ~5.46 vs ~5.52).
hf_rerank_compare('what is panda?', 'PANDA is an ANIMAL')

ReRank similarity between [what is panda?] and [PANDA is an ANIMAL] = 5.4594407081604


In [26]:
# Thai question ("Do you speak Thai?") against a Thai negative answer
# ("I don't speak Thai").
hf_rerank_compare("คุณพูดภาษาไทยไหม?", "ฉันไม่พูดภาษาไทย")

ReRank similarity between [คุณพูดภาษาไทยไหม?] and [ฉันไม่พูดภาษาไทย] = -1.1652841567993164


In [30]:
# Same Thai question against an English negative answer.
hf_rerank_compare("คุณพูดภาษาไทยไหม?", "I can not speak Thai")

ReRank similarity between [คุณพูดภาษาไทยไหม?] and [I can not speak Thai] = -1.0544286966323853


In [29]:
# Same Thai question against a Thai affirmative answer ("I speak Thai").
hf_rerank_compare("คุณพูดภาษาไทยไหม?", "ฉันพูดภาษาไทย")

ReRank similarity between [คุณพูดภาษาไทยไหม?] and [ฉันพูดภาษาไทย] = 2.0975513458251953


In [28]:
# Thai question ("Does Sombat speak Thai?") against a passage that answers it
# indirectly by listing the three languages Sombat can speak.
hf_rerank_compare("สมบัติพูดภาษาไทยไหม?", "สมบัติสามารถพูดได้ 3 ภาษา คือ ไทย อังกฤษ และ ญี่ปุ่น")

ReRank similarity between [สมบัติพูดภาษาไทยไหม?] and [สมบัติสามารถพูดได้ 3 ภาษา คือ ไทย อังกฤษ และ ญี่ปุ่น] = 5.681005477905273
