In [1]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from transformers import EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained base model that will be fine-tuned in the cells below.
# NOTE(review): trust_remote_code=True presumably lets the checkpoint's own
# modeling code run locally (the load log refers to a `NewModel` class) —
# confirm the source is trusted before enabling it.
model = SentenceTransformer(
    "Alibaba-NLP/gte-multilingual-base",
    trust_remote_code=True,
)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Fetch the dataset and keep only the first 200,000 rows of its "train" split.
dataset = load_dataset("Dqdung205/medical_vietnamese_datasets", "default")
train_dataset = dataset["train"].select(range(200_000))

# Two-stage split with a fixed seed: hold out 20 % of the rows, then cut the
# held-out part in half, giving 80 % train / 10 % eval / 10 % test.
train_test_split = train_dataset.train_test_split(test_size=0.2, seed=42)
eval_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# NOTE(review): `test_dataset` is not referenced by any other visible cell.
train_dataset, eval_dataset, test_dataset = (
    train_test_split["train"],
    eval_test_split["train"],
    eval_test_split["test"],
)

In [5]:
# Training objective bound to the model being fine-tuned; it is handed to the
# trainer through its `loss=` argument.
loss = MultipleNegativesRankingLoss(model)

In [None]:
# Training configuration, grouped by concern.
# NOTE(review): the recorded training log evaluates every 10 steps and later
# cells load `checkpoint-180`, which does not match eval_steps=500 /
# save_steps=1000 below — this cell looks edited after the last run; confirm
# the intended values before re-running.
args = SentenceTransformerTrainingArguments(
    # --- bookkeeping ---
    output_dir="medical_vietnamese_embedding",
    run_name="vietnamese-medical_embedding",
    logging_steps=100,
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.05,
    # --- batching ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # --- numeric precision: bf16 on, fp16 off ---
    bf16=True,
    fp16=False,
    # --- periodic evaluation ---
    eval_strategy="steps",
    eval_steps=500,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
    # --- best-model selection: lower eval_loss is better ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [7]:
# Assemble the trainer from the model, objective, configuration and data splits.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # NOTE(review): patience=3 presumably means "stop after 3 consecutive
    # evaluations without improvement of the monitored metric" — confirm.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

                                                                     

In [8]:
# Run fine-tuning with the trainer built above; the call returns a TrainOutput
# summary, which the notebook displays as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
10,0.7368,0.747604
20,0.6287,0.589626
30,0.536,0.437063
40,0.3672,0.365455
50,0.3008,0.330246
60,0.2722,0.317412
70,0.2619,0.314767
80,0.2766,0.312743
90,0.2568,0.301095
100,0.2546,0.307617


TrainOutput(global_step=210, training_loss=0.29854955843516756, metrics={'train_runtime': 569.6147, 'train_samples_per_second': 280.892, 'train_steps_per_second': 4.389, 'total_flos': 0.0, 'train_loss': 0.29854955843516756, 'epoch': 0.084})

In [16]:
from sentence_transformers import SentenceTransformer

# Reload the fine-tuned weights from a checkpoint directory inside the training output_dir.
# NOTE(review): the checkpoint step (180) is hard-coded; the directory only exists
# if the training run actually saved at that step — confirm against save_steps.
best_model = SentenceTransformer(
    "medical_vietnamese_embedding/checkpoint-180",
    trust_remote_code=True
)


In [17]:
!pip install -U -q sentence-transformers git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [25]:
# Models to load: a local fine-tuned checkpoint followed by Hub model ids.
# NOTE(review): this list points at checkpoint-210 while other cells use
# checkpoint-180 — confirm which checkpoint is intended.
model_ids = [
    "medical_vietnamese_embedding/checkpoint-210",
    "google/embeddinggemma-300M",
    "dangvantuan/vietnamese-document-embedding",
    "AITeamVN/Vietnamese_Embedding",
    "keepitreal/vietnamese-sbert",
    "Alibaba-NLP/gte-multilingual-base",
]

In [27]:
import torch

# Choose the device in this cell so it does not depend on a `device` variable
# that is only created by a later cell (which breaks Restart & Run All).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load each model listed in model_ids and move it to the selected device.
models = []
for model_id in model_ids:
    print(f"Load model {model_id}....")
    # Append directly instead of binding a loop variable named `model`, so the
    # fine-tuned `model` object from the training cells is not overwritten.
    models.append(SentenceTransformer(model_id, trust_remote_code=True).to(device))

Load model medical_vietnamese_embedding/checkpoint-210....
Load model google/embeddinggemma-300M....
Load model dangvantuan/vietnamese-document-embedding....
Load model AITeamVN/Vietnamese_Embedding....
Load model keepitreal/vietnamese-sbert....
Load model Alibaba-NLP/gte-multilingual-base....


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
from sentence_transformers import SentenceTransformer, util
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Models to compare: the local fine-tuned checkpoint (under the training
# output_dir) followed by the Hub models it is benchmarked against.
model_ids = [
    "medical_vietnamese_embedding/checkpoint-180",
    "google/embeddinggemma-300M",
    "dangvantuan/vietnamese-document-embedding",
    "AITeamVN/Vietnamese_Embedding",
    "keepitreal/vietnamese-sbert",
    "Alibaba-NLP/gte-multilingual-base"
]

# Load every model in model_ids and move it to the selected device.
models = []
for model_id in model_ids:
    print(f"Load model {model_id}....")
    # Append directly instead of binding a loop variable named `model`, so the
    # fine-tuned `model` object from the training cells is not overwritten.
    models.append(SentenceTransformer(model_id, trust_remote_code=True).to(device))

Load model medical_vietnamese_embedding/checkpoint-180....
Load model google/embeddinggemma-300M....
Load model dangvantuan/vietnamese-document-embedding....
Load model AITeamVN/Vietnamese_Embedding....
Load model keepitreal/vietnamese-sbert....
Load model Alibaba-NLP/gte-multilingual-base....


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# Small evaluation set of (query, correct_answer) pairs: the first 1000 rows
# of the ViHealthQA test split.
dataset = load_dataset("tarudesu/ViHealthQA", split="test[:1000]")

# Collect the queries and their answers in a single pass over the rows.
queries, answers = [], []
for item in dataset:
    queries.append(item["question"])
    answers.append(item["answer"])

# The retrieval corpus is the full list of answers (same list object as `answers`).
corpus = answers

def compute_mrr_at_k(model, queries, answers, corpus, k):
    """Compute the Mean Reciprocal Rank at cutoff ``k`` for a retrieval model.

    The relevant document for ``queries[i]`` is taken to be ``corpus[i]``;
    every other corpus entry is treated as a distractor. Similarity is cosine
    similarity between the encoded query and the encoded corpus entries.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)`` that
            returns a 2-D tensor with one embedding row per input text. The
            embeddings are used on whatever device ``encode`` returns them on,
            so the function does not depend on any notebook-level variable.
        queries: Sequence of query strings.
        answers: Gold answers aligned with ``queries``. Kept for interface
            compatibility only; relevance is decided by position in ``corpus``.
        corpus: Sequence of candidate documents, with ``corpus[i]`` being the
            relevant document of ``queries[i]``.
        k: Rank cutoff. A query whose relevant document is ranked below ``k``
            contributes 0 to the mean.

    Returns:
        float: Mean over all queries of ``1 / rank`` (or 0 when ``rank > k``).
        Returns ``0.0`` when ``queries`` is empty.

    Raises:
        ValueError: If ``corpus`` has fewer entries than ``queries``, because
            then some query has no aligned relevant document.
    """
    num_queries = len(queries)
    if num_queries == 0:
        # Nothing to score; avoid a division by zero.
        return 0.0
    if len(corpus) < num_queries:
        raise ValueError(
            f"corpus has {len(corpus)} entries but there are {num_queries} queries; "
            "corpus[i] must be the relevant document for queries[i]"
        )

    query_emb = model.encode(queries, convert_to_tensor=True)
    corpus_emb = model.encode(corpus, convert_to_tensor=True)

    # Cosine similarity of every query against the whole corpus: L2-normalise
    # the rows, then take dot products. The result has one row per query and
    # one column per corpus entry, so memory grows with len(queries) * len(corpus).
    query_emb = torch.nn.functional.normalize(query_emb, p=2, dim=1)
    corpus_emb = torch.nn.functional.normalize(corpus_emb, p=2, dim=1)
    scores = query_emb @ corpus_emb.T

    # Corpus indices sorted by decreasing score, per query.
    rankings = torch.argsort(scores, dim=1, descending=True)

    # Position of the relevant document (index i for query i) in each ranking.
    # Each row is a permutation of the corpus indices, so there is exactly one
    # match per row and `nonzero` reports the matches in row order.
    gold = torch.arange(num_queries, device=rankings.device).unsqueeze(1)
    ranks = (rankings == gold).nonzero(as_tuple=True)[1].cpu() + 1  # 0-based index -> 1-based rank

    # Reciprocal rank inside the cutoff, 0 outside it; float64 on CPU so the
    # mean matches plain Python float arithmetic.
    reciprocal_ranks = (1.0 / ranks.to(torch.float64)) * (ranks <= k)
    return reciprocal_ranks.mean().item()

# Compare the models: score each one at cutoffs 3, 5 and 10, then print a short report.
# Each compute_mrr_at_k call encodes the queries and the corpus again.
for i, model in enumerate(models):
    mrr3, mrr5, mrr10 = (
        compute_mrr_at_k(model, queries, answers, corpus, k=cutoff)
        for cutoff in (3, 5, 10)
    )
    report = [
        f"📊 Model: {model_ids[i]}",
        f"   - MRR@3  = {mrr3*100:.2f}%",
        f"   - MRR@5  = {mrr5*100:.2f}%",
        f"   - MRR@10 = {mrr10*100:.2f}%",
    ]
    print("\n".join(report))


📊 Model: medical_vietnamese_embedding/checkpoint-180
   - MRR@3  = 70.15%
   - MRR@5  = 71.27%
   - MRR@10 = 71.91%
📊 Model: google/embeddinggemma-300M
   - MRR@3  = 65.62%
   - MRR@5  = 66.78%
   - MRR@10 = 67.60%
📊 Model: dangvantuan/vietnamese-document-embedding
   - MRR@3  = 66.57%
   - MRR@5  = 67.67%
   - MRR@10 = 68.36%
📊 Model: AITeamVN/Vietnamese_Embedding
   - MRR@3  = 65.28%
   - MRR@5  = 66.39%
   - MRR@10 = 67.04%
📊 Model: keepitreal/vietnamese-sbert
   - MRR@3  = 40.42%
   - MRR@5  = 41.88%
   - MRR@10 = 43.07%
📊 Model: Alibaba-NLP/gte-multilingual-base
   - MRR@3  = 67.35%
   - MRR@5  = 68.43%
   - MRR@10 = 69.14%
