In [1]:
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

### 1–2. Load a model to finetune (optionally with model card data)

In [2]:
# Base checkpoint that the rest of the notebook fine-tunes and evaluates.
model = SentenceTransformer("BAAI/bge-m3")

### 3. Load a dataset to finetune on

In [3]:
from datasets import load_dataset

# Columns this notebook actually consumes: the two text columns fed to the
# model and the similarity label. The trainer treats every column that is not
# a label/score column as a model input, so any extra CSV column (an id, an
# index, ...) would be passed to the loss as a third text input. Selecting the
# columns explicitly keeps the training inputs independent of the CSV layout.
PAIR_COLUMNS = ["question", "answer_data", "label"]

# Read the three CSV splits into a single DatasetDict keyed by split name.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "dataset/train_dataset.csv",
        "validation": "dataset/validation_dataset.csv",
        "test": "dataset/test_dataset.csv",
    },
)

# One handle per split, restricted to the columns used for training/evaluation.
train_dataset = dataset["train"].select_columns(PAIR_COLUMNS)
eval_dataset = dataset["validation"].select_columns(PAIR_COLUMNS)
test_dataset = dataset["test"].select_columns(PAIR_COLUMNS)

### 4. Define a loss function

In [4]:
# Loss module bound to the model being fine-tuned; it is handed to the trainer later on.
loss = CosineSimilarityLoss(model=model)

### 5. Specify training arguments

In [5]:
args = SentenceTransformerTrainingArguments(
    # Output location for checkpoints (the only required argument).
    output_dir="models/bge-m3",
    # --- Optimisation schedule ---
    num_train_epochs=20,
    warmup_ratio=0.1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- Mixed precision ---
    fp16=True,  # switch to False if the GPU reports that it cannot run FP16
    bf16=False,  # switch to True on GPUs with BF16 support
    # --- Evaluation, checkpointing and logging, all on a 100-step cadence ---
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # NOTE(review): `load_best_model_at_end` is not set, so presumably the
    # in-memory model after training holds the last step's weights rather
    # than the best-scoring checkpoint — confirm this is intended.
    save_total_limit=2,
    logging_steps=100,
    run_name="bge-m3-cosine",  # used as the W&B run name when `wandb` is installed
)

### 6. Create an evaluator & evaluate the base model

In [6]:
# Evaluator over the validation split: embeds each (question, answer_data)
# pair and compares their similarity with the gold `label` column.
dev_evaluator = EmbeddingSimilarityEvaluator(
    name="sts-dev",
    main_similarity=SimilarityFunction.COSINE,
    sentences1=eval_dataset["question"],
    sentences2=eval_dataset["answer_data"],
    scores=eval_dataset["label"],
)

# Run it once on the not-yet-trained model; as the cell's last expression the
# returned metrics are displayed and serve as the baseline.
dev_evaluator(model)

### 7. Create a trainer & train

In [7]:
# Bundle the model, loss, training arguments, data splits and the dev
# evaluator into a single trainer object.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)

# Start fine-tuning; the call's return value is shown as the cell output.
trainer.train()

### 8. Evaluate the trained model on the test set

In [8]:
# Evaluator over the held-out test split, built the same way as the dev
# evaluator: (question, answer_data) pairs scored against the `label` column.
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["question"],
    sentences2=test_dataset["answer_data"],
    scores=test_dataset["label"],
    # Pin the same similarity function as the dev evaluator so that dev and
    # test results are reported on the same primary metric and are directly
    # comparable.
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)

# Evaluate the fine-tuned model; the metrics are displayed as the cell output.
test_evaluator(model)

### 9. Save the trained model

In [9]:
from datetime import datetime

# Timestamp suffix for the output directory so that separate runs get
# distinct directories. Year and month are included (not just day-of-month)
# so names from different months cannot collide and sort chronologically.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Persist the fine-tuned model under ./models/ with the timestamped name.
model.save_pretrained(f"./models/bge-m3_{current_time}")