# STS 微调实战 — CosineSimilarityLoss

官方示例：https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/training/sts/training_stsbenchmark.py

使用 STS Benchmark 数据集微调模型，每条样本的格式为 (sentence1, sentence2, score)，其中 score 是两句话的相似度分数。

In [None]:
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [None]:
# 1. Load the base model
# The checkpoint name is passed to SentenceTransformer; printing the model
# shows the modules it was assembled from.
model = SentenceTransformer(model_name_or_path="distilbert-base-uncased")
print(model)

In [None]:
# 2. Load the STS Benchmark dataset
# The three splits are fetched in the order train -> validation -> test and
# unpacked straight into their respective names.
train_dataset, eval_dataset, test_dataset = (
    load_dataset("sentence-transformers/stsb", split=split_name)
    for split_name in ("train", "validation", "test")
)

# Report the size of every split (one line each), then show one raw training
# sample so the column layout is visible.
print(
    f"训练集: {len(train_dataset)} 条",
    f"验证集: {len(eval_dataset)} 条",
    f"测试集: {len(test_dataset)} 条",
    sep="\n",
)
print("样本:", train_dataset[0])

In [None]:
# 3. Loss function: CosineSimilarityLoss suits data in the
#    (sent1, sent2, score) format. It is bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# 4. Evaluator
# Built from the validation split's sentence pairs and their scores, with
# cosine similarity selected as the main similarity function.
dev_evaluator = EmbeddingSimilarityEvaluator(
    eval_dataset["sentence1"],
    eval_dataset["sentence2"],
    eval_dataset["score"],
    name="sts-dev",
    main_similarity=SimilarityFunction.COSINE,
)

# Run the evaluator once before training to obtain a baseline. The call is
# the last expression of the cell, so the notebook displays its result.
print("训练前评估:")
dev_evaluator(model)

In [None]:
# 5. Training arguments
# Evaluation and checkpointing both happen every 100 steps, and at most two
# checkpoints are kept in the output directory.
args = SentenceTransformerTrainingArguments(
    output_dir="output/sts-distilbert",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    # fp16 mixed precision needs a CUDA device; a hard-coded True makes the
    # notebook fail on CPU-only machines, so enable it only when CUDA is
    # actually available.
    fp16=torch.cuda.is_available(),
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

In [None]:
# 6. Create the Trainer and train
# Wires together the model, the loss, the training arguments, the train and
# validation splits, and the dev evaluator defined in the earlier cells.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=train_loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)

# Start fine-tuning; as the last expression of the cell, the return value of
# train() is displayed by the notebook.
trainer.train()

In [None]:
# 7. 测试集评估
# Built from the test split's sentence pairs and their scores, with cosine
# similarity as the main similarity function.
test_evaluator = EmbeddingSimilarityEvaluator(
    test_dataset["sentence1"],
    test_dataset["sentence2"],
    test_dataset["score"],
    name="sts-test",
    main_similarity=SimilarityFunction.COSINE,
)

# Score the model on the test split; the result is displayed as the cell's
# last expression.
test_evaluator(model)

In [None]:
# 8. Save the model
# Writes the model to a "final" subdirectory under the training output path.
model.save(path="output/sts-distilbert/final")