In [4]:
import math
import numpy as np
from scipy.stats import spearmanr
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from datasets import load_dataset

# 1. Fetch STS-B. `stsb_multi_mt` ships the benchmark in several languages,
#    so the English configuration ('en') is requested explicitly.
sts = load_dataset('stsb_multi_mt', 'en')

# Keep handles on the two splits used below: one to train on, one to score on.
train_data, test_data = sts['train'], sts['test']

def scale_score(score, min_val=0.0, max_val=5.0):
    """Linearly rescale ``score`` from ``[min_val, max_val]`` to ``[0, 1]``.

    STS-B similarity scores range from 0 to 5; they are mapped to [0, 1]
    so they can serve as targets for CosineSimilarityLoss.

    Args:
        score: Raw score to rescale.
        min_val: Value that maps to 0.
        max_val: Value that maps to 1.

    Returns:
        ``(score - min_val) / (max_val - min_val)``. Inputs outside the
        given range are not clipped.
    """
    offset = score - min_val
    span = max_val - min_val
    return offset / span

def to_input_examples(dataset_split):
    """Convert a dataset split into a list of ``InputExample`` objects.

    Args:
        dataset_split: Iterable of mappings providing the keys
            ``'sentence1'``, ``'sentence2'`` and ``'similarity_score'``
            (the score is expected in [0, 5]).

    Returns:
        One ``InputExample`` per item, in input order, holding the sentence
        pair as ``texts`` and the score rescaled to [0, 1] as ``label``.
    """
    def _as_example(row):
        # Rescale the gold score first, then pair it with the two sentences.
        target = scale_score(float(row['similarity_score']))
        return InputExample(texts=[row['sentence1'], row['sentence2']], label=target)

    return [_as_example(row) for row in dataset_split]

# Wrap both splits as InputExample lists (train first, then test).
train_examples, test_examples = map(to_input_examples, (train_data, test_data))

# 2. Load the pre-trained checkpoint that will be evaluated and fine-tuned
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

def evaluate_model(model, examples):
    """Return the Spearman correlation between model and gold similarities.

    The two sentences of every example are embedded with ``model``; the
    cosine similarity of each aligned pair is then rank-correlated against
    the example labels.

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``
            (a ``SentenceTransformer`` in this script).
        examples: Re-iterable collection of objects with a ``texts``
            attribute (two sentences) and a ``label`` attribute (gold score).

    Returns:
        The Spearman correlation coefficient, or ``nan`` when fewer than two
        examples are given (rank correlation is undefined in that case, so
        the encoder is not run at all).
    """
    gold_scores = [ex.label for ex in examples]
    if len(gold_scores) < 2:
        # Nothing meaningful to correlate; skip the (expensive) encoding.
        return float("nan")

    s1 = [ex.texts[0] for ex in examples]
    s2 = [ex.texts[1] for ex in examples]

    emb1 = model.encode(s1, convert_to_tensor=True)
    emb2 = model.encode(s2, convert_to_tensor=True)

    # Only the aligned pairs (i, i) are needed, so compute them directly
    # instead of building the full N x N similarity matrix and reading its
    # diagonal.
    cos_scores = util.pairwise_cos_sim(emb1, emb2).cpu().numpy()

    # Compute Spearman correlation
    return spearmanr(gold_scores, cos_scores).correlation

# Evaluate before fine-tuning: score the pre-trained model on the test
# examples so there is a baseline to compare the fine-tuned model against.
before_corr = evaluate_model(model, test_examples)
print("Before Fine-Tuning - Spearman Correlation on STS-B Test:", before_corr)

# 3. Fine-tune the model
epochs = 1

# Shuffled mini-batches of 32 training examples.
train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)

# Objective: align the model's cosine similarity with the labels in [0, 1].
train_loss = losses.CosineSimilarityLoss(model=model)

# 10% of the total number of training batches, rounded up, is passed to
# fit() as warmup_steps.
warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)

# 4. Evaluate after fine-tuning: score the now fine-tuned model on the test
#    examples and report the resulting Spearman correlation.
after_corr = evaluate_model(model, test_examples)
print("After Fine-Tuning - Spearman Correlation on STS-B Test:", after_corr)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Before Fine-Tuning - Spearman Correlation on STS-B Test: 0.8203246731235654


Step,Training Loss


After Fine-Tuning - Spearman Correlation on STS-B Test: 0.8489561516175831
