# Creating text embedding models

## Contrastive learning
Embedding model classifies if 2 documents are similar or not

### Generate contrastive examples

In [1]:
# Set using only GPU 0 during training
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from datasets import load_dataset
# Load MNLI dataset from GLUE
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")
# 0=entailment; 1=neutral; 2=contradiction
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Train model

In [3]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('bert-base-uncased')

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [4]:
# loss function
from sentence_transformers import losses
train_loss = losses.SoftmaxLoss(
    model = embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

In [5]:
# to evaluate: we use STSB (Semantic Textual Similarity Benchmark)
# including many pairs of sentences with similarity scores in [1..5] range
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Dataset of <sentence1>,<sentence2>,<label>,<idx> columns
val_sts = load_dataset("glue", "stsb", split="validation") 

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [6]:
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [7]:
# define training arguments
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True, # use 16bit precision
    eval_steps=100,
    logging_steps=100
)

In [8]:

# Start training
from sentence_transformers.trainer import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0811
200,0.9481
300,0.8859
400,0.8439
500,0.8297
600,0.8381
700,0.8171
800,0.803
900,0.7885
1000,0.7717


TrainOutput(global_step=1563, training_loss=0.8174091566089476, metrics={'train_runtime': 302.2167, 'train_samples_per_second': 165.444, 'train_steps_per_second': 5.172, 'total_flos': 0.0, 'train_loss': 0.8174091566089476, 'epoch': 1.0})

In [9]:
# evaluate embedding-model
# <pearson-cosine> means 3 steps
# 1/ calculate embedding of sentence1 and embedding of sentence 2
# 2/ calculate COSINE similarity between them => list of model-scores
# 3/ calculate PEARSON between model-scores and human scores (normalized)
evaluator(embedding_model)

{'pearson_cosine': 0.5011220437572054, 'spearman_cosine': 0.580632325967469}