# Creating text embedding models

## Contrastive learning
Embedding model classifies if 2 documents are similar or not

### Generate contrastive examples

In [17]:
# Set using only GPU 0 during training
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [18]:
from datasets import load_dataset
# Load MNLI dataset from GLUE
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")
# 0=entailment; 1=neutral; 2=contradiction
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Train model

In [19]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('google-bert/bert-base-uncased')

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.


In [20]:
# loss function
from sentence_transformers import losses
train_loss = losses.SoftmaxLoss(
    model = embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

In [21]:
# to evaluate: we use STSB (Semantic Textual Similarity Benchmark)
# including many pairs of sentences with similarity scores in [1..5] range
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Dataset of <sentence1>,<sentence2>,<label>,<idx> columns
val_sts = load_dataset("glue", "stsb", split="validation") 

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [22]:
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [23]:
# define training arguments
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True, # use 16bit precision
    eval_steps=100,
    logging_steps=100
)

In [24]:

# Start training
from sentence_transformers.trainer import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0807
200,0.9494
300,0.8959
400,0.8511
500,0.8294
600,0.8286
700,0.8186
800,0.7982
900,0.7808
1000,0.7712


TrainOutput(global_step=1563, training_loss=0.815764824396818, metrics={'train_runtime': 301.414, 'train_samples_per_second': 165.885, 'train_steps_per_second': 5.186, 'total_flos': 0.0, 'train_loss': 0.815764824396818, 'epoch': 1.0})

In [25]:
# evaluate embedding-model
# <pearson-cosine> means 3 steps
# 1/ calculate embedding of sentence1 and embedding of sentence 2
# 2/ calculate COSINE similarity between them => list of model-scores
# 3/ calculate PEARSON between model-scores and human scores (normalized)
evaluator(embedding_model)

{'pearson_cosine': 0.5588098432219584, 'spearman_cosine': 0.6210753592157163}

### Evaluate with MTEB (Massive Text Embedding Benchmark)

In [30]:
# Choose another evaluation task to check our previous model
from mteb import evaluate
from mteb.tasks import Banking77Classification

results = evaluate(
    model=embedding_model,
    tasks=[Banking77Classification()]
)

Model library not recognized, defaulting to Sentence Transformers loader.
Evaluating task Banking77Classification: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 794.07it/s]


In [37]:
results.task_results[0]


TaskResult(task_name=Banking77Classification, scores=...)

## Loss functions

### Cosine similarity

In [39]:
from datasets import Dataset, load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Load MNLI dataset from GLUE
# 0=similar; 1=neutral; 2=contradiction
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")

# Change similar=>1; contradiction / neutral=>0
mapping = {2:0, 1:0, 0:1}
train_dataset = Dataset.from_dict({
    "sentence1": train_dataset["premise"],
    "sentence2": train_dataset["hypothesis"],
    "label": [float(mapping[label]) for label in train_dataset["label"]]
})

# Create evaluator on val dataset
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]] # similarity:[0..1]
)

In [41]:
# select a different loss function
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("google-bert/bert-base-uncased")

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.
                                                                                                                                                               

Step,Training Loss
100,0.2301
200,0.1704
300,0.1708
400,0.1609
500,0.1533
600,0.1589
700,0.1494
800,0.1575
900,0.1482
1000,0.1471


TrainOutput(global_step=1563, training_loss=0.15723129792314117, metrics={'train_runtime': 307.0011, 'train_samples_per_second': 162.866, 'train_steps_per_second': 5.091, 'total_flos': 0.0, 'train_loss': 0.15723129792314117, 'epoch': 1.0})

In [42]:
# Evaluate - label [0..1] but float (train is binary 0 or 1)
evaluator(embedding_model)

{'pearson_cosine': 0.7250536218796055, 'spearman_cosine': 0.7270930231269686}

### MNR (Multiple Negatives Ranking) loss
Definition: loss which uses a question as "premise" (anchor) \
-> pairing a related answer as positive \
-> pairing an unrelated answer as negative

In [51]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

mnli = load_dataset("glue", "mnli", split="train").select(range(50000))
mnli = mnli.remove_columns("idx")

# original label: 0=similar; 1=neutral; 2=contradiction 
# => we keep only similar pairs
mnli = mnli.filter(lambda x:True if x["label"]==0 else False)

In [52]:
# prepare train data
train_dataset = {"anchor":[], "positive":[], "negative":[]}
soft_negatives = list(mnli["hypothesis"]) # all 2nd sentences
random.shuffle(soft_negatives)

# similar pairs are "positive"; otherwise another sentence would be "negative"
for row, soft_negatives in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negatives)
train_dataset = Dataset.from_dict(train_dataset)

16875it [00:00, 38341.88it/s]


In [46]:
print(mnli)

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 16875
})
