# Creating text embedding models

## Contrastive learning
Embedding model classifies if 2 documents are similar or not

### Generate contrastive examples

In [10]:
# Set using only GPU 0 during training
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [18]:
from datasets import load_dataset
# Load MNLI dataset from GLUE
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")
# 0=entailment; 1=neutral; 2=contradiction
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Train model

In [19]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('google-bert/bert-base-uncased')

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.


In [20]:
# loss function
from sentence_transformers import losses
train_loss = losses.SoftmaxLoss(
    model = embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

In [21]:
# to evaluate: we use STSB (Semantic Textual Similarity Benchmark)
# including many pairs of sentences with similarity scores in [1..5] range
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Dataset of <sentence1>,<sentence2>,<label>,<idx> columns
val_sts = load_dataset("glue", "stsb", split="validation") 

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [22]:
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [23]:
# define training arguments
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True, # use 16bit precision
    eval_steps=100,
    logging_steps=100
)

In [24]:

# Start training
from sentence_transformers.trainer import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0807
200,0.9494
300,0.8959
400,0.8511
500,0.8294
600,0.8286
700,0.8186
800,0.7982
900,0.7808
1000,0.7712


TrainOutput(global_step=1563, training_loss=0.815764824396818, metrics={'train_runtime': 301.414, 'train_samples_per_second': 165.885, 'train_steps_per_second': 5.186, 'total_flos': 0.0, 'train_loss': 0.815764824396818, 'epoch': 1.0})

In [25]:
# evaluate embedding-model
# <pearson-cosine> means 3 steps
# 1/ calculate embedding of sentence1 and embedding of sentence 2
# 2/ calculate COSINE similarity between them => list of model-scores
# 3/ calculate PEARSON between model-scores and human scores (normalized)
evaluator(embedding_model)

{'pearson_cosine': 0.5588098432219584, 'spearman_cosine': 0.6210753592157163}

### Evaluate with MTEB (Massive Text Embedding Benchmark)

In [30]:
# Choose another evaluation task to check our previous model
from mteb import evaluate
from mteb.tasks import Banking77Classification

results = evaluate(
    model=embedding_model,
    tasks=[Banking77Classification()]
)

Model library not recognized, defaulting to Sentence Transformers loader.
Evaluating task Banking77Classification: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 794.07it/s]


In [37]:
results.task_results[0]


TaskResult(task_name=Banking77Classification, scores=...)

## Loss functions

### Cosine similarity

In [3]:
from datasets import Dataset, load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Load MNLI dataset from GLUE
# 0=similar; 1=neutral; 2=contradiction
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")

# Change similar=>1; contradiction / neutral=>0
mapping = {2:0, 1:0, 0:1}
train_dataset = Dataset.from_dict({
    "sentence1": train_dataset["premise"],
    "sentence2": train_dataset["hypothesis"],
    "label": [float(mapping[label]) for label in train_dataset["label"]]
})

# Create evaluator on val dataset
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]] # similarity:[0..1]
)

In [41]:
# select a different loss function
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("google-bert/bert-base-uncased")

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.
                                                                                                                                                               

Step,Training Loss
100,0.2301
200,0.1704
300,0.1708
400,0.1609
500,0.1533
600,0.1589
700,0.1494
800,0.1575
900,0.1482
1000,0.1471


TrainOutput(global_step=1563, training_loss=0.15723129792314117, metrics={'train_runtime': 307.0011, 'train_samples_per_second': 162.866, 'train_steps_per_second': 5.091, 'total_flos': 0.0, 'train_loss': 0.15723129792314117, 'epoch': 1.0})

In [42]:
# Evaluate - label [0..1] but float (train is binary 0 or 1)
evaluator(embedding_model)

{'pearson_cosine': 0.7250536218796055, 'spearman_cosine': 0.7270930231269686}

### MNR (Multiple Negatives Ranking) loss
Definition: loss which uses a question as "premise" (anchor) \
-> pairing a related answer as positive \
-> pairing an unrelated answer as negative

In [51]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

mnli = load_dataset("glue", "mnli", split="train").select(range(50000))
mnli = mnli.remove_columns("idx")

# original label: 0=similar; 1=neutral; 2=contradiction 
# => we keep only similar pairs
mnli = mnli.filter(lambda x:True if x["label"]==0 else False)

In [52]:
# prepare train data
train_dataset = {"anchor":[], "positive":[], "negative":[]}
soft_negatives = list(mnli["hypothesis"]) # all 2nd sentences
random.shuffle(soft_negatives)

# similar pairs are "positive"; otherwise another sentence would be "negative"
for row, soft_negatives in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negatives)
train_dataset = Dataset.from_dict(train_dataset)

16875it [00:00, 38341.88it/s]


In [4]:
# Define evaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [10]:
# Train with MNR loss 
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("google-bert/bert-base-uncased")
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.4546
200,0.1692
300,0.1614
400,0.1522
500,0.1436
600,0.1308
700,0.146
800,0.1157
900,0.1188
1000,0.1313


TrainOutput(global_step=1563, training_loss=0.15112468605993348, metrics={'train_runtime': 302.8374, 'train_samples_per_second': 165.105, 'train_steps_per_second': 5.161, 'total_flos': 0.0, 'train_loss': 0.15112468605993348, 'epoch': 1.0})

In [11]:
# Evaluate model 
evaluator(embedding_model)

{'pearson_cosine': 0.7589502181894794, 'spearman_cosine': 0.7696032161647742}

## Fine tuning an Embedding Model

### Supervised 

In [12]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# 0=entailment; 1=neutral; 2=contradiction
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50000))
train_dataset = train_dataset.remove_columns("idx")

In [14]:
# evaluator on validation dataset
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [15]:
# Training on PRETRAINED model
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)
trainer = SentenceTransformerTrainer(
    model=embedding_model, # pre-trained
    args=args,
    train_dataset=train_dataset, # 3 cols <anchor>,<positive>,<negative>
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1552
200,0.1136
300,0.1196
400,0.1127
500,0.1111
600,0.0995
700,0.1178
800,0.1023
900,0.1011
1000,0.1007


TrainOutput(global_step=1563, training_loss=0.10931369942575407, metrics={'train_runtime': 143.8875, 'train_samples_per_second': 347.494, 'train_steps_per_second': 10.863, 'total_flos': 0.0, 'train_loss': 0.10931369942575407, 'epoch': 1.0})

In [16]:
# evaluate
evaluator(embedding_model)

{'pearson_cosine': 0.8501746962964003, 'spearman_cosine': 0.8492437746842924}

### Augmented SBERT
1/ Fine-tune cross-encoder (BERT) with small labeled dataset (gold dataset) \
2/ Use BERT to infer more unlabeled data => bigger dataset (silver dataset) \
3/ Train bi-encoder SBERT on gold- and silver-dataset

In [19]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

# Prepare gold dataset
dataset = load_dataset("glue", "mnli", split="train").select(range(10000)) # 2=contradict; 1=neutral; 0=similar
mapping = {2:0, 1:0, 0:1} # map to => similar:1; non-similar:0
gold_examples = [
    InputExample(texts=[row["premise"], row["hypothesis"]], label=mapping[row["label"]]) # sentence1, sentence2, label (0 or 1)
    for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)
gold = pd.DataFrame(
    {
        "sentence1": dataset["premise"],
        "sentence2": dataset["hypothesis"],
        "label": [mapping[label] for label in dataset["label"]]
    }
)

100%|██████████████████████████████████| 10000/10000 [00:00<00:00, 14791.98it/s]


In [20]:
# Train cross-encoder SBERT on gold dataset => get fine-tuned cross-encoder
from sentence_transformers.cross_encoder import CrossEncoder
cross_encoder = CrossEncoder("google-bert/bert-base-uncased", num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md: 0.00B [00:00, ?B/s]

Step,Training Loss


In [22]:
# prepare silver dataset
silver = load_dataset(
    "glue", "mnli", split="train"
).select(range(10000, 50000))
pairs = list(zip(silver["premise"], silver["hypothesis"])) # list of pairs (UNLABELED); each has 2 sentences

In [23]:
# use fine-tuned-cross-encoder to label those silver pairs
import numpy as np
output = cross_encoder.predict(
    pairs, apply_softmax=True, show_progress_bar=True
)
silver = pd.DataFrame(
    { "sentence1": silver["premise"], "sentence2": silver["hypothesis"], "label": np.argmax(output, axis=1) }
)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

In [26]:
# combine <gold> and <silver> to train bi-encoder SBERT
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=["sentence1", "sentence2"], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)
train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 49998
})

In [28]:
# create evaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [30]:
# train with dataset of 20% labeled and 80% enhanced
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("google-bert/bert-base-uncased")
train_loss = losses.CosineSimilarityLoss(model=embedding_model)
args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2143
200,0.155
300,0.1428
400,0.143
500,0.1379
600,0.1341
700,0.1329
800,0.1296
900,0.1318
1000,0.1283


TrainOutput(global_step=1563, training_loss=0.1387178585152556, metrics={'train_runtime': 300.0072, 'train_samples_per_second': 166.656, 'train_steps_per_second': 5.21, 'total_flos': 0.0, 'train_loss': 0.1387178585152556, 'epoch': 1.0})

In [31]:
evaluator(embedding_model)

{'pearson_cosine': 0.6651888924374882, 'spearman_cosine': 0.6859290018366759}

### Unsupervised: TSDAE (Transformer-based Sequenial Denoising Auto-Encoder)
Steps: \
1/ remove random words from a sentence => "damaged-sentence" \
2/ encoder => convert "damaged-sentence" to "sentence-embedding" \
3/ decoder => convert "sentence-embedding" to "original sentence" \
After training ENCODER is ready to use

In [1]:
# Download tokenizer
import nltk 
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/phm1605/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Prepare data & "damaged-data"
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

mnli = load_dataset("glue", "mnli", split="train").select(range(25000))
flat_sentences = list(mnli["premise"]) + list(mnli["hypothesis"])
damaged_data = DenoisingAutoEncoderDataset(flat_sentences)

In [6]:
# Create dataset 
train_dataset = {"damaged_sentence": [], "original_sentence": []}
for data in tqdm(damaged_data):
    train_dataset["damaged_sentence"].append(data.texts[0])
    train_dataset["original_sentence"].append(data.texts[1])
train_dataset = Dataset.from_dict(train_dataset)
train_dataset

100%|███████████████████████████████████| 50000/50000 [00:10<00:00, 4740.72it/s]


Dataset({
    features: ['damaged_sentence', 'original_sentence'],
    num_rows: 50000
})

In [7]:
# Create embedding similarity evaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [11]:
# Create model with "cls-pooling" layer at output
from sentence_transformers import models, SentenceTransformer

word_embedding_model = models.Transformer("google-bert/bert-base-uncased") # (batch,seq_len,embed_dim)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls") # (batch,embed_dim)
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [12]:
# Create loss function, which automatically:
# 1/ encode damaged sentence => create embed => decode to get "pred_sentence"
# 2/ calculate loss between "pred_sentence" and "original sentence"
from sentence_transformers import losses 
train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model,
    tie_encoder_decoder=True # both "encoder" and "decoder" use same weights
)
train_loss.decoder = train_loss.decoder.to("cuda")

Some weights of BertLMHeadModel were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bi

In [14]:
# training our model
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,6.7217
200,4.8759
300,4.5782
400,4.4341
500,4.3434
600,4.3032
700,4.2203
800,4.1787
900,4.0595
1000,4.0091


TrainOutput(global_step=3125, training_loss=3.99793435546875, metrics={'train_runtime': 630.4669, 'train_samples_per_second': 79.306, 'train_steps_per_second': 4.957, 'total_flos': 0.0, 'train_loss': 3.99793435546875, 'epoch': 1.0})

In [15]:
# evaluate the encoder only
evaluator(embedding_model)

{'pearson_cosine': 0.7367198422182759, 'spearman_cosine': 0.7433935806395026}