# Embedding Models

In [1]:
from datasets import load_dataset

# Load the MNLI training split from GLUE, keep the first 50,000 examples,
# and drop the "idx" column so only premise, hypothesis and label remain.
# GLUE MNLI label ids: 0 - entailment, 1 - neutral, 2 - contradiction
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50000))
    .remove_columns("idx")
)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

In [2]:
# Show one training record (index 25) to check the premise/hypothesis/label fields.
train_dataset[25]

{'premise': "and it's it's quite a bit i think six something is the state and and uh the rest of the pie goes elsewhere but we're in a particular part of the state that's pretty well off so it's it's like we get a lot of that back as far as local taxation goes",
 'hypothesis': 'I do not know exactly where the local taxes go.',
 'label': 1}

### train model

Now that we have our dataset with training examples, we need to create our
embedding model. We typically choose an existing sentence-transformers model
and fine-tune it, but in this example we are going to train an embedding
model from scratch.
This means that we have to define two things. First, a pretrained
Transformer model that produces the embeddings for individual words. We will
use the BERT base model (uncased), as it is a great introductory model.
However, many others exist that have also been evaluated using
sentence-transformers. Most notably, `microsoft/mpnet-base` often gives good
results when used as a word embedding model. Second, a loss function that
defines the training objective.

In [3]:
from sentence_transformers import SentenceTransformer
# Start from the plain BERT base (uncased) checkpoint as the underlying model.
BASE_CHECKPOINT = "bert-base-uncased"
embedding_model = SentenceTransformer(BASE_CHECKPOINT)




In [4]:

from sentence_transformers import losses

# Softmax classification loss over the three NLI classes, sized to the
# sentence-embedding dimension reported by `embedding_model`.
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3,  # GLUE MNLI: 0 - entailment, 1 - neutral, 2 - contradiction
)

In [5]:
#for evaluation we will use Semantic Textual Similarity Benchmark (STSB)

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Build the STSB similarity evaluator from the GLUE validation split.
val_sts = load_dataset("glue", "stsb", split="validation")

# Divide every gold label by 5 before handing it to the evaluator
# (STSB labels presumably span 0-5, so this rescales them to 0-1 — TODO confirm).
sts_gold_scores = [label / 5 for label in val_sts["label"]]

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=sts_gold_scores,
    main_similarity="cosine",
)

In [6]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Training configuration for the sentence-transformers trainer.
# NOTE(review): the next cell builds `args` again with report_to="none";
# one of the two definitions can likely be dropped.
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    # The deprecated `per_gpu_eval_batch_size=32` duplicate was removed:
    # `per_device_eval_batch_size` already sets the same evaluation batch size.
    # NOTE(review): no training batch size is set, so the library default applies;
    # add per_device_train_batch_size=32 if 32 was meant for training as well.
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    logging_steps=100,
    # NOTE(review): eval_steps is only consulted with a step-based evaluation
    # strategy, which is not set here — confirm periodic evaluation is wanted.
    eval_steps=100,
)


In [7]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Training configuration for the sentence-transformers trainer, with
# experiment-tracker reporting switched off.
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    # The deprecated `per_gpu_eval_batch_size=32` duplicate was removed:
    # `per_device_eval_batch_size` already sets the same evaluation batch size.
    # NOTE(review): no training batch size is set, so the library default applies;
    # add per_device_train_batch_size=32 if 32 was meant for training as well.
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    logging_steps=100,
    # NOTE(review): eval_steps is only consulted with a step-based evaluation
    # strategy, which is not set here — confirm periodic evaluation is wanted.
    eval_steps=100,
    report_to="none",  # Disable WandB integration
)

In [8]:
# Score the model with the STSB evaluator; the result reports Pearson and
# Spearman correlations for cosine similarity.
# NOTE(review): no training call is visible before this cell, so this looks
# like the pre-fine-tuning baseline — confirm.
evaluator(embedding_model)

{'pearson_cosine': np.float64(0.5917194487413572),
 'spearman_cosine': np.float64(0.5931742011707938)}

In [12]:
#we will use another benchmark here mteb(massive text embedding benchmark)

from mteb import MTEB
# Choose the evaluation task.
# Task names are resolved through mteb.get_tasks because passing raw strings
# to MTEB(tasks=...) is deprecated.
import mteb

mteb_tasks = mteb.get_tasks(tasks=["Banking77Classification"])
evaluation = MTEB(tasks=mteb_tasks)

# Run the benchmark; the returned list of task results is the cell output.
evaluation.run(embedding_model)



[TaskResult(task_name=Banking77Classification, scores=...)]