# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [1]:
import time
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## 1. Data

### Train, Test, Validation 

In [2]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

  from .autonotebook import tqdm as notebook_tqdm


({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [3]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [4]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [5]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [6]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [7]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

mnli = mnli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

Filter: 100%|██████████| 392702/392702 [00:01<00:00, 346604.61 examples/s]
Filter: 100%|██████████| 9815/9815 [00:00<00:00, 272656.85 examples/s]
Filter: 100%|██████████| 9832/9832 [00:00<00:00, 223453.79 examples/s]
Filter: 100%|██████████| 9796/9796 [00:00<00:00, 284003.83 examples/s]
Filter: 100%|██████████| 9847/9847 [00:00<00:00, 307752.52 examples/s]


In [8]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [9]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(50000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(1000))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(10000)))
})
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

## 2. Preprocessing

In [10]:
import torchtext
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
vocab = torch.load('./model/vocab')

In [11]:
max_seq_length = 512

def preprocess_function(examples):
    
    # Tokenize the premise
    tokenized_premise = [tokenizer(re.sub("[.,!?\\-]", '', sent.lower())) for sent in examples['premise']]
    premise_input_ids = [[vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']] for tokens in tokenized_premise]
    premise_n_pad = [max_seq_length - len(tokens) for tokens in premise_input_ids]
    premise_attn_mask = [([1] * len(tokens)) + ([0] * n_pad) for tokens, n_pad in zip(premise_input_ids, premise_n_pad)]
    premise_input_ids = [tokens + ([0] * n_pad) for tokens, n_pad in zip(premise_input_ids, premise_n_pad)]
    #num_rows, max_seq_length

    # Tokenize the hypothesis
    tokenized_hypothesis = [tokenizer(re.sub("[.,!?\\-]", '', sent.lower())) for sent in examples['hypothesis']]
    hypothesis_input_ids = [[vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']] for tokens in tokenized_hypothesis]
    hypothesis_n_pad = [max_seq_length - len(tokens) for tokens in hypothesis_input_ids]
    hypothesis_attn_mask = [([1] * len(tokens)) + ([0] * n_pad) for tokens, n_pad in zip(hypothesis_input_ids, hypothesis_n_pad)]
    hypothesis_input_ids = [tokens + ([0] * n_pad) for tokens, n_pad in zip(hypothesis_input_ids, hypothesis_n_pad)]
    #num_rows, max_seq_length
    
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_input_ids,
        "premise_attention_mask": premise_attn_mask,
        "hypothesis_input_ids": hypothesis_input_ids,
        "hypothesis_attention_mask": hypothesis_attn_mask,
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

Map: 100%|██████████| 50000/50000 [00:11<00:00, 4263.84 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3787.88 examples/s]
Map: 100%|██████████| 10000/10000 [00:02<00:00, 4119.74 examples/s]


In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 10000
    })
})

## 3. Data loader

In [13]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 16
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [14]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16])


## 4. Model

In [15]:
from bert import *

# load the model and all its hyperparameters
load_path = './model/bert.pt'
params, state = torch.load(load_path)
model = BERT(**params, device=device).to(device)
model.load_state_dict(state)

<All keys matched successfully>

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [16]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

In [17]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [18]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [19]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [20]:
from tqdm.auto import tqdm

def train(model, classifier_head, data, optimizer, optimizer_classifier, scheduler, scheduler_classifier, criterion, device):
    epoch_loss = []
    model.train()
    classifier_head.train()

    for step, batch in enumerate(tqdm(data, leave=True, desc='Training: ')):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)  # each input contains only one sentence hence we define them all as sentence '0'
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u_last_hidden_state = model.get_last_hidden_state(inputs_ids_a, segment_ids)  
        v_last_hidden_state = model.get_last_hidden_state(inputs_ids_b, segment_ids)  

        # u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        # v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        epoch_loss.append(loss.item())
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()

    return np.mean(epoch_loss)

In [21]:
def evaluate(model, classifier_head, data, criterion, device):
    epoch_loss = []
    model.eval()
    classifier_head.eval()

    with torch.no_grad():
        for step, batch in enumerate(tqdm(data, leave=True, desc='Evaluate: ')):
            
            # prepare batches and more all to the active device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            attention_a = batch['premise_attention_mask'].to(device)
            attention_b = batch['hypothesis_attention_mask'].to(device)
            segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)  # each input contains only one sentence hence we define them all as sentence '0'
            label = batch['labels'].to(device)
            
            # extract token embeddings from BERT at last_hidden_state
            u_last_hidden_state = model.get_last_hidden_state(inputs_ids_a, segment_ids)  
            v_last_hidden_state = model.get_last_hidden_state(inputs_ids_b, segment_ids)  

            # u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
            # v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

            # get the mean pooled vectors
            u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
            
            # build the |u-v| tensor
            uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
            uv_abs = torch.abs(uv) # batch_size,hidden_dim
            
            # concatenate u, v, |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
            
            # process concatenated tensor through classifier_head
            x = classifier_head(x) #batch_size, classifer
            
            # calculate the 'softmax-loss' between predicted and true label
            loss = criterion(x, label)
            epoch_loss.append(loss.item())

    return np.mean(epoch_loss)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
head_path = './model/s-bert-classifier-head.pt'
model_path = './model/s-bert.pt'

In [24]:
num_epoch = 5

best_val_loss = float('inf')
train_losses = []
val_losses = []

# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    start_time = time.time()
    train_loss = train(model, classifier_head, train_dataloader, optimizer, optimizer_classifier, scheduler, scheduler_classifier, criterion, device)
    val_loss = evaluate(model, classifier_head, eval_dataloader, criterion, device)

    #for plotting
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # save the model only when its validation loss is lower than all its predecessors
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(classifier_head, head_path)  # save the classifier head
        torch.save([model.params, model.state_dict()], model_path)  # save the model's parameters and state to a file
        
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')

Training: 100%|██████████| 3125/3125 [17:12<00:00,  3.03it/s]
Evaluate: 100%|██████████| 625/625 [01:19<00:00,  7.89it/s]


Epoch: 01 | Time: 18m 31s
	Train Loss: 1.127
	 Val. Loss: 1.125


Training: 100%|██████████| 3125/3125 [16:58<00:00,  3.07it/s]
Evaluate: 100%|██████████| 625/625 [01:18<00:00,  7.93it/s]


Epoch: 02 | Time: 18m 17s
	Train Loss: 1.127
	 Val. Loss: 1.124


Training: 100%|██████████| 3125/3125 [16:55<00:00,  3.08it/s]
Evaluate: 100%|██████████| 625/625 [01:18<00:00,  7.91it/s]


Epoch: 03 | Time: 18m 14s
	Train Loss: 1.127
	 Val. Loss: 1.125


Training: 100%|██████████| 3125/3125 [16:59<00:00,  3.07it/s]
Evaluate: 100%|██████████| 625/625 [01:19<00:00,  7.88it/s]


Epoch: 04 | Time: 18m 18s
	Train Loss: 1.127
	 Val. Loss: 1.124


Training: 100%|██████████| 3125/3125 [17:04<00:00,  3.05it/s]
Evaluate: 100%|██████████| 625/625 [01:21<00:00,  7.70it/s]


Epoch: 05 | Time: 18m 25s
	Train Loss: 1.126
	 Val. Loss: 1.123


In [25]:
model.eval()
classifier_head.eval()
total_similarity = 0
with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model.get_last_hidden_state(inputs_ids_a, segment_ids)  # all token embeddings A = batch_size, seq_len, hidden_dim
        v = model.get_last_hidden_state(inputs_ids_b, segment_ids)  # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 0.9720


## 7. Inference

In [24]:
# load the model and all its hyperparameters
params, state = torch.load(model_path)
model = BERT(**params, device=device).to(device)
model.load_state_dict(state)

<All keys matched successfully>

In [25]:
def get_inputs(sentence, tokenizer, vocab, max_seq_length):
    tokens = tokenizer(re.sub("[.,!?\\-]", '', sentence.lower()))
    input_ids = [vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']]
    n_pad = max_seq_length - len(input_ids)
    attention_mask = ([1] * len(input_ids)) + ([0] * n_pad)
    input_ids = input_ids + ([0] * n_pad)

    return {'input_ids': torch.LongTensor(input_ids).reshape(1, -1),
            'attention_mask': torch.LongTensor(attention_mask).reshape(1, -1)}

In [26]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = get_inputs(sentence_a, tokenizer, vocab, max_seq_length)
    inputs_b = get_inputs(sentence_b, tokenizer, vocab, max_seq_length)
    

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids'].to(device)
    attention_a = inputs_a['attention_mask'].to(device)
    inputs_ids_b = inputs_b['input_ids'].to(device)
    attention_b = inputs_b['attention_mask'].to(device)
    segment_ids = torch.zeros(1, max_seq_length, dtype=torch.int32).to(device)

    # Extract token embeddings from BERT
    u = model.get_last_hidden_state(inputs_ids_a, segment_ids)  # all token embeddings A = batch_size, seq_len, hidden_dim
    v = model.get_last_hidden_state(inputs_ids_b, segment_ids)  # all token embeddings B = batch_size, seq_len, hidden_dim

    # u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
    # v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

In [27]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9844


## 8. Comparison

In [28]:
from sentence_transformers import SentenceTransformer

# load example pre-trained sentence BERT from huggingface
hf_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

  return self.fget.__get__(instance, owner)()


### Try the Models on Different Sentences Pairs

In [29]:
# function to calculate similarity for the downloaded huggingface S-BERT
def calculate_similarity_hf(model, sentence_a, sentence_b):
    embeddings = model.encode([sentence_a, sentence_b])
    return cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0, 0]

Sentences with similar meaning

In [30]:
sentence_a = 'Machine learning is so hard. I am struggling so much'
sentence_b = "Machine learning is a difficult field, I don't think I will pass."

similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)
hf_similarity = calculate_similarity_hf(hf_model, sentence_a, sentence_b)

print(f"Cosine Similarity (Our S-BERT): {similarity:.4f}")
print(f"Cosine Similarity (all-MiniLM-L6-v2): {hf_similarity:.4f}")

Cosine Similarity (Our S-BERT): 0.9821
Cosine Similarity (all-MiniLM-L6-v2): 0.7759


Sentences with opposite meaning

In [31]:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help at all with our students' education."

similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)
hf_similarity = calculate_similarity_hf(hf_model, sentence_a, sentence_b)

print(f"Cosine Similarity (Our S-BERT): {similarity:.4f}")
print(f"Cosine Similarity (all-MiniLM-L6-v2): {hf_similarity:.4f}")

Cosine Similarity (Our S-BERT): 0.9820
Cosine Similarity (all-MiniLM-L6-v2): 0.5476


Sentences that are completely irrelevant

In [32]:
sentence_a = "Today is a sunny day. Let's go outside."
sentence_b = "The Ukraine invasion of Russia is a controversial subject."

similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)
hf_similarity = calculate_similarity_hf(hf_model, sentence_a, sentence_b)

print(f"Cosine Similarity (Our S-BERT): {similarity:.4f}")
print(f"Cosine Similarity (all-MiniLM-L6-v2): {hf_similarity:.4f}")

Cosine Similarity (Our S-BERT): 0.9688
Cosine Similarity (all-MiniLM-L6-v2): 0.1033


### Spearman Correlations with True Label

In [39]:
# function to transform labels in our test set to make them correspond with cosine similarity
def transform_label(example):
    label_map = {0: 1,  # entailment sentences (label == 0) should have a cosine similarity of 1
                 1: 0,  # neutral sentences (label == 1) should have a cosine similarity of 0
                 2: -1  # contradiction sentences (label == 2) should have a cosine similarity of -1
                }
    
    example['label'] = label_map[example['label']]

    return example

In [66]:
from datasets import Dataset
import pandas as pd

test_set = raw_dataset['test']
test_set = Dataset.from_pandas(pd.DataFrame(map(transform_label, test_set)))

np.unique(test_set['label'])

array([-1,  0,  1])

In [51]:
result = []

for sample in tqdm(test_set):
    sentence_a = sample['premise']
    sentence_b = sample['hypothesis']
    label = sample['label']

    similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)
    hf_similarity = calculate_similarity_hf(hf_model, sentence_a, sentence_b)

    result.append(
        {'premise': sentence_a,
         'hypothesis': sentence_b,
         'similarity_our_sbert': similarity,
         'similarity_all-MiniLM-L6-v2': hf_similarity,
         'label': label}
    )

100%|██████████| 1000/1000 [00:22<00:00, 43.85it/s]


In [65]:
result = pd.DataFrame(result)
label_corr = result.corr(method='spearman', numeric_only=True)['label']

print("=== Spearman correlations with True Label ===")
print(f"Cosine Similarity (Our S-BERT): {label_corr['similarity_our_sbert']:.4f}")
print(f"Cosine Similarity (all-MiniLM-L6-v2): {label_corr['similarity_all-MiniLM-L6-v2']:.4f}")

=== Spearman correlations with True Label ===
Cosine Similarity (Our S-BERT): 0.0551
Cosine Similarity (all-MiniLM-L6-v2): 0.5989


#### Model Analysis

| **Model**               | **Training Loss** | **Validation Loss** | **Spearman Correlation with True Label** |
|-------------------------|:-----------------:|:-------------------:|:----------------------------------------:|
| **BERT from scratch**   |       4.261       |        5.100        |                     -                    |
| **S-BERT from scratch** |       1.126       |        1.123        |                  0.0551                  |
| **all-MiniLM-L6-v2**    |         -         |          -          |                  0.5989                  |

Our implemented S-BERT has a major problem in which all sentence pair are predicted to be very close to each other (cosine similarity close to 1) regardless of their actual semantic association or relevancy. This is in contrast with the huggingface model (all-MiniLM-L6-v2), which demonstrate much more accurate predictions. As such, we have identified the following areas that can be improved to increase our model's accuracy.

1. Embedding Size and Quality: \
Since our model only utilizes an embedding size of 768, we can increase this value further in order to allow the model to capture semantic and syntactic relationship in deeper detail. Additionally, we can also add context sentences to the input text to increase model linguistic learning capabilities.

2. Fine-tuning Hyperparameters: \
Adjusting hyperparameters of the sentence BERT model, such as learning rates, regularization techniques, or optimization algorithms, can help in reducing the issue of consistently predicting high cosine similarity values for unrelated sentences.

3. Dataset Quantity and Quality: \
Insufficient data can result in a biased model that fails to generalize well to various semantic relationships. Therefore, increasing the quantity and diversifying the training dataset can potentially improve our model's ability to discern subtle differences in meaning.

#### Key Challenges and Limitations

1. Computational Resources: \
Training a sentence BERT model, especially one with large number of parameters, requires substantial computational resources, including powerful GPUs or TPUs which are highly expensive and sometimes, are not sold commercially. Thus, instead of training from scratch, we can use cloud-based services to handle computational requirements or finetuning a pre-trained model.

2. Dataset Constraint: \
As previously mentioned, the performance of the model depends highly on dataset quantity and quality. However, due to computational and time constraints, it is not possible to train our S-BERT using the entirety of the combined SNLI and MNLI datasets which composed of nearly one-million samples since it will take days to finish training using a local machine which is not feasible.