# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [70]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pickle

# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device

In [71]:
# Set device MPS or Cuda
# device = torch.device("mps" if torch.backends.mps.is_available() else "cuda")
# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [72]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

# mount with os
import os
os.chdir('/content/drive/My Drive/_NLP/A4/NLP-A4-Do-You-Agree')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Data

### Train, Test, Validation

In [73]:
!pip install datasets



In [74]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [75]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [76]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [77]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [78]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [79]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [80]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [81]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## 2. Preprocessing

In [82]:
# import custom modules
from app.helpers.classes import BERT, SimpleTokenizer

In [83]:
data = pickle.load(open('./app/models/bert-pretrained-data.pkl', 'rb'))
word2id = data['word2id']
max_len = data['max_len']
max_mask = data['max_mask']

In [84]:
#
tokenizer = SimpleTokenizer(word2id)

In [85]:
def preprocess_function(examples):
    # Tokenize the premise
    premise_result = tokenizer.encode(
        examples['premise'])
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer.encode(
        examples['hypothesis'])
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

In [86]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

## 3. Data loader

In [87]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=batch_size
)

In [88]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 1000])
torch.Size([32, 1000])
torch.Size([32, 1000])
torch.Size([32, 1000])
torch.Size([32])


## 4. Model

In [89]:
#from app.helpers.classes import BERT, Embedding  # Now the import should work

In [90]:
# save_path = f'./app/models/bert-pretrained-model.pt'
# model = BERT()
# model.load_state_dict(torch.load(save_path))
# model.to(device)

save_path = f'./app/models/bert-pretrained-model.pt'
model = BERT()

# Load the state dictionary with strict=False
state_dict = torch.load(save_path)
model.load_state_dict(state_dict, strict=False)

model.to(device)

# checkpoint = torch.load(save_path, map_location="cpu")
# print("Keys in checkpoint:", checkpoint.keys())

  state_dict = torch.load(save_path)


BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(23069, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x MultiHeadAttention(
      (W_Q): Linear(in_features=768, out_features=512, bias=True)
      (W_K): Linear(in_features=768, out_features=512, bias=True)
      (W_V): Linear(in_features=768, out_features=512, bias=True)
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (decoder): Linear(in_features=768, out_features=23069, bias=False)
)

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [91]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function.
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

In [92]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim

    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [93]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [94]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()

In [95]:
tokenizer = SimpleTokenizer({'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, '[UNK]': 4})
output = tokenizer.encode(["I love you", "I hate you"])
print(output)

{'input_ids': [tensor([4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,

In [96]:
# import torch.nn as nn

# vocab_size = len(tokenizer.word2id)  # Ensure correct size
# embedding_dim = 128
# embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=tokenizer.word2id['[PAD]'])

In [97]:
# vocab_size = len(tokenizer.word2id)  # Ensure correct vocab size

# print("Inputs (Before Model Call):")
# print(f"Input IDs: {inputs_ids_a}")
# print(f"Max ID in Inputs: {inputs_ids_a.max()}, Min ID: {inputs_ids_a.min()}")
# print(f"Vocab Size: {vocab_size}")

# # Check if any value is out of bounds
# if inputs_ids_a.max() >= vocab_size or inputs_ids_a.min() < 0:
#     raise ValueError("Error: Token index out of bounds!")

# # Run the model
# u, _, _ = model(inputs_ids_a, segment_ids, masked_pos)


## 6. Training

In [98]:
from tqdm.auto import tqdm

num_epoch = 1

# Reduced batch size
batch_size = 6
max_len = 1000

# Recreate dataloaders with the new batch size
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=batch_size
)

# need segment and masked for model input but not used in SBERT
# Create segment_ids tensor with shape (batch_size, max_len)
#segment_ids = torch.tensor([0] * max_len).unsqueeze(0).repeat(batch_size, 1).to(device)

# Create masked_pos tensor with shape (batch_size, max_mask)
#masked_pos = torch.tensor([0] * max_mask).unsqueeze(0).repeat(batch_size, 1).to(device)

accuracy = 0
count = 0
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()

        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)

        # Truncate input sequences to the new max length
        inputs_ids_a = inputs_ids_a[:, :max_len]
        inputs_ids_b = inputs_ids_b[:, :max_len]
        attention_a = attention_a[:, :max_len]
        attention_b = attention_b[:, :max_len]

        current_batch_size = inputs_ids_a.shape[0]
        # Create segment_ids tensor with shape (batch_size, max_len) for the current batch
        segment_ids = torch.tensor([0] * max_len).unsqueeze(0).repeat(current_batch_size, 1).to(device)

        # Create masked_pos tensor with shape (batch_size, max_mask) for the current batch
        masked_pos = torch.tensor([0] * max_mask).unsqueeze(0).repeat(current_batch_size, 1).to(device)

        # extract token embeddings from BERT at last_hidden_state
        u, _, _ = model(inputs_ids_a, segment_ids[:, :max_len], masked_pos)
        v, _, _ = model(inputs_ids_b, segment_ids[:, :max_len], masked_pos)

        u_last_hidden_state = u # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim

        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim

        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim

        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        for out, lab in zip(x, label):
            count = count + 1
            if torch.argmax(out).item() == lab.item():
                accuracy = accuracy + 1
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)

        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()

    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f} | Accuracy = {(accuracy / count) * 100}%')

  0%|          | 0/167 [00:00<?, ?it/s]

Epoch: 1 | loss = 1.102627 | Accuracy = 31.7%


In [99]:
model.eval()
classifier_head.eval()

total_similarity = 0
num_batches = len(eval_dataloader)

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # Move all batch tensors to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)[:, :max_len]  # Truncate to max_len
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)[:, :max_len]
        attention_a = batch['premise_attention_mask'].to(device)[:, :max_len]
        attention_b = batch['hypothesis_attention_mask'].to(device)[:, :max_len]
        labels = batch['labels'].to(device)

        # Get the current batch size
        batch_size = inputs_ids_a.shape[0]

        # Create segment_ids and masked_pos tensors with correct batch shape
        segment_ids = torch.zeros((batch_size, max_len), dtype=torch.long, device=device)
        masked_pos = torch.zeros((batch_size, max_mask), dtype=torch.long, device=device)

        # Extract token embeddings from BERT
        u, _, _ = model(inputs_ids_a, segment_ids, masked_pos)  # (batch_size, seq_len, hidden_dim)
        v, _, _ = model(inputs_ids_b, segment_ids, masked_pos)

        # Mean pooling for sentence embeddings
        u_mean = mean_pool(u, attention_a).detach().cpu().numpy()  # (batch_size, hidden_dim)
        v_mean = mean_pool(v, attention_b).detach().cpu().numpy()

        # Compute cosine similarity for each sentence pair in the batch
        similarity_scores = [cosine_similarity(u_mean[i], v_mean[i]) for i in range(batch_size)]

        # Compute average similarity for the batch
        batch_similarity = np.mean(similarity_scores)
        total_similarity += batch_similarity

# Compute the final average similarity score
average_similarity = total_similarity / num_batches
print(f"Average Cosine Similarity: {average_similarity:.4f}")


Average Cosine Similarity: 0.6256


In [100]:
save_path = f'./app/models/sentence_model.pt'
torch.save(model.state_dict(), save_path)

## 7. Inference

In [101]:
save_path = f'./app/models/sentence_model.pt'
model = BERT()
model.load_state_dict(torch.load(save_path))
model.to(device)

  model.load_state_dict(torch.load(save_path))


BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(23069, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x MultiHeadAttention(
      (W_Q): Linear(in_features=768, out_features=512, bias=True)
      (W_K): Linear(in_features=768, out_features=512, bias=True)
      (W_V): Linear(in_features=768, out_features=512, bias=True)
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (decoder): Linear(in_features=768, out_features=23069, bias=False)
)

In [102]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def mean_pooling(model_output, attention_mask):
    """Perform mean pooling to get sentence embeddings"""
    token_embeddings = model_output.last_hidden_state  # (batch_size, seq_len, hidden_dim)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / input_mask_expanded.sum(1)

def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize sentences with padding & truncation
    inputs_a = tokenizer(sentence_a, return_tensors='pt', truncation=True, padding='max_length', max_length=128).to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', truncation=True, padding='max_length', max_length=128).to(device)

    # Extract token embeddings from BERT
    with torch.no_grad():  # Disable gradient calculations
        output_a = model(**inputs_a)
        output_b = model(**inputs_b)

    # Apply mean pooling
    u = mean_pooling(output_a, inputs_a['attention_mask']).detach().cpu().numpy().reshape(-1)
    v = mean_pooling(output_b, inputs_b['attention_mask']).detach().cpu().numpy().reshape(-1)

    # Compute cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained BERT model & tokenizer (e.g., BERT-base-uncased from Hugging Face)
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)

# Define sentences
sentence_a = "Your contribution helped make it possible for us to provide our students with a quality education."
sentence_b = "Your contributions were of no help with our students' education."

# Compute similarity
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")


Cosine Similarity: 0.8057
