# **Installing the necessary packages and libraries**

In [None]:
!pip install transformers sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets, sentence-transformers
Successfully installed datasets-2.17.1 dill

# **Importing the necessary dependencies**

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import datetime
import random
import time
from tqdm import tqdm

# **Setting the device**

In [None]:
# Pick the compute device once; later cells move the model and every batch onto it.
if not torch.cuda.is_available():
  print("No GPU available, using CPU.")
  device = torch.device("cpu")
else:
  device = torch.device("cuda")
  print(f'There are {torch.cuda.device_count()} gpu(s) available.')
  print(f"We'll use {torch.cuda.get_device_name(0)} GPU.")

There are 1 gpu(s) available.
We'll use Tesla T4 GPU.


# **Loading and Previewing the dataset**

Loading the STSB (Semantic Textual Similarity Benchmark) dataset.

In [None]:
# Fetch the English ("en") configuration of the stsb_multi_mt dataset;
# printing the result lists the available splits and their columns.
dataset = load_dataset("stsb_multi_mt", name="en")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/470k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


We'll use the *train* split to train the model and the *dev* split to validate it during training.

In [None]:
# Show one raw training example (row 98): two sentences and their similarity score.
print(
    "A sample from STSB dataset's training split:",
    dataset['train'][98],
    sep="\n",
)

A sample from STSB dataset's training split:
{'sentence1': 'A man is slicing potatoes.', 'sentence2': 'A woman is peeling potato.', 'similarity_score': 2.200000047683716}


# **Preparing the Dataset**

Loading the **"Bert Base Uncased"** tokenizer, which our dataset class will use.

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint. It is kept as a module-level
# global because STSBDataset.get_batch_texts and predict_similarity look it up by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

We'll define the dataset class **"STSBDataset"**, along with a small helper, `collate_fn`, that splits a batch into per-pair features.

In [None]:
class STSBDataset(torch.utils.data.Dataset):
  """Map-style dataset of sentence pairs with rescaled similarity labels.

  Each item is a ``(encoding, label)`` tuple: the tokenizer output for the two
  sentences of one pair and the pair's similarity score divided by ``max_score``.

  Args:
    dataset: iterable of rows exposing the keys 'sentence1', 'sentence2' and
      'similarity_score' (for example one split of the STSb DatasetDict).
    tokenizer: optional callable used to encode a pair. When left as None the
      module-level ``tokenizer`` is looked up at encode time, which is the
      original behaviour of this class.
    max_length: length every sentence is padded / truncated to.
    max_score: divisor applied to the raw scores (5.0 by default; STSb scores
      are presumably on a 0-5 scale, which would map labels to [0, 1]).
  """

  def __init__(self, dataset, tokenizer=None, max_length=128, max_score=5.0):
    self.tokenizer = tokenizer
    self.max_length = max_length

    self.first_sentences = []
    self.second_sentences = []
    self.normalized_similarity_scores = []

    # Walk the rows once instead of once per column.
    for row in dataset:
      self.first_sentences.append(row['sentence1'])
      self.second_sentences.append(row['sentence2'])
      self.normalized_similarity_scores.append(row['similarity_score'] / max_score)

    # str() guards against non-string cells reaching the tokenizer.
    self.concatenated_sentences = [[str(x), str(y)] for x, y in zip(self.first_sentences, self.second_sentences)]

  def __len__(self):
    """Return the number of sentence pairs."""
    return len(self.concatenated_sentences)

  def get_batch_labels(self, idx):
    """Return the rescaled similarity score of pair ``idx`` as a tensor."""
    return torch.tensor(self.normalized_similarity_scores[idx])

  def get_batch_texts(self, idx):
    """Encode the two sentences of pair ``idx`` as one padded two-row batch."""
    # Fall back to the notebook-level tokenizer when none was injected.
    encode = self.tokenizer if self.tokenizer is not None else tokenizer
    return encode(self.concatenated_sentences[idx], padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")

  def __getitem__(self, idx):
    """Return ``(encoding, label)`` for pair ``idx``."""
    batch_texts = self.get_batch_texts(idx)
    batch_labels = self.get_batch_labels(idx)

    return (batch_texts, batch_labels)

def collate_fn(texts):
  """Split a batched encoding into one feature dict per sentence pair.

  Args:
    texts: mapping with 'input_ids' and 'attention_mask' entries that can be
      iterated in parallel along their first dimension. Other keys are ignored.

  Returns:
    A list of ``{'input_ids': ..., 'attention_mask': ...}`` dicts, one per
    position of the paired iteration.
  """
  features = []
  for input_id, attention_mask in zip(texts['input_ids'], texts['attention_mask']):
    features.append({'input_ids': input_id, 'attention_mask': attention_mask})
  return features

# **Defining the Model Class based on BERT**

Here, we'll create the actual model.

In [None]:
class BertForSTS(torch.nn.Module):
    """Sentence encoder: a 'bert-base-uncased' transformer followed by pooling.

    The transformer and pooling modules are kept as attributes and are also
    chained inside a SentenceTransformer, which is what ``forward`` runs.
    """

    def __init__(self):
        super().__init__()
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        # Size the pooling layer from the transformer's word-embedding width.
        embedding_dim = self.bert.get_word_embedding_dimension()
        self.pooling_layer = models.Pooling(embedding_dim)
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """Return the 'sentence_embedding' produced for ``input_data``.

        ``input_data`` is the feature dict handed to the SentenceTransformer
        pipeline; callers in this notebook supply 'input_ids' and
        'attention_mask'.
        """
        return self.sts_bert(input_data)['sentence_embedding']

In [None]:
# Build the model, then move its weights to the selected device.
# The .to(device) call is left as the cell's last expression, so the notebook
# also displays the module structure.
model = BertForSTS()
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
)

# **Defining the Loss Function**

In [None]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of paired embeddings and target labels.

    Args:
        loss_fn: module comparing predicted similarities with the labels
            (mean squared error by default).
        transform_fn: module applied to the cosine similarities before the
            comparison (identity by default).

    Note: the default modules are created once, when the class is defined, and
    are shared by every instance built without explicit arguments.
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super().__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for one batch.

        Args:
            inputs: sequence with one entry per sentence pair; entry ``[0]`` is
                the first sentence's embedding and ``[1]`` the second's.
            labels: target similarities, one per pair.

        Returns:
            The value returned by ``loss_fn``.
        """
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # Flatten instead of squeeze(): squeeze() turns a batch of one into a
        # 0-d tensor, whose shape no longer matches `outputs` (shape (1,)) and
        # makes MSELoss fall back to broadcasting with a warning.
        return self.loss_fn(outputs, labels.reshape(-1))

# **Preparing the Training & Validation Splits**

In [None]:
# Wrap the dataset's predefined splits: 'train' is used for fitting and 'dev'
# for validation.
train_ds = STSBDataset(dataset['train'])
valid_ds = STSBDataset(dataset['dev'])

# No random split is made here; the sizes simply reflect the dataset's own
# train / dev partitions.
train_size = len(train_ds)
valid_size = len(valid_ds)

# Report both sizes, right-aligned with a thousands separator.
print("{:>5,} training samples".format(train_size))
print("{:>5,} validation samples".format(valid_size))

5,749 training samples
1,500 validation samples


In [None]:
# Number of sentence pairs per batch, shared by both loaders.
batch_size = 8

# Training batches are drawn in random order.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

# Validation uses the same batch size and leaves shuffling at its default.
validation_dataloader = DataLoader(valid_ds, batch_size=batch_size, num_workers=4)



# **Defining the Optimizer and Scheduler**

In [None]:
# Number of full passes over the training data.
epochs = 8

# AdamW (Adam with decoupled weight decay) over every model parameter.
optimizer = AdamW(model.parameters(), lr=1e-6)

# The scheduler is stepped once per training batch, so the schedule spans
# [batches per epoch] x [epochs] steps.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first. The text is the
    ``str()`` of a ``datetime.timedelta``, so hours are not zero-padded and
    durations of a day or more gain a ``'N day(s), '`` prefix.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

# **Training the Model**

In [None]:
def _batch_loss(batch, labels, criterion):
  """Move one batch to `device`, embed every sentence pair and return its loss.

  `batch` is modified in place: its 'input_ids' and 'attention_mask' entries are
  replaced by copies on `device`. Only those two entries reach the model,
  because collate_fn keeps nothing else.
  """
  batch['input_ids'] = batch['input_ids'].to(device)
  batch['attention_mask'] = batch['attention_mask'].to(device)

  # One feature dict per sentence pair; the model is run once per pair.
  features = collate_fn(batch)
  embeddings = [model(feature) for feature in features]

  return criterion(embeddings, labels.to(device))


def train():
  """Fine-tune the global `model` and validate it after every epoch.

  Relies on the notebook-level `model`, `optimizer`, `scheduler`, `epochs`,
  `train_dataloader`, `validation_dataloader` and `device`.

  Returns:
    A tuple ``(model, training_stats)`` where `training_stats` holds one dict
    per epoch with the keys 'epoch', 'Training Loss', 'Valid. Loss',
    'Training Time' and 'Validation Time' (times are format_time strings).
  """
  seed_val = 42

  criterion = CosineSimilarityLoss()
  criterion = criterion.to(device)

  random.seed(seed_val)
  torch.manual_seed(seed_val)

  # Per-epoch training / validation loss and timings.
  training_stats = []
  total_t0 = time.time()

  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()

      total_train_loss = 0

      model.train()

      # For each batch of training data...
      for train_data, train_label in tqdm(train_dataloader):

          # Clear gradients left over from the previous step.
          model.zero_grad()

          loss = _batch_loss(train_data, train_label, criterion)
          total_train_loss += loss.item()

          loss.backward()
          # Cap the gradient norm at 1.0 before the parameter update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          # The learning-rate schedule advances once per batch.
          scheduler.step()

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)

      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)

      print("")
      print("  Average training loss: {0:.5f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))

      # ========================================
      #               Validation
      # ========================================

      print("")
      print("Running Validation...")

      t0 = time.time()

      model.eval()

      total_eval_loss = 0

      # Evaluate data for one epoch
      for val_data, val_label in tqdm(validation_dataloader):

          # No gradients are needed for either the forward pass or the loss.
          with torch.no_grad():
              loss = _batch_loss(val_data, val_label, criterion)

          total_eval_loss += loss.item()

      # Calculate the average loss over all of the batches.
      avg_val_loss = total_eval_loss / len(validation_dataloader)

      # Measure how long the validation run took.
      validation_time = format_time(time.time() - t0)

      print("  Validation Loss: {0:.5f}".format(avg_val_loss))
      print("  Validation took: {:}".format(validation_time))

      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )

  print("")
  print("Training complete!")

  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

  return model, training_stats

In [None]:
# Run the full training loop. train() returns the global `model` it trained
# together with one statistics dict per epoch.
# The recorded run below took about 46 minutes on a Tesla T4.
model, training_stats = train()


Training...


100%|██████████| 719/719 [05:19<00:00,  2.25it/s]



  Average training loss: 0.06452
  Training epoch took: 0:05:20

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.12it/s]


  Validation Loss: 0.04816
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:16<00:00,  2.27it/s]



  Average training loss: 0.03846
  Training epoch took: 0:05:17

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.11it/s]


  Validation Loss: 0.03979
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:17<00:00,  2.26it/s]



  Average training loss: 0.03305
  Training epoch took: 0:05:18

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.12it/s]


  Validation Loss: 0.03686
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:16<00:00,  2.28it/s]



  Average training loss: 0.02999
  Training epoch took: 0:05:16

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.12it/s]


  Validation Loss: 0.03538
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:16<00:00,  2.27it/s]



  Average training loss: 0.02857
  Training epoch took: 0:05:16

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.08it/s]


  Validation Loss: 0.03423
  Validation took: 0:00:27

Training...


100%|██████████| 719/719 [05:16<00:00,  2.27it/s]



  Average training loss: 0.02734
  Training epoch took: 0:05:17

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.08it/s]


  Validation Loss: 0.03385
  Validation took: 0:00:27

Training...


100%|██████████| 719/719 [05:17<00:00,  2.27it/s]



  Average training loss: 0.02651
  Training epoch took: 0:05:17

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.12it/s]


  Validation Loss: 0.03353
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:17<00:00,  2.27it/s]



  Average training loss: 0.02584
  Training epoch took: 0:05:17

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.12it/s]

  Validation Loss: 0.03345
  Validation took: 0:00:26

Training complete!
Total training took 0:45:49 (h:mm:ss)





In [None]:
# Tabulate the per-epoch statistics with the epoch number as the row index.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Leave the frame as the last expression so the notebook renders it.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.064515,0.048159,0:05:20,0:00:26
2,0.038462,0.03979,0:05:17,0:00:26
3,0.03305,0.036862,0:05:18,0:00:26
4,0.029987,0.035378,0:05:16,0:00:26
5,0.028567,0.034227,0:05:16,0:00:27
6,0.027342,0.033851,0:05:17,0:00:27
7,0.026512,0.033526,0:05:17,0:00:26
8,0.025844,0.033447,0:05:17,0:00:26


In [None]:
# Load only the English test split.
test_dataset = load_dataset("stsb_multi_mt", "en", split="test")

# Collect each column, then pair the sentences up as [first, second] string lists.
first_sent = [example['sentence1'] for example in test_dataset]
second_sent = [example['sentence2'] for example in test_dataset]
full_text = [list(map(str, pair)) for pair in zip(first_sent, second_sent)]


In [None]:
# Put the model in evaluation mode before scoring test pairs; as the cell's
# last expression, the call also displays the module.
model.eval()


BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
)

In [None]:
def predict_similarity(sentence_pair):
  """Return the cosine similarity the model predicts for two sentences.

  Uses the notebook-level `tokenizer`, `model` and `device`.

  Args:
    sentence_pair: sequence holding the two sentences to compare.

  Returns:
    The cosine similarity of the two sentence embeddings as a Python float.
  """
  test_input = tokenizer(sentence_pair, padding='max_length', max_length = 128, truncation=True, return_tensors="pt").to(device)

  # Training fed the model only 'input_ids' and 'attention_mask', so drop the
  # segment ids when the tokenizer produced them.
  if 'token_type_ids' in test_input:
    del test_input['token_type_ids']

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(test_input)

  # Row 0 is the first sentence's embedding, row 1 the second's.
  return torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()

In [None]:
# Score test pair 100 and print both sentences with the rounded prediction.
example_1 = full_text[100]
print(
    f"Sentence 1: {example_1[0]}",
    f"Sentence 2: {example_1[1]}",
    f"Predicted similarity score: {round(predict_similarity(example_1), 2)}",
    sep="\n",
)

Sentence 1: A cat is walking around a house.
Sentence 2: A woman is peeling potato.
Predicted similarity score: -0.01


In [None]:
# Score test pair 130 and print both sentences with the rounded prediction.
example_2 = full_text[130]
print(
    f"Sentence 1: {example_2[0]}",
    f"Sentence 2: {example_2[1]}",
    f"Predicted similarity score: {round(predict_similarity(example_2), 2)}",
    sep="\n",
)

Sentence 1: Two men are playing football.
Sentence 2: Two men are practicing football.
Predicted similarity score: 0.81
