In [None]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
pip install --upgrade nlpaug




In [None]:
!pip install sentence_transformers nlpaug seaborn datasets accelerate



In [None]:
# Import the necessary libraries

import os
import gzip
import csv
import math
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nlpaug.augmenter.word as naw  # a library for data augmentation specifically ContextualWordEmbsAug used for synonym replacement with BERT

from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, losses, InputExample, SentencesDataset, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset


# Apply the seaborn "whitegrid" theme to all subsequent plots.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')

In [None]:
# Experiment configuration: base checkpoint, compute device and training hyper-parameters

# Hugging Face checkpoint name; used for both the sentence encoder and the augmenter
model_name = 'bert-base-uncased'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

batch_size, num_epochs = 32, 5

print(device)

cuda


In [None]:
# Dataset location and model output directory

data_path = 'stsbenchmark.tsv.gz'
save_path = 'models/scenario1_model'

# Download the STS benchmark archive only when it is not already on disk, so that
# re-running this cell (or the whole notebook) does not fetch it again.
if not os.path.exists(data_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', data_path)

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [None]:
# Parse the STS benchmark TSV into training and validation examples

train, validation = [], []

# Destination list per value of the 'split' column; rows with any other split
# value are read but not stored.
split_targets = {'train': train, 'dev': validation}

with gzip.open(data_path, 'rt', encoding='utf8') as f:
    for row in csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE):

        # One example per row: the sentence pair, labelled with its score divided by 5
        sample = InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['score']) / 5.0,
        )

        target = split_targets.get(row['split'])
        if target is not None:
            target.append(sample)

In [None]:
# Data augmentation: synonym replacement using BERT
#
# action="substitute" makes the augmenter replace existing words with words
# predicted from the contextual embeddings. Note that action="insert" would add
# new words to the sentence instead, which is not synonym replacement.
aug = naw.ContextualWordEmbsAug(model_path=model_name, action="substitute", device=device)

augmented = []

# Build one augmented copy of every training pair, keeping the original label
for sample in tqdm(train, unit="docs"):

  # Augment both sentences of the pair (sentence1 and sentence2)
  augmented_texts = aug.augment(sample.texts)
  inp_example = InputExample(texts=augmented_texts, label=sample.label)
  augmented.append(inp_example)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
 34%|███▎      | 1937/5749 [01:13<02:26, 26.04docs/s]

In [None]:
# Token-level encoder: the pre-trained checkpoint named by `model_name`
word_embedding_model = models.Transformer(model_name)

# Mean-pool the token embeddings into one fixed-size sentence vector
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Chain the transformer and the pooling layer into a single SentenceTransformer
bi_encoder_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=bi_encoder_modules)

# This is a bi-encoder built with the SentenceTransformer framework: the transformer
# plus the pooling layer map each sentence to a fixed-size vector.

In [None]:
# Training data: the original pairs followed by their augmented copies
combined_examples = train + augmented
train_dataset = SentencesDataset(combined_examples, model)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Cosine-similarity loss, so that embeddings of similar sentence pairs end up
# close to each other in the vector space
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator used to monitor progress on the validation set
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation, name='sts-dev')

In [None]:
# Turn off Weights & Biases logging through its environment variable
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
# Fine-tune the model on the original + augmented training pairs
augmented_objective = (train_dataloader, train_loss)

model.fit(
    train_objectives=[augmented_objective],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    output_path=save_path,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
360,No log,No log,0.713461,0.708233
720,0.090400,No log,0.807231,0.804399
1000,0.041700,No log,0.827662,0.825717
1080,0.041700,No log,0.83035,0.828412
1440,0.041700,No log,0.841383,0.840855
1800,0.031200,No log,0.849491,0.850244


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# Train the model with only the train dataset (without augmented samples)

# Start from a fresh pre-trained encoder. Re-using the `model` that has already
# been fine-tuned on train + augmented would merely continue that training run,
# so the two scenarios could not be compared. `model` is deliberately rebound
# here so that the `model.fit(...)` call for this scenario trains the fresh encoder.
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataset_ = SentencesDataset(train, model)
train_dataloader_ = DataLoader(train_dataset_, shuffle=True, batch_size=batch_size)

# Use cosine similarity loss for training, which pulls the embeddings of similar
# sentence pairs close to each other in the vector space
train_loss = losses.CosineSimilarityLoss(model=model)

# Set the evaluator to monitor progress on the validation set
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation, name='sts-dev')

In [None]:
# Train the model (train dataset only)
# Write to a separate directory so this run does not overwrite the model that the
# train + augmented run stored at `save_path`.
model.fit(train_objectives=[(train_dataloader_, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          output_path=save_path + '_train_only')

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
180,No log,No log,0.858304,0.857576
360,No log,No log,0.858921,0.858383
540,0.016200,No log,0.860315,0.859921
720,0.016200,No log,0.861581,0.860809
900,0.016200,No log,0.862248,0.861778
