### Step 0: Import and prepare data

In [None]:
%%capture
!pip install nlpaug 

In [None]:
%%capture
!pip install -U sentence-transformers

In [None]:
from torch.utils.data import DataLoader
import torch
import math
from zipfile import ZipFile

from sentence_transformers import SentenceTransformer,  SentencesDataset, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader, InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

import nlpaug.augmenter.word as naw
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
# Load the labelled sentence pairs and discard the 'Unnamed: 0' column
# (presumably a leftover index column from the export -- TODO confirm).
dataset_path = "/content/drive/MyDrive/dataset_labeled.xlsx"
data = pd.read_excel(dataset_path).drop(['Unnamed: 0'], axis=1)

In [None]:
###### Configuration ######
model_name = 'bert-base-uncased'  # any huggingface/transformers pre-trained model can be specified here
device = "cuda" if torch.cuda.is_available() else "cpu"
top_k = 3  # neighbours kept per sentence when mining silver pairs (Step 2)
batch_size = 8
num_epochs = 10
max_seq_length = 512

# Take the timestamp once so that both output folders of a run share the same
# suffix; two separate datetime.now() calls can straddle a second boundary.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_tag = model_name.replace("/", "-")  # '/' in hub names would create sub-folders
cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_tag + '-' + run_timestamp
bi_encoder_path = 'output/bi-encoder/stsb_augsbert_SS_' + model_tag + '-' + run_timestamp

In [None]:
###### Cross-encoder ######
# Created with a single output label (num_labels=1).
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder ######
# Transformer backbone followed by a pooling layer with only mean-token
# pooling switched on (CLS and max pooling are disabled).
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Step 1: Train cross-encoder model

In [None]:
# Build the gold (labelled) train / dev / test examples.
#
# The score is rescaled on a copy of `data`: dividing `data['Score']` in place
# would divide it by 5 again every time this cell is re-run.
# NOTE(review): presumably the raw scores are on a 0-5 scale -- confirm.
data_scaled = data.copy()
data_scaled['Score'] = data_scaled['Score'] / 5.0

pair_columns = ['sentences 1', 'sentences 2', 'Score']
data_train = data_scaled.loc[data_scaled['split'] == 'train', pair_columns]
data_test = data_scaled.loc[data_scaled['split'] == 'test', pair_columns]
data_dev = data_scaled.loc[data_scaled['split'] == 'dev', pair_columns]


def make_examples(frame, swap=False):
    """Turn each row of `frame` into an InputExample.

    frame: DataFrame with 'sentences 1', 'sentences 2' and 'Score' columns.
    swap:  when True the two sentences are emitted in reversed order.
    Returns a list with one InputExample per row, in row order.
    """
    if swap:
        first, second = 'sentences 2', 'sentences 1'
    else:
        first, second = 'sentences 1', 'sentences 2'
    return [InputExample(texts=[left, right], label=float(score))
            for left, right, score in zip(frame[first], frame[second], frame['Score'])]


# Training pairs are added in both sentence orders; dev and test keep the
# original order only.
gold_samples = make_examples(data_train) + make_examples(data_train, swap=True)
test_samples = make_examples(data_test)
dev_samples = make_examples(data_dev)

In [None]:
# NOTE(review): intentionally disabled. If uncommented, `kill -9 -1` sends SIGKILL
# to every process the current user may signal -- presumably kept as a way to
# force-reset the runtime. Consider deleting this cell before sharing the notebook.
# !kill -9 -1

In [None]:
# Batch the gold examples for training; shuffling is enabled.
train_dataloader = DataLoader(gold_samples, batch_size=batch_size, shuffle=True)

# Evaluator built from the dev examples, handed to the training loop below.
evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='HR-Akatsuki-dev')

# Warm-up covers 10% of all training steps (batches per epoch * epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

# Fine-tune the cross-encoder on the gold pairs.
fit_options = dict(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=cross_encoder_path,
)
cross_encoder.fit(**fit_options)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1625.0, style=ProgressStyle(description_w…

### Step 2: Prepare silver pairs to label

In [None]:
##### Step 2: Prepare silver pairs to label #####

# Unique training sentences in first-seen order. A dict keeps insertion order,
# so the indices (and therefore the mined pairs) do not change between kernel
# restarts the way iterating over a set of strings does.
sentences = list(dict.fromkeys(text for sample in gold_samples for text in sample.texts))
sent2idx = {sentence: idx for idx, sentence in enumerate(sentences)}


def pair_key(first_idx, second_idx):
    """Order-independent key for a pair of sentence indices (plain ints)."""
    return (min(first_idx, second_idx), max(first_idx, second_idx))


# Pairs that must not be emitted as silver data: every gold pair, plus every
# silver pair as soon as it has been mined.
duplicates = set(pair_key(sent2idx[sample.texts[0]], sent2idx[sample.texts[1]])
                 for sample in gold_samples)

# For simplicity we use a pretrained model
semantic_model_name = 'bert-base-nli-stsb-mean-tokens'
semantic_search_model = SentenceTransformer(semantic_model_name)

# Encode all unique sentences present in the training dataset
embeddings = semantic_search_model.encode(sentences, batch_size=batch_size, convert_to_tensor=True)

# Retrieve the top-k most similar sentences for every sentence.
# One extra neighbour is requested because each sentence retrieves itself;
# the value is clamped because torch.topk raises when k exceeds the corpus size.
silver_data = []
neighbours_to_fetch = min(top_k + 1, len(sentences))
for idx in tqdm(range(len(sentences)), unit="docs"):
    cos_scores = util.pytorch_cos_sim(embeddings[idx], embeddings)[0].cpu()
    top_results = torch.topk(cos_scores, k=neighbours_to_fetch)
    # Convert the index tensors to ints: tensors hash by object identity, so a
    # tuple containing one is never found in `duplicates`.
    for iid in map(int, top_results[1]):
        key = pair_key(idx, iid)
        if iid == idx or key in duplicates:
            continue
        silver_data.append((sentences[idx], sentences[iid]))
        duplicates.add(key)

# Reload the cross-encoder saved during Step 1 and score the mined pairs with it.
cross_encoder = CrossEncoder(cross_encoder_path)
silver_scores = cross_encoder.predict(silver_data)

### Step 3: Train bi-encoder model

In [None]:
#### Step 3: Train bi-encoder model ####

# Attach to every silver pair the score the cross-encoder predicted for it.
silver_samples = [
    InputExample(texts=[pair[0], pair[1]], label=score)
    for pair, score in zip(silver_data, silver_scores)
]

# Gold and silver examples are trained on together.
train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='HR-Akatsuki-dev')

# Warm-up length: 10% of (examples * epochs / batch size), rounded up.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=bi_encoder_path,
)

In [None]:
#### Evaluate the trained bi-encoder on the held-out test split ####
dev_eval_name = 'HR-Akatsuki-dev'    # name used by the dev evaluator during bi-encoder training
test_eval_name = 'HR-Akatsuki-test'  # separate name so the test scores get their own CSV


def find_results_csv(eval_name):
    """Return the path of the results CSV written by the evaluator `eval_name`.

    Looks in `bi_encoder_path/eval/` first and then in `bi_encoder_path`.
    NOTE(review): which of the two the training loop writes to appears to
    depend on the installed sentence-transformers version -- confirm.

    Raises FileNotFoundError when the CSV is in neither place.
    """
    filename = 'similarity_evaluation_{}_results.csv'.format(eval_name)
    for folder in (os.path.join(bi_encoder_path, 'eval'), bi_encoder_path):
        candidate = os.path.join(folder, filename)
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(
        "No results file {!r} found under {!r}".format(filename, bi_encoder_path))


bi_encoder = SentenceTransformer(bi_encoder_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name=test_eval_name)
test_evaluator(bi_encoder, output_path=bi_encoder_path)

#### Visualize the dev and test performance of the trained model ####
# Only the most recent row of each file is used. The dev row is the last
# evaluation logged during training; the test row is the one written just above.
# No evaluation on the training split is run, so there is no "train" bar.
dev_result = pd.read_csv(find_results_csv(dev_eval_name)).iloc[[-1]]
test_result = pd.read_csv(find_results_csv(test_eval_name)).iloc[[-1]]

# Drop the first two bookkeeping columns and keep the metric columns.
result = pd.concat([dev_result, test_result]).iloc[:, 2:].copy()
result['name'] = ["dev", "test"]
plot_result = pd.melt(result, id_vars=['name'])

plt.figure(figsize=(12, 6))
sns.barplot(x="value", y="variable", hue="name", data=plot_result, palette="Blues_d");