In [None]:
# This notebook provides the code for fine-tuning the bi-encoder.

In [None]:
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
from datetime import datetime
from zipfile import ZipFile
import logging
import csv
import sys
import torch
import math
import gzip
import os

In [None]:
#### Logging setup: INFO-level records, each rendered as "<timestamp> - <message>"
#### and emitted through sentence-transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)
#### /logging setup



## Read and prepare data sets

In [None]:
dataset_path = '' # specify path to training, dev, and test data sets

######### Read data and convert it to PyTorch dataloader ##########


def _read_scored_pairs(csv_path):
    """Load one split of scored text pairs from a CSV file.

    The file must start with a header row containing the columns ``label1``
    and ``label2`` (the two texts of a pair) and ``score`` (parsed with
    ``float`` and used as the example's label).

    Args:
        csv_path: Path of the comma-delimited, UTF-8 encoded file.

    Returns:
        A list with one ``InputExample`` per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the required columns is missing from the header.
        ValueError: If a ``score`` value cannot be parsed as a float.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation recommends for any file object passed to a reader.
    with open(csv_path, encoding='utf8', newline='') as fIn:
        # QUOTE_NONE: quote characters are treated as ordinary text, so a
        # field can never contain the delimiter itself.
        reader = csv.DictReader(fIn, delimiter=',', quoting=csv.QUOTE_NONE)
        return [
            InputExample(texts=[row['label1'], row['label2']], label=float(row['score']))
            for row in reader
        ]


logging.info("Reading {} train dataset".format('scored_pairs'))

# The three splits share one file layout, so they go through the same loader.
train_samples = _read_scored_pairs(os.path.join(dataset_path, "train.csv"))
dev_samples = _read_scored_pairs(os.path.join(dataset_path, "dev.csv"))
test_samples = _read_scored_pairs(os.path.join(dataset_path, "test.csv"))

# Report split sizes so an empty or truncated file is noticed before training.
logging.info("Loaded %d train / %d dev / %d test samples",
             len(train_samples), len(dev_samples), len(test_samples))

# Convert the dataset to DataLoaders ready for training
logging.info("Initializing Dataloaders for training")

batch_size = 16
# Only the training split is wrapped in a DataLoader (reshuffled on each pass);
# dev and test samples are consumed as plain lists by the evaluators.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)


## Specifying the model and save paths

In [None]:
# Pre-trained checkpoint to fine-tune and the basic run settings
model_name = 'bert-base-uncased'
num_epochs = 5
max_seq_length = 128
use_cuda = torch.cuda.is_available()

# Output directory: "output/<model name with '/' replaced by '-'>-<start timestamp>"
trained_model_save_path = f"output/{model_name.replace('/', '-')}-{datetime.now():%Y-%m-%d_%H-%M-%S}"


In [None]:
###### Model architecture ######

logging.info("Loading model: %s", model_name)

# Transformer module: maps tokens to embeddings, with max_seq_length passed
# through from the run configuration
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

# Pooling module: turns the token embeddings into one fixed-size sentence
# vector; only mean pooling is switched on
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain both modules (transformer first, then pooling) into the bi-encoder
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Print the assembled model's text representation to check the module stack
print(model)

## Defining the loss function and evaluator

In [None]:
###### Specifying loss function and evaluator ######

# Evaluator built from the dev samples; it is handed to model.fit for
# evaluation during training
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples)

# Training objective: cosine-similarity loss bound to the model being trained
train_loss = losses.CosineSimilarityLoss(model=model)

## Configure training

In [None]:
# Configure the training.
# Warm-up covers 10% of all training steps (batches per epoch x epochs), rounded up.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: %s", warmup_steps)

# Train the bi-encoder model; the evaluator is passed with evaluation_steps=1000
# and the result is written to trained_model_save_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=trained_model_save_path,
)

## Evaluating the model on the test data set

In [None]:
# Loading the fine-tuned model from the directory that was passed to
# model.fit as output_path
trained_model = SentenceTransformer(trained_model_save_path)

# Build an evaluator over the held-out test samples (never seen by model.fit)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples)
# Last expression of the cell, so its return value is shown as the cell output
trained_model.evaluate(test_evaluator)