In [None]:
import json 
import pandas as pd
import torch
from datetime import datetime
from sentence_transformers import SentenceTransformer, evaluation, losses, InputExample, datasets
from sentence_transformers import util as sentenceutils
# Pick the torch device once: CUDA when torch reports a usable GPU, CPU otherwise.
# NOTE(review): `device` is not referenced by any other cell visible here.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Directory containing the dataset splits (ELI5 here; the same layout is
# expected for Wizard of Wikipedia).
# NOTE(review): `usr_path` is not assigned in any cell visible here — it has to
# exist in the kernel already, otherwise this cell raises NameError.
data_folder = f"{usr_path}/bi-encoder/eli5/splits/"

### Read Train Samples

In [None]:
# Load the training pairs from the splits directory.
train_pairs = pd.read_csv(f"{data_folder}train_pairs.csv")

In [None]:
# Shape the training data for the data loader.
# MultipleNegativesRankingLoss expects each example to be a
# [query, relevant_passage] pair.

train_questions = train_pairs['input'].tolist()
train_passages = train_pairs['passages_text'].tolist()

# One [question, passage] list per row, in row order.
pairs = [[question, passage] for question, passage in zip(train_questions, train_passages)]

# Wrap every pair in an InputExample.
train_samples = [InputExample(texts=pair) for pair in pairs]

### Create DataLoader

In [None]:
# Base checkpoint to fine-tune. `model` holds the checkpoint *name* (a string);
# it is also used when the output directory name is built for `fit`.
model = 'msmarco-distilbert-base-tas-b'
bi_encoder = SentenceTransformer(model_name_or_path=model)

In [None]:
# Training hyperparameters.
num_epochs = 3          # number of epochs passed to `fit`
train_batch_size = 16   # batch size handed to the training data loader

In [None]:
# MultipleNegativesRankingLoss requires that passages and queries are not
# duplicated, so batches are drawn with the no-duplicates data loader.
train_dataloader = datasets.NoDuplicatesDataLoader(
    train_samples,
    batch_size=train_batch_size,
)

# Pairs are scored with the dot product; scale=1 is passed alongside it.
train_loss = losses.MultipleNegativesRankingLoss(
    model=bi_encoder,
    scale=1,
    similarity_fct=sentenceutils.dot_score,
)

### Set up Evaluator

In [None]:
# Build the information-retrieval evaluator that is handed to `fit`.

# The converter parses each `relevant_ids` cell from its CSV text form.
# NOTE(review): pd.eval evaluates the cell contents as an expression; this
# assumes the split files are trusted. ast.literal_eval would be a stricter
# parser — confirm the parsed element types match before switching.
test_passages = pd.read_csv(
    f"{data_folder}test_passages.csv",
    converters={'relevant_ids': pd.eval},
)
test_corpus = pd.read_csv(f"{data_folder}test_corpus.csv")

# id -> text lookups for the two sides handed to the evaluator.
passages = dict(zip(test_passages['id'], test_passages['passages_text']))
corpus = dict(zip(test_corpus['id'], test_corpus['input']))

# id -> set of relevant ids; the column is converted to sets on the frame itself.
test_passages['relevant_ids'] = test_passages['relevant_ids'].apply(set)
relevant_docs = dict(zip(test_passages['id'], test_passages['relevant_ids']))

# NOTE(review): the evaluator's `queries` slot receives the 'passages_text'
# column and its `corpus` slot the 'input' column, whereas the training pairs
# treat 'input' as the query side — confirm this direction is intended.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    queries=passages,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

### Tune Model

In [None]:
# Parent directory for fine-tuned model checkpoints.
# NOTE(review): relies on `usr_path`, which is not assigned in any cell visible here.
output_folder = f"{usr_path}/bi-encoder/eli5/tuned_models/"

In [None]:
# Warm up over the first 10% of all training steps
# (batches per epoch * number of epochs), truncated to an integer.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_train_steps * 0.1)

In [None]:
# Fine-tune the bi-encoder on the (data loader, loss) objective, passing the IR
# evaluator and the precomputed warmup step count.
#
# The run directory is "<model>_<timestamp>". The original concatenated the
# model name and the timestamp with nothing in between (e.g. "...tas-b2024-01-01_..."),
# so an explicit "_" separator is inserted to keep the two parts readable.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=ir_evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    output_path=output_folder + model + "_" + run_timestamp,
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3718 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3718 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3718 [00:00<?, ?it/s]