In [9]:
import os
import sys
import yaml
import json
import tiktoken
import openai
import torch
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample

from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
import logging
from datetime import datetime
import os
import gzip
import csv

from datetime import datetime
import gzip
import os
import tarfile
import tqdm
import numpy as np
import wandb

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# Project root. The relative paths used in this notebook ('data/...',
# 'train_reranker/models/...') resolve against it because of the chdir below.
# Set the WIKI_CHEAT_ROOT environment variable to run on another machine;
# the default is the original location.
root_path = os.environ.get('WIKI_CHEAT_ROOT', '/home/ec2-user/sarang/wiki_cheat')

# Make project modules importable. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate sys.path entries.
_abs_root_path = os.path.abspath(root_path)
if _abs_root_path not in sys.path:
    sys.path.insert(0, _abs_root_path)
os.chdir(root_path)

#### Just some code to print log messages (INFO and above) to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

## Load dataset

In [3]:
def load_json(path):
    """Read the JSON file at `path` and return the parsed object."""
    # Explicit encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII characters in the text.
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)


### Combined dataset
test_data_path = 'data/combined_test_data.json'
combined_test_data = load_json(test_data_path)

train_data_path = 'data/combined_train_data.json'
combined_train_data = load_json(train_data_path)

### Synth dataset
test_data_path = 'data/test_data_synth_nf.json'
test_data_synth = load_json(test_data_path)

train_data_path = 'data/train_data_synth_nf.json'
train_data_synth = load_json(train_data_path)

### Wikiqa dataset
test_data_path = 'data/test_data_wikiqa_nf.json'
test_data_wikiqa = load_json(test_data_path)

train_data_path = 'data/train_data_wikiqa_nf.json'
train_data_wikiqa = load_json(train_data_path)

In [4]:
# Inspect the first test record; the output below shows its keys: 'query', 'title', 'pos', 'negs'.
combined_test_data[0]

{'query': 'HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US',
 'title': 'African immigration to the United States',
 'pos': 'As such, African immigrants are to be distinguished from African American people, the latter of whom are descendants of mostly West and Central Africans who were involuntarily brought to the United States by means of the historic Atlantic slave trade .',
 'negs': ['African immigration to the United States refers to immigrants to the United States who are or were nationals of Africa .',
  'The term African in the scope of this article refers to geographical or national origins rather than racial affiliation.',
  'From the Immigration and Nationality Act of 1965 to 2007, an estimated total of 0.8 to 0.9 million Africans immigrated to the United States, accounting for roughly 3.3% of total immigration to the United States during this period.',
  'African immigrants in the United States come from almost all regions in Africa and do not constitute a homogeneous group.'

### Quick check of how many training samples exceed the 512-token limit. Only 1 does (max length 545), so the wiki passages do not need to be chunked further.

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, losses

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Pair each training query with its positive passage only (no negatives, no labels).
input_examples = [
    InputExample(texts=[record['query'], record['pos']])
    for record in combined_train_data
]

# Token count of every positive passage, tokenized one passage at a time.
passage_lengths = [
    len(tokenizer([example.texts[1]], padding=True)['input_ids'][0])
    for example in input_examples
]

# Longest passage seen, and how many passages are over 512 tokens.
max_toks = max(passage_lengths, default=0)
cnt = sum(1 for length in passage_lengths if length > 512)

max_toks, cnt

Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors


(545, 1)

## Train code

In [13]:
def train(train_config, test_queries, test_passages, test_labels, train_examples=None):
    """Fine-tune a CrossEncoder re-ranker and evaluate it as a binary classifier.

    Args:
        train_config: dict of run settings. Required keys: 'base_model',
            'num_labels', 'max_length', 'device', 'train_batch_size', 'epochs',
            'warmup_steps'. Optional keys: 'evaluation_steps' (default 0) and
            'evaluator_name' (default ''). The key 'model_save_path' is written
            into this dict as a side effect so the caller can find the output.
        test_queries: queries of the evaluation pairs.
        test_passages: passages of the evaluation pairs, parallel to test_queries.
        test_labels: labels of the evaluation pairs, parallel to the two above.
        train_examples: InputExample objects to train on. When omitted, the
            module-level `input_examples` is used, as this function did before
            the parameter existed.

    Raises:
        ValueError: if test_queries and test_passages differ in length.
    """
    # zip() would silently drop the unmatched tail; fail before loading a model.
    if len(test_queries) != len(test_passages):
        raise ValueError(
            'test_queries and test_passages must have the same length, got '
            f'{len(test_queries)} and {len(test_passages)}'
        )

    if train_examples is None:
        # Backward-compatible fallback to the notebook-level variable.
        train_examples = input_examples

    # Output directory: base model name (slashes replaced) plus a start timestamp.
    train_config['model_save_path'] = 'train_reranker/models/'+train_config['base_model'].replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    model = CrossEncoder(train_config['base_model'],
                         num_labels=train_config['num_labels'],
                         max_length=train_config['max_length'],
                         device=train_config['device'])

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_config['train_batch_size'])

    # The configured evaluator name was previously ignored, which left the
    # dataset name blank in the evaluation log lines.
    evaluator = CEBinaryClassificationEvaluator(
        [[query, passage] for query, passage in zip(test_queries, test_passages)],
        test_labels,
        name=train_config.get('evaluator_name', ''),
    )

    # Train the model
    model.fit(train_dataloader=train_dataloader,
              evaluator=evaluator,
              epochs=train_config['epochs'],
              warmup_steps=train_config['warmup_steps'],
              # Previously dropped: the config value never reached fit().
              evaluation_steps=train_config.get('evaluation_steps', 0),
              output_path=train_config['model_save_path'],
              use_amp=True,
              show_progress_bar=True
    )


## Train Data

In [14]:
# Labelled training pairs: per record, one positive (label 1) followed by
# one pair per negative passage (label 0).
input_examples = []
for record in combined_train_data:
    query = record['query']
    input_examples.append(InputExample(texts=[query, record['pos']], label=1))
    input_examples.extend(
        InputExample(texts=[query, negative], label=0)
        for negative in record['negs']
    )

len(input_examples)

58366

##  Eval Data

In [15]:
# Three parallel lists describing the evaluation pairs. For each record the
# positive passage comes first (label 1), then its negatives (label 0).
test_queries, test_passages, test_labels = [], [], []
for record in combined_test_data:
    query = record['query']
    positive = record['pos']
    negatives = list(record['negs'])

    test_queries.extend([query] * (1 + len(negatives)))
    test_passages.extend([positive] + negatives)
    test_labels.extend([1] + [0] * len(negatives))

In [14]:
# Total number of evaluation pairs (one per positive plus one per negative passage).
len(test_passages)

24093

## Reranker Experiments
### Experiments with various batch sizes with MiniLM-L-6 and MiniLM-L-12
### Batch size 128 with MiniLM-L-6 and batch size 64 with MiniLM-L-12 both throw OOM

In [10]:
%%time

# Checkpoint to fine-tune in this run.
base_model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'

train_config = {
    'base_model': base_model_name,
    'epochs': 5,
    'train_batch_size': 32,  # batch size 64 ran out of memory with this model
    'warmup_steps': 50,
    'evaluation_steps': 2000,
    'num_labels': 1,
    'max_length': 512,
    'device': 'cuda',
    'evaluator_name': 'eval_on_test_data',
    'train': True,
}

train(train_config, test_queries, test_passages, test_labels)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1824 [00:00<?, ?it/s]

2024-01-28 19:26:03 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset after epoch 0:
2024-01-28 19:26:37 - Accuracy:           98.12	(Threshold: 1.7351)
2024-01-28 19:26:37 - F1:                 89.53	(Threshold: 1.0694)
2024-01-28 19:26:37 - Precision:          90.00
2024-01-28 19:26:37 - Recall:             89.06
2024-01-28 19:26:37 - Average Precision:  95.41

2024-01-28 19:26:37 - Save model to train_reranker/models/cross-encoder-ms-marco-MiniLM-L-12-v2-2024-01-28_19-22-20


Iteration:   0%|          | 0/1824 [00:00<?, ?it/s]

2024-01-28 19:30:18 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset after epoch 1:
2024-01-28 19:30:52 - Accuracy:           98.18	(Threshold: 2.2896)
2024-01-28 19:30:52 - F1:                 89.82	(Threshold: 1.3535)
2024-01-28 19:30:52 - Precision:          88.99
2024-01-28 19:30:52 - Recall:             90.66
2024-01-28 19:30:52 - Average Precision:  95.36



Iteration:   0%|          | 0/1824 [00:00<?, ?it/s]

2024-01-28 19:34:33 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset after epoch 2:
2024-01-28 19:35:07 - Accuracy:           98.14	(Threshold: 2.7202)
2024-01-28 19:35:07 - F1:                 89.68	(Threshold: 2.7044)
2024-01-28 19:35:07 - Precision:          90.45
2024-01-28 19:35:07 - Recall:             88.92
2024-01-28 19:35:07 - Average Precision:  95.39



Iteration:   0%|          | 0/1824 [00:00<?, ?it/s]

2024-01-28 19:38:47 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset after epoch 3:
2024-01-28 19:39:21 - Accuracy:           98.11	(Threshold: 3.7485)
2024-01-28 19:39:21 - F1:                 89.62	(Threshold: 2.7342)
2024-01-28 19:39:21 - Precision:          88.95
2024-01-28 19:39:21 - Recall:             90.29
2024-01-28 19:39:21 - Average Precision:  95.30



Iteration:   0%|          | 0/1824 [00:00<?, ?it/s]

2024-01-28 19:43:01 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset after epoch 4:
2024-01-28 19:43:35 - Accuracy:           98.07	(Threshold: 4.6845)
2024-01-28 19:43:35 - F1:                 89.34	(Threshold: 2.8917)
2024-01-28 19:43:35 - Precision:          88.15
2024-01-28 19:43:35 - Recall:             90.57
2024-01-28 19:43:35 - Average Precision:  95.15

CPU times: user 24min 9s, sys: 1min 7s, total: 25min 17s
Wall time: 21min 15s


## Evaluation (on the combined test dataset)

### Baseline original model 

In [47]:
# Baseline: score the original checkpoint, without fine-tuning, on the test pairs.
base_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
model = CrossEncoder(base_model_name, num_labels=1, device='cuda')
evaluator = CEBinaryClassificationEvaluator(
    [list(pair) for pair in zip(test_queries, test_passages)],
    test_labels,
)
scores = evaluator(model)

2024-01-28 18:51:33 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset:
2024-01-28 18:51:51 - Accuracy:           96.50	(Threshold: 5.9702)
2024-01-28 18:51:51 - F1:                 80.50	(Threshold: 5.7280)
2024-01-28 18:51:51 - Precision:          80.59
2024-01-28 18:51:51 - Recall:             80.40
2024-01-28 18:51:51 - Average Precision:  86.26



### Trained on combined dataset, cross-encoder-ms-marco-MiniLM-L-12 with batch size 32

In [11]:
# Score the fine-tuned MiniLM-L-12 checkpoint saved under train_reranker/models/.
base_model_name = 'train_reranker/models/cross-encoder-ms-marco-MiniLM-L-12-v2-2024-01-28_19-22-20'
model = CrossEncoder(base_model_name, num_labels=1, device='cuda')
evaluator = CEBinaryClassificationEvaluator(
    [list(pair) for pair in zip(test_queries, test_passages)],
    test_labels,
)
scores = evaluator(model)

2024-01-28 19:45:36 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset:
2024-01-28 19:46:10 - Accuracy:           98.12	(Threshold: 1.7351)
2024-01-28 19:46:10 - F1:                 89.53	(Threshold: 1.0694)
2024-01-28 19:46:10 - Precision:          90.00
2024-01-28 19:46:10 - Recall:             89.06
2024-01-28 19:46:10 - Average Precision:  95.41



### Trained on combined dataset, cross-encoder-ms-marco-MiniLM-L-6 with batch size 64

In [12]:
# Score the fine-tuned MiniLM-L-6 checkpoint saved under train_reranker/models/.
base_model_name = 'train_reranker/models/cross-encoder-ms-marco-MiniLM-L-6-v2-2024-01-28_18-52-53'
model = CrossEncoder(base_model_name, num_labels=1, device='cuda')
evaluator = CEBinaryClassificationEvaluator(
    [list(pair) for pair in zip(test_queries, test_passages)],
    test_labels,
)
scores = evaluator(model)

2024-01-28 19:46:31 - CEBinaryClassificationEvaluator: Evaluating the model on  dataset:
2024-01-28 19:46:49 - Accuracy:           98.09	(Threshold: 2.0311)
2024-01-28 19:46:49 - F1:                 89.44	(Threshold: 1.3753)
2024-01-28 19:46:49 - Precision:          88.88
2024-01-28 19:46:49 - Recall:             90.02
2024-01-28 19:46:49 - Average Precision:  95.04

