### Homework 5: Question search engine

Remeber week01 where you used GloVe embeddings to find related questions? That was.. cute, but far from state of the art. It's time to really solve this task using context-aware embeddings.

__Warning:__ this task assumes you have seen `seminar.ipynb`!

In [1]:
%pip install --upgrade transformers datasets accelerate deepspeed
!pip install -U nvidia-ml-py3
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from tqdm import tqdm
import torch.cuda.amp as amp
import time
import os
import pandas as pd

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deepspeed
  Downloading deepspeed-0.12.2.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K    

### Load data and model

In [2]:
qqp = datasets.load_dataset('SetFit/qqp')
print('\n')
print("Sample[0]:", qqp['train'][0])
print("Sample[3]:", qqp['train'][3])

Downloading readme:   0%|          | 0.00/313 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/70.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Sample[0]: {'text1': 'How is the life of a math student? Could you describe your own experiences?', 'text2': 'Which level of prepration is enough for the exam jlpt5?', 'label': 0, 'idx': 0, 'label_text': 'not duplicate'}
Sample[3]: {'text1': 'What can one do after MBBS?', 'text2': 'What do i do after my MBBS ?', 'label': 1, 'idx': 3, 'label_text': 'duplicate'}


In [None]:
model_name = "gchhablani/bert-base-cased-finetuned-qqp"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

### Tokenize the data

In [None]:
MAX_LENGTH = 128
def preprocess_function(examples):
    result = tokenizer(
        examples['text1'], examples['text2'],
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    result['label'] = examples['label']
    return result

qqp_preprocessed = qqp.map(preprocess_function, batched=True)

In [None]:
print(repr(qqp_preprocessed['train'][0]['input_ids'])[:100], "...")

[101, 1731, 1110, 1103, 1297, 1104, 170, 12523, 2377, 136, 7426, 1128, 5594, 1240, 1319, 5758, 136,  ...


### Task 1: evaluation (1 points)

We randomly chose a model trained on QQP - but is it any good?

One way to measure this is with validation accuracy - which is what you will implement next.

Here's the interface to help you do that:

##### Prepare

In [None]:
val_set = qqp_preprocessed['validation']
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=1, shuffle=False, collate_fn=transformers.default_data_collator
)
print('val_set.shape', val_set.shape)

val_set.shape (40430, 8)


##### Simple train

In [None]:
def train_model(model, train_dataset=qqp_preprocessed['validation'], batch_size=50, opt_lrate=1e-3, num_epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,  # Larger batch size
        collate_fn=transformers.default_data_collator,
        shuffle=False,
        num_workers=2,  # Utilize multiple workers for data loading
        pin_memory=True
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=opt_lrate)
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(qqp_preprocessed['train']) // batch_size  # Adjust the batch size if needed
    )

    # Define the loss function
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, batch in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['labels'].to(device)
                predicted = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)

                loss = criterion(predicted.logits, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()

        running_loss += loss.item()

    # Print the average loss for each epoch
    print(f"Epoch {epoch + 1} - Loss: {running_loss / len(train_loader)}")

In [None]:
for batch in val_loader:
     break  # here be your training code
print("Sample batch:", batch)

with torch.no_grad():
  predicted = model(
      input_ids=batch['input_ids'],
      attention_mask=batch['attention_mask'],
      token_type_ids=batch['token_type_ids']
  )

print('\nPrediction (probs):', torch.softmax(predicted.logits, dim=1).data.numpy())

Sample batch: {'labels': tensor([0]), 'idx': tensor([0]), 'input_ids': tensor([[  101,  2009,  1132,  2170,   118,  4038,  1177,  2712,   136,   102,
          2009,  1132,  1117, 10224,  4724,  1177,  2712,   136,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,   

##### Measure accuracy

__Your task__ is to measure the validation accuracy of your model.
Doing so naively may take several hours. Please make sure you use the following optimizations:

- run the model on GPU with no_grad
- using batch size larger than 1
- use optimize data loader with num_workers > 1
- (optional) use [mixed precision](https://pytorch.org/docs/stable/notes/amp_examples.html)


In [None]:
def val_accuracy(model, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    all_preds = []
    all_labels = []

    start_time = time.time()

    with torch.no_grad():
        for i, batch in enumerate(tqdm(val_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            predicted = model(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

            all_preds.extend(torch.argmax(predicted.logits, axis=1).tolist())
            all_labels.extend(labels.tolist())

    end_time = time.time()

    # Calculate speed (samples per second)
    num_samples = len(all_preds)
    speed = num_samples / (end_time - start_time)

    # Accuracy
    correct_preds = sum([1 for pred, label in zip(all_preds, all_labels) if pred == label])
    validation_accuracy = correct_preds / len(all_labels)

    # Size of model
    model_size = torch.save(model.state_dict(), "model.pt")
    model_size = os.path.getsize("model.pt") >> 20
    os.remove("model.pt")

    return validation_accuracy, speed, model_size, model.num_parameters()

In [None]:
# Define the data loader with optimized settings
batch_size = 50
val_loader = DataLoader(
    qqp_preprocessed['validation'],
    batch_size=batch_size,
    collate_fn=transformers.default_data_collator,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

validation_accuracy, spd, mb, num_par = val_accuracy(model, val_loader)
accuracy = validation_accuracy
print(f"/nValidation Accuracy: {validation_accuracy}")

100%|██████████| 809/809 [01:16<00:00, 10.58it/s]

/nValidation Accuracy: 0.9083601286173634





In [None]:
assert 0.9 < accuracy < 0.91

Example of simple trainnig(1 epoch)

### Task 2: train the model (5 points)

For this task, you have two options:

__Option A:__ fine-tune your own model. You are free to choose any model __except for the original BERT.__ We recommend [DeBERTa-v3](https://huggingface.co/microsoft/deberta-v3-base). Better yet, choose the best model based on public benchmarks (e.g. [GLUE](https://gluebenchmark.com/)).

You can write the training code manually or use transformers.Trainer (see [this example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification)). Please make sure that your model's accuracy is at least __comparable__ with the above example for BERT.


__Option B:__ compare at least 3 pre-finetuned models (in addition to the above BERT model). For each model, report (1) its accuracy, (2) its speed, measured in samples per second in your hardware setup and (3) its size in megabytes. Please take care to compare models in equal setting, e.g. same CPU / GPU. Compile your results into a table and write a short (~half-page on top of a table) report, summarizing your findings.

### Option B

In [3]:
models_names = ["gchhablani/bert-base-cased-finetuned-qqp",
                "M-FAC/bert-tiny-finetuned-qqp",
                "Alireza1044/albert-base-v2-qqp",
                "Tomor0720/deberta-base-finetuned-qqp"]

models = {n: transformers.AutoModelForSequenceClassification.from_pretrained(n) for n in models_names}
tokenizers = {n: transformers.AutoTokenizer.from_pretrained(n) for n in models_names}

Downloading (…)lve/main/config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

In [None]:
result = {}
for m_n in models_names:
    tokenizer = tokenizers[m_n]
    if m_n != "gchhablani/bert-base-cased-finetuned-qqp":
        qqp_preprocessed = qqp.map(preprocess_function, batched=True)
    batch_size = 50
    validation_dataloader = DataLoader(
        qqp_preprocessed['validation'],
        batch_size=batch_size,
        collate_fn=transformers.default_data_collator,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )
    result[m_n] = val_accuracy(models[m_n], validation_dataloader)

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 809/809 [04:51<00:00,  2.78it/s]


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 809/809 [00:23<00:00, 34.07it/s]


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 809/809 [05:32<00:00,  2.44it/s]


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 809/809 [05:42<00:00,  2.36it/s]


In [None]:
qqp_preprocessed['validation']

Dataset({
    features: ['text1', 'text2', 'label', 'idx', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40430
})

In [None]:
result_df = pd.DataFrame.from_dict(result, orient='index',
                       columns=['Accuracy', 'Speed (samples/sec)', 'Size (MB)', 'Num of parameters'])
result_df

Unnamed: 0,Accuracy,Speed (samples/sec),Size (MB),Num of parameters
gchhablani/bert-base-cased-finetuned-qqp,0.908385,138.929473,413,108311810
M-FAC/bert-tiny-finetuned-qqp,0.844027,1701.861514,16,4386178
Alireza1044/albert-base-v2-qqp,0.904972,121.752696,44,11685122
Tomor0720/deberta-base-finetuned-qqp,0.912763,118.147058,531,139193858


In general, the pretrained models of the Bert, Albert, and Deberta architectures show relatively similar accuracies (\~90.5) and computation speeds (\~120 per second). (although it’s strange that the lighter (50 MB) Albert model works just as slowly.

A lighter version of Bert - Bert-tiny has more modest results (\~84), but the speed increases 14 times (~1700)



### Task 3: try the full pipeline (2 points)

Finally, it is time to use your model to find duplicate questions.
Please implement a function that takes a question and finds top-5 potential duplicates in the training set. For now, it is fine if your function is slow, as long as it yields correct results.

Showcase how your function works with at least 5 examples.

In [56]:
 ['question']*5

['question', 'question', 'question', 'question', 'question']

In [51]:
import numpy as np
def find_duplicates(question, model, tokenizer_question, top_k=5):
    # preprocess_function
    MAX_LENGTH = 128
    def preprocess_function_for_question(examples, question=question):
        result = tokenizer_question(
            examples['text1'], [question]*len(examples['text1']),
            padding='max_length', max_length=MAX_LENGTH, truncation=True
        )
        return result

    qqp_preprocessed_question = qqp.map(preprocess_function_for_question, batched=True)

    batch_size = 200
    question_dataloader = DataLoader(
        qqp_preprocessed_question['train'],
        batch_size=batch_size,
        collate_fn=transformers.default_data_collator,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()


    all_preds = []
    all_text = []

    with torch.no_grad():
          for i, batch in enumerate(tqdm(question_dataloader)):
              input_ids = batch['input_ids'].to(device)
              attention_mask = batch['attention_mask'].to(device)
              token_type_ids = batch['token_type_ids'].to(device)


              predicted = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)
              all_preds.extend(predicted.logits.tolist())
              all_text.extend(batch['input_ids'].tolist())

    all_text = np.array(all_text)
    all_preds = np.array(all_preds)[:,1]
    indx = np.argsort(-all_preds)[:top_k]

    texts_out = [tokenizer.decode(t, skip_special_tokens=True)[:-len(question)] for t in all_text[indx]]
    return texts_out

question = 'Hi, how are you'
find_duplicates(question, model=models['M-FAC/bert-tiny-finetuned-qqp'], tokenizer_question=tokenizers['M-FAC/bert-tiny-finetuned-qqp'], top_k=5)

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 1820/1820 [02:33<00:00, 11.87it/s]


['how do i control my emotions and anger? ',
 'how do i control my emotions and anger? ',
 'what are some quick ways to control your anger? ',
 'what are some quick ways to control your anger? ',
 'how do we control our emotions? ']

In [53]:
questions = ['What can one do after MBBS?',
             'My name is Dima',
             'I like dog and cats',
             'The London is the capital of Great Britan']

for q in questions:
    res = find_duplicates(q, model=models['M-FAC/bert-tiny-finetuned-qqp'], tokenizer_question=tokenizers['M-FAC/bert-tiny-finetuned-qqp'], top_k=5)
    print(q)
    print(res)
    print('##########')

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

100%|██████████| 1820/1820 [02:00<00:00, 15.15it/s]


What can one do after MBBS?
['what do you think about the idea of narendra modi about 500 & 1000 notes? ', 'what do you think about the idea of narendra modi about 500 & 1000 notes? ', 'what do you think about the idea of narendra modi about 500 & 1000 notes? ', 'what do you think about the idea of narendra modi about 500 & 1000 notes? ', 'what do you think about the idea of narendra modi about 500 & 1000 notes? ']
##########


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

KeyboardInterrupt: ignored

##### Trash

In [5]:
question = "HI, how are you"
tokenizer =  tokenizers['M-FAC/bert-tiny-finetuned-qqp']
model = models['M-FAC/bert-tiny-finetuned-qqp']
MAX_LENGTH=128
def preprocess_function_for_question(examples, question=question):
    result = tokenizer(
        examples['text1'], [question]*len(examples['text1']),
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    return result

qqp_preprocessed_question = qqp.map(preprocess_function_for_question, batched=True)

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

In [6]:
batch_size = 50
question_dataloader = DataLoader(
        qqp_preprocessed_question['train'],
        batch_size=batch_size,
        collate_fn=transformers.default_data_collator,
        shuffle=False,
        pin_memory=True
    )

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_text = []

with torch.no_grad():
      for i, batch in enumerate(tqdm(question_dataloader)):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          token_type_ids = batch['token_type_ids'].to(device)


          predicted = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
          all_preds.extend(predicted.logits.tolist())
          all_text.extend(batch['input_ids'].tolist())

all_text = np.array(all_text)
all_preds = np.array(all_preds)[:,1]
indx = np.argsort(-all_preds)[:5]

texts_out = [tokenizer.decode(t, skip_special_tokens=True)[:-len(question)] for t in all_text[indx]]
texts_out

100%|██████████| 7277/7277 [03:55<00:00, 30.86it/s]
  all_preds = np.array(all_preds)[:,1]


IndexError: ignored

__Bonus:__ for bonus points, try to find a way to run the function faster than just passing over all questions in a loop. For isntance, you can form a short-list of potential candidates using a cheaper method, and then run your tranformer on that short list. If you opted for this solution, please keep both the original implementation and the optimized one - and explain briefly what is the difference there.