In [1]:
import torch
from transformers import AutoTokenizer, RobertaForQuestionAnswering

## Load SQuAD Data

SQuAD dataset fields
- answers: the character offset in the context at which the answer starts (`answer_start`) and the answer text (`text`); `answer_start` is empty when the question has no answer.
- context: background information from which the model needs to extract the answer.
- question: the question a model should answer.

In [2]:
import pandas as pd
from datasets import load_dataset

In [3]:
# Fetch the "squad_v2" dataset. The progress output below reports a train split
# (130,319 examples) and a validation split (11,873 examples).
squad = load_dataset("squad_v2")

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

### SQuAD preprocessing

In [4]:
from transformers import AutoTokenizer

# Tokenizer for the same DistilBERT checkpoint that is loaded as the model
# in the "Model Finetune" section.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# True when the tokenizer pads on the right; the pair is then fed to the tokenizer
# as (question, context), otherwise as (context, question).
pad_on_right = tokenizer.padding_side == "right"


def preprocess_function(examples, max_length=384, doc_stride=128):
    """Tokenize a batch of SQuAD-style examples and label every feature with its answer span.

    Each question/context pair is tokenized with truncation applied to the
    context only. Overflowing context tokens are returned as additional
    features (``return_overflowing_tokens=True``) that overlap by
    ``doc_stride`` tokens, so the result can contain more rows than the
    input batch.

    Args:
        examples: Batch mapping with the keys ``"question"`` and ``"context"``
            (lists of strings) and ``"answers"`` (list of dicts holding the
            parallel lists ``"answer_start"`` and ``"text"``). The batch is
            left unmodified.
        max_length: Value passed to the tokenizer as ``max_length``; features
            are padded to this length (``padding="max_length"``).
        doc_stride: Value passed to the tokenizer as ``stride``.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one token index per feature). Both are set to the
        index of the CLS token when the example has no answer or when the
        answer is not fully contained in that feature.
    """
    # Strip leading whitespace from the questions on a copy, so the caller's
    # batch is not mutated.
    questions = [question.lstrip() for question in examples["question"]]
    contexts = examples["context"]

    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example can give several features; this maps each feature back to
    # the index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets, used to convert the character
    # span of the answer into token indices.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Value that sequence_ids() assigns to context tokens (loop-invariant).
    context_sequence_id = 1 if pad_on_right else 0

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Impossible answers are labeled with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells which tokens belong to the question and which to the context.
        sequence_ids = tokenized_examples.sequence_ids(i)

        answers = examples["answers"][sample_mapping[i]]

        # No answer given: point both labels at the CLS token.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the answer in the context (only the first listed
        # answer is used).
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First context token of this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != context_sequence_id:
            token_start_index += 1

        # Last context token of this feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_sequence_id:
            token_end_index -= 1

        # Answer not fully inside this feature: label it with the CLS index.
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Move token_start_index and token_end_index to the two ends of the answer.
        # Note: we could go after the last offset if the answer is the last word (edge case).
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [6]:
# Tokenize and label every split batch-wise. The original columns are removed so
# that only the features returned by preprocess_function remain.
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
# Inspect the processed splits. They hold more rows than the raw splits because
# overflowing contexts are returned as additional features.
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 131754
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 12134
    })
})

In [8]:
from transformers import default_data_collator

# Preprocessing already pads every feature with padding="max_length", so the default
# collator is used as-is (presumably no padding is needed at batch time — confirm).
data_collator = default_data_collator

2024-07-23 17:25:05.495543: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-23 17:25:05.495672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-23 17:25:05.639105: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# NOTE(review): repeats the earlier `tokenized_squad` display cell with identical
# output; this cell can be removed.
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 131754
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 12134
    })
})

## Model Fine-tuning

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the DistilBERT checkpoint with a question-answering head. As the warning
# below reports, `qa_outputs.weight` / `qa_outputs.bias` are newly initialized and
# must be trained before the model is usable for predictions.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Evaluate and checkpoint once per epoch, and reload the checkpoint with the lowest
# validation loss when training ends. Without this the last-epoch weights are kept,
# even when an earlier epoch reached a lower validation loss.
training_args = TrainingArguments(
    output_dir="distilbert-base-squad",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower loss is better
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",  # disable external experiment loggers
)

# Trainer wiring: model, processed splits, tokenizer and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [12]:
# Run fine-tuning. The table below lists training and validation loss per epoch;
# validation loss is lowest after epoch 2 and rises again in epoch 3.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.2858,1.287675
2,1.0288,1.257078
3,0.8718,1.385032


TrainOutput(global_step=12354, training_loss=1.198715941157564, metrics={'train_runtime': 8053.9146, 'train_samples_per_second': 49.077, 'train_steps_per_second': 1.534, 'total_flos': 3.873165421863629e+16, 'train_loss': 1.198715941157564, 'epoch': 3.0})

In [13]:
# Save the trainer's current model to the local directory "distilbert-squad-trained".
trainer.save_model("distilbert-squad-trained")

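## Evaluation

> **Note:** All evaluation cells below are commented out, so this notebook currently reports only the training/validation loss and no answer-level metric. Before re-enabling them: `eval_set_for_model`, `tokenized_squad_val` and `squad_val` are not defined in the cells above; `compute_answer` reads `example_id` and `offset_mapping` from each feature, which `preprocess_function` does not keep; and the data is `squad_v2` while the metric being loaded is `"squad"`.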

In [14]:
# import torch

# for batch in trainer.get_eval_dataloader():
#     break
# batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
# with torch.no_grad():
#     output = trainer.model(**batch)
# output.keys()

In [15]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(f"Using device: {device}")
# model = model.to(device)

In [16]:
# from tqdm import tqdm

# batch_size = 32
# all_start_logits = []
# all_end_logits = []

# for i in tqdm(range(0, len(eval_set_for_model), batch_size)):
#     batch = eval_set_for_model[i:i+batch_size]
#     batch = {k: torch.tensor(v).to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)
    
#     all_start_logits.extend(outputs.start_logits.cpu().numpy())
#     all_end_logits.extend(outputs.end_logits.cpu().numpy())

In [17]:
# import numpy as np
# from datasets import load_metric
# import collections
# from tqdm.auto import tqdm

# n_best = 15
# max_answer_length = 30

# def compute_answer(start_logits, end_logits, features, examples):
#     example_to_features = collections.defaultdict(list)
#     for idx, feature in enumerate(features):
#         example_to_features[feature["example_id"]].append(idx)

#     predicted_answers = []
#     for example in tqdm(examples):
#         example_id = example["id"]
#         context = example["context"]
#         answers = []

#         # Loop through all features associated with that example
#         for feature_index in example_to_features[example_id]:
#             start_logit = start_logits[feature_index]
#             end_logit = end_logits[feature_index]
#             offsets = features[feature_index]["offset_mapping"]

#             start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
#             end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
#             for start_index in start_indexes:
#                 for end_index in end_indexes:
#                     # Skip answers that are not fully in the context
#                     if offsets[start_index] is None or offsets[end_index] is None:
#                         continue
#                     # Skip answers with a length that is either < 0 or > max_answer_length
#                     if (
#                         end_index < start_index
#                         or end_index - start_index + 1 > max_answer_length
#                     ):
#                         continue

#                     answer = {
#                         "text": context[offsets[start_index][0] : offsets[end_index][1]],
#                         "logit_score": start_logit[start_index] + end_logit[end_index],
#                     }
#                     answers.append(answer)
        
#         # Select the answer with the best score
#         if len(answers) > 0:
#             try:
#                 best_answer = max(answers, key=lambda x: x["logit_score"])
#             except:
#                 print(answers)
#             predicted_answers.append(
#                 {"id": example_id, "prediction_text": best_answer["text"]}
#             )
#         else:
#             predicted_answers.append({"id": example_id, "prediction_text": ""})

#     theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
#     return predicted_answers, theoretical_answers

In [18]:
# predicted_answers, theoretical_answers = compute_answer(all_start_logits, all_end_logits, tokenized_squad_val, squad_val)

In [19]:
# predicted_answers[:5]

In [20]:
# theoretical_answers[:5]

In [21]:
# !pip install evaluate

In [22]:
# import evaluate
# metric = evaluate.load("squad")

In [23]:
# results = metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [24]:
# print("Results:")
# for key, value in results.items():
#     print(f"{key}: {value:.4f}")