In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from accelerate import Accelerator, notebook_launcher
from huggingface_hub import Repository, get_full_repo_name, notebook_login
import evaluate
from tqdm.auto import tqdm
import numpy as np
import collections
import json

In [2]:
def reformat_json_file_path(json_file_path):

    # Load the original JSON data
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    # Reformat the data
    reformatted_data = {
        'data': [
            {
                'id': qa['id'],
                'title': elem['title'].strip(),
                'context': paragraph['context'].strip(),
                'question': qa['question'].strip(),
                'answers': {
                    'answer_start': [answer['answer_start'] for answer in qa['answers']],
                    'text': [answer['text'] for answer in qa['answers']]
                }
            }
            for elem in json_data['data']
            for paragraph in elem['paragraphs']
            for qa in paragraph['qas']
        ]
    }

    # Generate the output file path
    output_json_file_path = 'out_' + json_file_path

    # Save the reformatted data to a new file
    with open(output_json_file_path, 'w') as file:
        json.dump(reformatted_data, file)

    return output_json_file_path

def load_and_reformat_spoken_squad_dataset():
    """Loads and reformats the SpokenSQuAD dataset, preparing it for dataset loading."""
    dataset_files = {
        'train': 'spoken_train-v1.1.json',
        'validation': 'spoken_test-v1.1.json',         # NO NOISE: 22.73% WER
        'test_WER44': 'spoken_test-v1.1_WER44.json',   # NOISE V1: 44.22% WER
        'test_WER54': 'spoken_test-v1.1_WER54.json',   # NOISE V2: 54.82% WER
    }

    # Reformat each file and update paths
    reformatted_files = {key: reformat_json_file_path(path) for key, path in dataset_files.items()}

    # Load dataset with reformatted files (Assuming load_dataset is defined elsewhere)
    spoken_squad_dataset = load_dataset('json', data_files=reformatted_files, field='data')
    
    return spoken_squad_dataset

if __name__ == '__main__':
    print("Loading and reformating SpokenSQuAD data...")
    spoken_squad_dataset = load_and_reformat_spoken_squad_dataset()


Loading and reformating SpokenSQuAD data...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test_WER44 split: 0 examples [00:00, ? examples/s]

Generating test_WER54 split: 0 examples [00:00, ? examples/s]

In [3]:
model_checkpoint = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"

print("Setting up the Model and Tokenizer")
print(f"Using Model Checkpoint: {model_checkpoint}")

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print("\nOverview of Model Architecture: BERT (large, uncased) with Whole Word Masking, fine-tuned on SQuAD, accompanied by a Linear layer for producing two logits:")
print(model)

Setting up the Model and Tokenizer
Using Model Checkpoint: google-bert/bert-large-uncased-whole-word-masking-finetuned-squad


Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Overview of Model Architecture: BERT (large, uncased) with Whole Word Masking, fine-tuned on SQuAD, accompanied by a Linear layer for producing two logits:
BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (ou

In [4]:
max_length = 512
stride = 64

def tokenize_and_label_examples(examples):

    # Tokenize questions and contexts with specified constraints
    tokenized_examples = tokenizer(
        [question.strip() for question in examples['question']],  # Cleaned questions
        examples['context'],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Extract offset mappings and map overflow tokens to original samples
    offset_mappings = tokenized_examples.pop('offset_mapping')
    overflow_to_original_sample = tokenized_examples.pop('overflow_to_sample_mapping')
    
    # Initialize containers for start and end positions
    start_positions, end_positions = [], []

    # Iterate through each tokenized example
    for i, offsets in enumerate(offset_mappings):
        # Map back to original sample
        original_sample_index = overflow_to_original_sample[i]
        answer = examples['answers'][original_sample_index]
        start_char, end_char = answer['answer_start'][0], answer['answer_start'][0] + len(answer['text'][0])

        # Identify sequence IDs to distinguish question from context
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Determine the context's start and end within the token sequence
        context_start, context_end = next(idx for idx, sid in enumerate(sequence_ids) if sid == 1), \
                                     next(idx for idx, sid in enumerate(reversed(sequence_ids), 1) if sid == 1) - 1

        # Default to (0, 0) if answer is outside the windowed context
        start_pos, end_pos = 0, 0
        if not (offsets[context_start][0] > start_char or offsets[context_end][1] < end_char):
            # Find start position within context
            start_pos = next(idx for idx, (start, _) in enumerate(offsets) if start <= start_char <= end_char)
            # Find end position within context
            end_pos = next(idx for idx, (_, end) in enumerate(offsets) if start_char <= end_char <= end)

        start_positions.append(start_pos)
        end_positions.append(end_pos)
    
    # Update the tokenized examples with the computed start/end positions
    tokenized_examples.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })

    return tokenized_examples

print("Tokenizing and labeling dataset...")

# Apply the preprocessing function to the training set
preprocessed_train_dataset = spoken_squad_dataset['train'].map(
    tokenize_and_label_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['train'].column_names
)
print(preprocessed_train_dataset)

Tokenizing and labeling dataset...


Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 37130
})


In [5]:
def preprocess_validation_examples(examples):
    # Strip leading and trailing spaces from questions
    questions = [q.strip() for q in examples['question']]
    # Tokenize questions and contexts with specific parameters
    tokenized_inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    # Remove keys not used for validation
    overflow_map = tokenized_inputs.pop('overflow_to_sample_mapping')
    adjusted_example_ids = []

    for i, _ in enumerate(tokenized_inputs['input_ids']):
        original_idx = overflow_map[i]
        adjusted_example_ids.append(examples['id'][original_idx])

        # Adjust offset mapping for validation purpose
        sequence_ids = tokenized_inputs.sequence_ids(i)
        offsets = tokenized_inputs['offset_mapping'][i]
        tokenized_inputs['offset_mapping'][i] = [
            offset if sequence_ids[idx] == 1 else None for idx, offset in enumerate(offsets)
        ]

    tokenized_inputs['example_id'] = adjusted_example_ids
    return tokenized_inputs


# Preprocess validation and test datasets for evaluation
print("Processing validation and test datasets for model evaluation...")

validation_data = spoken_squad_dataset['validation'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['validation'].column_names
)

test_set_WER44_data = spoken_squad_dataset['test_WER44'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['test_WER44'].column_names
)

test_set_WER54_data = spoken_squad_dataset['test_WER54'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['test_WER54'].column_names
)

print(validation_data)



Processing validation and test datasets for model evaluation...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 5376
})


In [6]:
metric = evaluate.load("squad")

n_best = 18
max_answer_length = 25
def compute_answer_metrics(start_logits, end_logits, features, examples):
    # Map each example to its corresponding features
    example_to_feature_map = collections.defaultdict(list)
    for index, feature in enumerate(features):
        example_to_feature_map[feature["example_id"]].append(index)

    predictions = []
    # Iterate over each example to find the best answer
    for example in tqdm(examples, desc="Evaluating"):
        example_id = example["id"]
        context = example["context"]
        answer_candidates = []

        # Consider all features related to the current example
        for feature_idx in example_to_feature_map[example_id]:
            start_scores = start_logits[feature_idx]
            end_scores = end_logits[feature_idx]
            offset_mapping = features[feature_idx]["offset_mapping"]

            # Find the top N start and end positions
            start_positions = np.argsort(start_scores)[-n_best:][::-1]
            end_positions = np.argsort(end_scores)[-n_best:][::-1]

            # Evaluate all possible start-end pairs
            for start_pos in start_positions:
                for end_pos in end_positions:
                    if offset_mapping[start_pos] is None or offset_mapping[end_pos] is None:
                        continue
                    if end_pos < start_pos or (end_pos - start_pos + 1) > max_answer_length:
                        continue

                    answer_text = context[offset_mapping[start_pos][0]:offset_mapping[end_pos][1]]
                    logit_score = start_scores[start_pos] + end_scores[end_pos]
                    answer_candidates.append({"text": answer_text, "logit_score": logit_score})

        # Choose the answer with the highest score
        if answer_candidates:
            best_answer = max(answer_candidates, key=lambda x: x["logit_score"])
            prediction_text = best_answer["text"]
        else:
            prediction_text = ""

        predictions.append({"id": example_id, "prediction_text": prediction_text})

    # Prepare references for metric computation
    references = [{"id": example["id"], "answers": example["answers"]} for example in examples]

    # Compute and return the evaluation metric
    return metric.compute(predictions=predictions, references=references)

In [7]:
preprocessed_train_dataset.set_format("torch")
validation_set = validation_data.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")
test_WER44_set = test_set_WER44_data.remove_columns(["example_id", "offset_mapping"])
test_WER44_set.set_format("torch")
test_WER54_set = test_set_WER54_data.remove_columns(["example_id", "offset_mapping"])
test_WER54_set.set_format("torch")

print("Creating train dataloader...")
train_dataloader = DataLoader(
    preprocessed_train_dataset, 
    shuffle = True, 
    collate_fn=default_data_collator, 
    batch_size=8
)

print("Creating validation dataloader...")
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)
print("Creating test V1 dataloader...")
test_WER44_dataloader = DataLoader(
    test_WER44_set, collate_fn=default_data_collator, batch_size=8
)
print("Creating test V2 dataloader...")
test_WER54_dataloader = DataLoader(
    test_WER54_set, collate_fn=default_data_collator, batch_size=8
)

Creating train dataloader...
Creating validation dataloader...
Creating test V1 dataloader...
Creating test V2 dataloader...


In [8]:
def evaluate_model(model, dataloader, dataset, dataset_before_preprocessing, accelerator=None):
    if not accelerator: 
        accelerator = Accelerator(mixed_precision='fp16')
        model, dataloader = accelerator.prepare(
            model, dataloader
        )
    
    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(dataloader):
        with torch.no_grad(): 
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(dataset)]
    end_logits = end_logits[: len(dataset)]

    metrics = compute_answer_metrics(
        start_logits, end_logits, dataset, dataset_before_preprocessing
    )
    return metrics

In [9]:
output_dir = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
def train_model(model=model, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, epochs = 2):
    training_steps = epochs * len(train_dataloader)

    accelerator = Accelerator(mixed_precision='fp16')
    optimizer = AdamW(model.parameters(), lr = 3e-5)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=training_steps,
    )

    progress_bar = tqdm(range(training_steps))

    for epoch in range(epochs):
        # train for 1 epoch
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # evaluate after each epoch 
        accelerator.print("Evaluation...")
        metrics = evaluate_model(model, eval_dataloader, validation_data, spoken_squad_dataset['validation'], accelerator)
        print(f"epoch {epoch}:", metrics)

        # save and upload 
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
notebook_launcher(train_model, num_processes=1)

Launching training on one GPU.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  0%|          | 0/9284 [00:00<?, ?it/s]

Evaluation...


  0%|          | 0/672 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5351 [00:00<?, ?it/s]

epoch 0: {'exact_match': 47.35563446084844, 'f1': 63.083867863358144}
Evaluation...


  0%|          | 0/672 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5351 [00:00<?, ?it/s]

epoch 1: {'exact_match': 29.26555783965614, 'f1': 52.83413955304319}


In [10]:
### EVALUATE FINETUNED MODEL 
print("Performing evaluation on the Test Set")
test_eval_results = evaluate_model(model, eval_dataloader, validation_data, spoken_squad_dataset['validation'])
print("Performing evaluation on the Test V1 Set with V1 noise")
test_v1_eval_results = evaluate_model(model, test_WER44_dataloader, test_set_WER44_data, spoken_squad_dataset['test_WER44'])
print("Performing evaluation on the Test V2 Set with V2 noise")
test_v2_eval_results = evaluate_model(model, test_WER54_dataloader, test_set_WER54_data, spoken_squad_dataset['test_WER54'])

print(f"Test Set (No Noise - 22.73% WER) - Exact Match: {test_eval_results['exact_match']}, F1 Score: {test_eval_results['f1']}")
print(f"Test V1 Set (V1 Noise - 44.22% WER) - Exact Match: {test_v1_eval_results['exact_match']}, F1 Score: {test_v1_eval_results['f1']}")
print(f"Test V2 Set (V2 Noise - 54.82% WER) - Exact Match: {test_v2_eval_results['exact_match']}, F1 Score: {test_v2_eval_results['f1']}")

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Performing evaluation on the Test Set


  0%|          | 0/672 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Performing evaluation on the Test V1 Set with V1 noise


  0%|          | 0/672 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Performing evaluation on the Test V2 Set with V2 noise


  0%|          | 0/672 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5351 [00:00<?, ?it/s]

Test Set (No Noise - 22.73% WER) - Exact Match: 29.26555783965614, F1 Score: 52.83413955304319
Test V1 Set (V1 Noise - 44.22% WER) - Exact Match: 20.08970285927864, F1 Score: 39.25835101485107
Test V2 Set (V2 Noise - 54.82% WER) - Exact Match: 14.44589796299757, F1 Score: 29.184976143129315
