In [1]:
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from datasets import load_dataset
from accelerate import Accelerator, notebook_launcher
import evaluate
import collections
from tqdm.auto import tqdm


2024-11-07 13:25:46.258317: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 13:25:48.172323: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731003948.801076  915851 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731003949.026590  915851 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 13:25:51.476636: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Reformat a JSON file and save the result to a new file
def modify_and_save_json(input_json):
    try:
        with open(input_json, 'r') as f:
            json_data = json.load(f)
    except IOError as e:
        print(f"Error opening {input_json}: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {input_json}: {e}")
        return None

    examples = []
    for item in json_data['data']:
        title = item['title'].strip()
        for paragraph in item['paragraphs']:
            context = paragraph['context'].strip()
            for qa in paragraph['qas']:
                example = {'id': qa['id'], 'title': title, 'context': context, 'question': qa['question'].strip(), 'answers': {'answer_start': [answer["answer_start"] for answer in qa['answers']], 'text': [answer["text"] for answer in qa['answers']]}}
                examples.append(example)
    
    output_data = {'data': examples}
    output_file = os.path.join(os.path.dirname(input_json), 'modified_' + os.path.basename(input_json))

    try:
        with open(output_file, 'w') as f:
            json.dump(output_data, f)
    except IOError as e:
        print(f"Error writing to {output_file}: {e}")
        return None

    return output_file

data_paths = {
    'train': '/home/tvaspar/DL_HW3/spoken_train-v1.1.json',
    'validation': '/home/tvaspar/DL_HW3/spoken_test-v1.1.json',
    'test_WER44': '/home/tvaspar/DL_HW3/spoken_test-v1.1_WER44.json',
    'test_WER54': '/home/tvaspar/DL_HW3/spoken_test-v1.1_WER54.json'
}

# Iterate over the paths of the original data files using a dictionary comprehension
# and pass each file path to the modify_and_save_json function.
# This will return a new dictionary containing the paths of the processed files.
modified_data_files = {key: modify_and_save_json(path) for key, path in data_paths.items() if modify_and_save_json(path)}

# Now, modified_data_files contains the paths of the processed files.
# Use these paths to load the dataset.
spoken_squad_dataset = load_dataset('json', data_files=modified_data_files, field='data')
print("Loading SpokenSQuAD data completed")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test_WER44 split: 0 examples [00:00, ? examples/s]

Generating test_WER54 split: 0 examples [00:00, ? examples/s]

Loading SpokenSQuAD data completed


In [3]:
# Load the model and tokenizer
checkpoint_model = "bert-base-uncased" 
print(f"Loading the BERT model and tokenizer from checkpoint '{checkpoint_model}'...") 

# Load the pre-trained Question Answering model from the specified checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_model)
print(f"Model '{checkpoint_model}' successfully loaded for Question Answering tasks.")

# Load the tokenizer associated with the specified checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_model)
print(f"Tokenizer for model '{checkpoint_model}' successfully loaded.")


Loading the BERT model and tokenizer from checkpoint 'bert-base-uncased'...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 'bert-base-uncased' successfully loaded for Question Answering tasks.
Tokenizer for model 'bert-base-uncased' successfully loaded.


In [4]:
# Define data preprocessing functions
max_length = 384  # Maximum length of the tokenized input sequences
stride = 64  # The stride size for splitting long documents into chunks

def preprocess_training_examples(examples):
    questions = [question.strip() for question in examples['question']]
    tokenized_inputs = tokenizer(
        questions, 
        examples['context'],
        max_length = max_length,
        truncation = 'only_second',
        stride = stride, 
        return_overflowing_tokens = True,
        return_offsets_mapping=True, 
        padding = 'max_length'
    )

    offset_mapping = tokenized_inputs.pop('offset_mapping')
    sample_mapping = tokenized_inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_mapping[i]
        answer = answers[sample_idx]
        start_char = answer['answer_start'][0]
        end_char = answer['answer_start'][0] + len(answer["text"][0])
        sequence_ids = tokenized_inputs.sequence_ids(i)

        # find start and end of the context
        idx = 0
        while sequence_ids[idx] != 1: 
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if answer not fully inside context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    
    tokenized_inputs['start_positions'] = start_positions
    tokenized_inputs['end_positions'] = end_positions
    return tokenized_inputs


def process_validation_examples(examples):
    questions = [question.strip() for question in examples['question']]
    tokenized_inputs = tokenizer(
        questions, 
        examples['context'],
        max_length = max_length,
        truncation = 'only_second',
        stride = stride, 
        return_overflowing_tokens = True,
        return_offsets_mapping=True, 
        padding = 'max_length'
    )

    sample_mapping = tokenized_inputs.pop('overflow_to_sample_mapping')
    example_ids = []

    for i in range(len(tokenized_inputs['input_ids'])):
        sample_idx = sample_mapping[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = tokenized_inputs.sequence_ids(i)
        offsets = tokenized_inputs['offset_mapping'][i]
        tokenized_inputs["offset_mapping"][i] = [
            offset if sequence_ids[k] == 1 else None for k, offset in enumerate(offsets)
        ]

    tokenized_inputs['example_id'] = example_ids
    return tokenized_inputs


print("Starting preprocessing of training data with tokenization and extraction of answer positions...")

train_dataset = spoken_squad_dataset['train'].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['train'].column_names
)

print("Tokenization and preprocessing of validation dataset (clean data, 22.73% WER) underway...")

validation_dataset = spoken_squad_dataset['validation'].map(
    process_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['validation'].column_names
)

print("Preprocessing test dataset with moderate noise level (44.22% WER) for evaluation...")

test_WER44_dataset = spoken_squad_dataset['test_WER44'].map(
    process_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['test_WER44'].column_names
)

print("Preprocessing test dataset with high noise level (54.82% WER) for robustness assessment...")

test_WER54_dataset = spoken_squad_dataset['test_WER54'].map(
    process_validation_examples,
    batched=True,
    remove_columns=spoken_squad_dataset['test_WER54'].column_names
)


Starting preprocessing of training data with tokenization and extraction of answer positions...


Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

Tokenization and preprocessing of validation dataset (clean data, 22.73% WER) underway...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing test dataset with moderate noise level (44.22% WER) for evaluation...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing test dataset with high noise level (54.82% WER) for robustness assessment...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

In [5]:
# Convert the datasets to a format compatible with PyTorch models.

train_dataset.set_format("torch")
print("Converted the training dataset to PyTorch tensor format.")

validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")
print("Prepared the validation dataset by removing 'example_id' and 'offset_mapping' columns and converting to PyTorch tensor format.")

test_WER44_set = test_WER44_dataset.remove_columns(["example_id", "offset_mapping"])
test_WER44_set.set_format("torch")
print("Prepared the Test WER44 dataset (simulating 44% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.")

test_WER54_set = test_WER54_dataset.remove_columns(["example_id", "offset_mapping"])
test_WER54_set.set_format("torch")
print("Prepared the Test WER54 dataset (simulating 54% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.")

print("Initializing the DataLoader for the training dataset with shuffling and a batch size of 8 to ensure varied mini-batch combinations during training.")
train_loader = DataLoader(
    train_dataset, 
    shuffle = True, 
    collate_fn=default_data_collator, 
    batch_size=8
)

print("Initializing the DataLoader for the validation dataset with a batch size of 8 for model performance evaluation on unseen clean data.")
eval_loader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)
print("Initializing the DataLoader for the Test WER44 dataset with a batch size of 8 to evaluate model robustness under moderate noise conditions.")
test_WER44_loader= DataLoader(
    test_WER44_set, collate_fn=default_data_collator, batch_size=8
)
print("Initializing the DataLoader for the Test WER54 dataset with a batch size of 8 to evaluate model robustness under high noise conditions.")
test_WER54_loader = DataLoader(
    test_WER54_set, collate_fn=default_data_collator, batch_size=8
)


Converted the training dataset to PyTorch tensor format.
Prepared the validation dataset by removing 'example_id' and 'offset_mapping' columns and converting to PyTorch tensor format.
Prepared the Test WER44 dataset (simulating 44% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.
Prepared the Test WER54 dataset (simulating 54% Word Error Rate) by removing unnecessary columns and converting to PyTorch tensor format.
Initializing the DataLoader for the training dataset with shuffling and a batch size of 8 to ensure varied mini-batch combinations during training.
Initializing the DataLoader for the validation dataset with a batch size of 8 for model performance evaluation on unseen clean data.
Initializing the DataLoader for the Test WER44 dataset with a batch size of 8 to evaluate model robustness under moderate noise conditions.
Initializing the DataLoader for the Test WER54 dataset with a batch size of 8 to evaluate model robustness under high n

In [6]:
# Define evaluation metrics and evaluation function
metric = evaluate.load("squad")  # Load the SQuAD evaluation metric

n_best = 20  # Number of top predictions to consider for each example
max_answer_length = 30  # Maximum length of an answer that can be generated

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)  # Map each example_id to its corresponding features
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)
    
    predicted_answers = []
    for example in tqdm(examples):  # Iterate through each example
        example_id = example["id"]
        context = example["context"]
        answers = []
        
        # Loop through all features associated with an example ID
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]  # Start logit scores for this feature
            end_logit = end_logits[feature_index]  # End logit scores for this feature
            offsets = features[feature_index]["offset_mapping"]  # Token offsets for this feature
            
            # Get indices of the n_best start and end logits
            start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully within the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with invalid lengths
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    
                    # Construct an answer candidate
                    answer = {
                        "text": context[offsets[start_index][0]: offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index]
                    }
                    answers.append(answer)
        
        # Select the answer with the highest logit score
        if answers:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
    
    # Compare predicted answers with the actual answers
    theory_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theory_answers)


In [7]:
def train_model(model=model, train_loader=train_loader, eval_loader=eval_loader, epochs=3):
    print(f"Starting model training for {epochs} epochs, each with {len(train_loader)} batches.")
    
    training_steps = epochs * len(train_loader)  # Total training steps calculation

    # Initialize the Accelerator for mixed precision training
    accelerator = Accelerator(mixed_precision='fp16')
    print("Accelerator initialized for mixed precision ('fp16') training.")

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)
    print("Optimizer setup with learning rate 2e-5.")

    # Prepare model, optimizer, and dataloaders for Accelerator
    model, optimizer, train_loader, eval_loader = accelerator.prepare(
        model, optimizer, train_loader, eval_loader
    )

    # Learning rate scheduler initialization
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=training_steps,
    )

    progress_bar = tqdm(range(training_steps), desc="Training Progress")  # Training progress bar setup

    for epoch in range(epochs):  # Loop over epochs
        print(f"\nEpoch {epoch+1}/{epochs} - Training:")
        model.train()  # Set model to training mode

        total_loss = 0  # Initialize total loss for averaging
        correct_predictions = 0  # Initialize correct predictions counter

        for step, batch in enumerate(train_loader):  # Iterate over training batches
            outputs = model(**batch)
            
            loss = outputs.loss
            total_loss += loss.item()  # Accumulate loss
            
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Model evaluation at the end of each epoch
        print("\nEvaluating model performance on the validation set...")
        metrics = evaluate_model(model, eval_loader, validation_dataset, spoken_squad_dataset['validation'], accelerator)
        
        # Calculate and print average training loss for the epoch
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Training Loss: {average_loss:.4f}")

        # Print validation metrics for accuracy assessment
        print(f"Validation Results - Epoch {epoch+1}: {metrics}")

        # Save the model and tokenizer at the end of each epoch
        output_dir = f"./model_save/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model_to_save = accelerator.unwrap_model(model)
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Model and tokenizer saved in '{output_dir}'")

    print("\nTraining completed successfully.")


In [8]:
def evaluate_model(model, dataloader, dataset, dataset_before_preprocessing, accelerator=None):
    if not accelerator:
        print("Initializing Accelerator for mixed precision (fp16) evaluation...")
        accelerator = Accelerator(mixed_precision='fp16')
        model, dataloader = accelerator.prepare(model, dataloader)
    
    print("Setting the model to evaluation mode for performance assessment...")
    model.eval()
    start_logits, end_logits = [], []

    print("Evaluating model performance on the dataset...")
    for batch in tqdm(dataloader, desc="Evaluation Progress"):
        with torch.no_grad():
            outputs = model(**batch)
        
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Concatenate and truncate logits to align with the dataset size
    start_logits, end_logits = np.concatenate(start_logits)[:len(dataset)], np.concatenate(end_logits)[:len(dataset)]

    print("Computing evaluation metrics based on model predictions...")
    metrics = compute_metrics(start_logits, end_logits, dataset, dataset_before_preprocessing)
    
    return metrics

print("Initiating model fine-tuning process...")
notebook_launcher(train_model, num_processes=1)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initiating model fine-tuning process...
Launching training on one GPU.
Starting model training for 3 epochs, each with 4664 batches.
Accelerator initialized for mixed precision ('fp16') training.
Optimizer setup with learning rate 2e-5.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training Progress:   0%|          | 0/13992 [00:00<?, ?it/s]


Epoch 1/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Epoch 1 - Average Training Loss: 1.7089
Validation Results - Epoch 1: {'exact_match': 61.48383479723416, 'f1': 72.1856383081878}
Model and tokenizer saved in './model_save/epoch_1'

Epoch 2/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Epoch 2 - Average Training Loss: 0.9422
Validation Results - Epoch 2: {'exact_match': 63.7077181835171, 'f1': 73.93310041695544}
Model and tokenizer saved in './model_save/epoch_2'

Epoch 3/3 - Training:

Evaluating model performance on the validation set...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Epoch 3 - Average Training Loss: 0.5974
Validation Results - Epoch 3: {'exact_match': 63.68903008783405, 'f1': 74.11362060254861}
Model and tokenizer saved in './model_save/epoch_3'

Training completed successfully.


In [9]:
# Results Evaluation

test_metrics = evaluate_model(model, eval_loader, validation_dataset, spoken_squad_dataset['validation'])
test_wer44_metrics = evaluate_model(model, test_WER44_loader, test_WER44_dataset, spoken_squad_dataset['test_WER44'])
test_wer54_metrics = evaluate_model(model, test_WER54_loader, test_WER54_dataset, spoken_squad_dataset['test_WER54'])

# Model Evaluation Summary
print("\nModel Evaluation Summary\n")

# Validation Set results
print("Validation Set:")
print(f"  - F1 Score: {test_metrics['f1']:.2f}%\n")

# Test WER44 Set results
print("Test WER44 Set:")
print(f"  - F1 Score: {test_wer44_metrics['f1']:.2f}%\n")

# Test WER54 Set results
print("Test WER54 Set:")
print(f"  - F1 Score: {test_wer54_metrics['f1']:.2f}%\n")


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/678 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/679 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Initializing Accelerator for mixed precision (fp16) evaluation...
Setting the model to evaluation mode for performance assessment...
Evaluating model performance on the dataset...


Evaluation Progress:   0%|          | 0/679 [00:00<?, ?it/s]

Computing evaluation metrics based on model predictions...


  0%|          | 0/5351 [00:00<?, ?it/s]


Model Evaluation Summary

Validation Set:
  - F1 Score: 74.11%

Test WER44 Set:
  - F1 Score: 55.77%

Test WER54 Set:
  - F1 Score: 42.29%



In [10]:
import requests
import json
import torch
import torch.nn as nn
import os
from tqdm import tqdm
import transformers
from transformers import BertModel, BertTokenizerFast, AdamW
# AutoTokenizer, AutoModelForQuestionAnswering, BertTokenizer, BertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR
import matplotlib.pyplot as plt

In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

ques_num = 0
pos_num = 0
impos_num = 0

cuda


In [12]:

def get_data(path): 
    #read each file and retrieve the contexts, qustions and answers
    with open(path, 'rb') as f:
        raw_data = json.load(f)
    contexts = []
    questions = []
    answers = []
    num_q = 0
    num_pos = 0
    num_imp = 0

    for group in raw_data['data']:
        for paragraph in group['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question= qa['question']
                num_q  = num_q  +1
                for answer in qa['answers']:
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)
    return num_q, num_pos, num_imp, contexts, questions, answers

In [13]:
num_q, num_pos, num_imp, train_contexts, train_questions, train_answers = get_data('spoken_train-v1.1.json')
ques_num  = num_q
pos_num = num_pos
impos_num  = num_imp

In [14]:
num_q, num_pos, num_imp, valid_contexts, valid_questions, valid_answers = get_data('spoken_test-v1.1.json')

In [15]:
def update_answer_end(answers, contexts):
    for answer, context in zip(answers, contexts):
        answer['text'] = answer['text'].lower()
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])

update_answer_end(train_answers, train_contexts)
update_answer_end(valid_answers, valid_contexts)

In [16]:
MAX_LENGTH = 512
PRETRAINED_MODEL= "bert-base-uncased"

doc_stride = 128
tokenizerFast = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)
is_padding_right = tokenizerFast.padding_side == "right"
train_contexts_truncated=[]
print(len(train_contexts))
for i in range(len(train_contexts)):
    if(len(train_contexts[i])>512):
        answer_start=train_answers[i]['answer_start']
        answer_end=train_answers[i]['answer_start']+len(train_answers[i]['text'])
        mid=(answer_start+answer_end)//2
        para_start=max(0,min(mid - MAX_LENGTH//2,len(train_contexts[i])-MAX_LENGTH))
        para_end = para_start + MAX_LENGTH 
        train_contexts_truncated.append(train_contexts[i][para_start:para_end])
        train_answers[i]['answer_start']=((512/2)-len(train_answers[i])//2)
    else:
        train_contexts_truncated.append(train_contexts[i])  
    
print(len(train_contexts_truncated))
train_encodings_fast = tokenizerFast(train_questions, train_contexts_truncated,  max_length = MAX_LENGTH,truncation=True,
        stride=doc_stride,
        padding=True)
valid_encodings_fast = tokenizerFast(valid_questions,valid_contexts,  max_length = MAX_LENGTH, truncation=True,stride=doc_stride,
        padding=True)
type(train_encodings_fast)

37111
37111


transformers.tokenization_utils_base.BatchEncoding

In [17]:
def get_answer_start_and_end_train(idx):
    start_pos = 0
    end_pos = 0
    answer_encoding = tokenizerFast(train_answers[idx]['text'],  max_length = MAX_LENGTH, truncation=True, padding=True)
    for a in range( len(train_encodings_fast['input_ids'][idx]) -  len(answer_encoding['input_ids']) ): #len(train_encodings_fast['input_ids'][0])):
        match = True
        for i in range(1,len(answer_encoding['input_ids']) - 1):
            if (answer_encoding['input_ids'][i] != train_encodings_fast['input_ids'][idx][a + i]):
                match = False
                break
            if match:
                start_pos = a+1
                end_pos = a+i+1
                break
    return(start_pos, end_pos)


In [18]:

start_positions = []
end_positions = []
ctr = 0
for h in range(len(train_encodings_fast['input_ids'])):
    s, e = get_answer_start_and_end_train(h)
    start_positions.append(s)
    end_positions.append(e)
    if s==0:
        ctr = ctr + 1
    
train_encodings_fast.update({'start_positions': start_positions, 'end_positions': end_positions})
print(ctr)


353


In [19]:

def get_answer_start_and_end_valid(idx):
    start_pos = 0
    end_pos = 0
    answer_encoding = tokenizerFast(valid_answers[idx]['text'],  max_length = MAX_LENGTH, truncation=True, padding=True)
    for a in range( len(valid_encodings_fast['input_ids'][idx])  -  len(answer_encoding['input_ids'])   ): #len(train_encodings_fast['input_ids'][0])):
        match = True
        for i in range(1,len(answer_encoding['input_ids']) - 1):
            if (answer_encoding['input_ids'][i] != valid_encodings_fast['input_ids'][idx][a + i]):
                match = False
                break
            if match:
                start_pos = a+1
                end_pos = a+i+1
                break
    return(start_pos, end_pos)


In [20]:
start_positions = []
end_positions = []
ctr = 0
for h in range(len(valid_encodings_fast['input_ids']) ):
    #print(h)
    s, e = get_answer_start_and_end_valid(h)
    start_positions.append(s)
    end_positions.append(e)
    if s==0:
        ctr = ctr + 1

valid_encodings_fast.update({'start_positions': start_positions, 'end_positions': end_positions})
print(ctr)

146


In [21]:

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, i):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][i]),
            'token_type_ids': torch.tensor(self.encodings['token_type_ids'][i]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][i]),
            'start_positions': torch.tensor(self.encodings['start_positions'][i]),
            'end_positions': torch.tensor(self.encodings['end_positions'][i])
        }
    def __len__(self):
        return len(self.encodings['input_ids'])


train_dataset = CustomDataset(train_encodings_fast)
valid_dataset = CustomDataset(valid_encodings_fast)

train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=1)

bert_model = BertModel.from_pretrained(PRETRAINED_MODEL)  #PRETRAINED_MODEL = "bert-base-uncased"


In [22]:

class QuestionAnsweringModel(nn.Module):
    def __init__(self):
        super(QuestionAnsweringModel, self).__init__()
        self.bert = bert_model
        self.drop_out = nn.Dropout(0.1)
        self.l1 = nn.Linear(768 * 2, 768 * 2)
        self.l2 = nn.Linear(768 * 2, 2)
        self.linear_relu_stack = nn.Sequential(
            self.drop_out,
            self.l1,
            nn.LeakyReLU(),
            self.l2 
        )
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        hidden_state_layers = bert_output[2]
        out = torch.cat((hidden_state_layers[-1], hidden_state_layers[-3]), dim=-1)  # taking Start logits from last BERT layer, End Logits from third to last layer
        logits = self.linear_relu_stack(out)
        
        start_logits, end_logits = logits.split(1, dim=-1)
        
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits


In [23]:
model = QuestionAnsweringModel()

In [24]:

# my function to manually calculate Cross Entropy Loss
def custom_loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss_value= loss_fct(start_logits, start_positions)
    end_loss_value = loss_fct(end_logits, end_positions)
    total_loss = (start_loss_value + end_loss_value)/2
    return total_loss

def custom_focal_loss_fn(start_logits, end_logits, start_positions, end_positions, gamma_value):
    
    #calculate Probabilities by applying Softmax to the Start and End Logits. Then get 1 - probabilities
    softmax_func = nn.Softmax(dim=1)
    start_probs = softmax_func(start_logits)
    inv_start_probs = 1 - start_probs
    end_probs = softmax_func(end_logits)
    inv_end_probs = 1 - end_probs
    
    #get log of probabilities. Note: NLLLoss required log probabilities. This is the Natural Log (Log base e)
    log_softmax_func = nn.LogSoftmax(dim=1)
    start_log_probs = log_softmax_func(start_logits)
    end_log_probs = log_softmax_func(end_logits)
    
    nll = nn.NLLLoss()
    
    focal_start_loss = nll(torch.pow(inv_start_probs, gamma_value)* start_log_probs, start_positions)
    focal_end_loss = nll(torch.pow(inv_end_probs, gamma_value)*end_log_probs, end_positions)
    
    #return mean of the Loss for the start and end logits
    return ((focal_start_loss + focal_end_loss)/2)
optim = AdamW(model.parameters(), lr=2e-5, weight_decay=2e-2)
EPOCHS = 3
total_steps=len(train_dataset)*EPOCHS
scheduler=transformers.get_linear_schedule_with_warmup(optim,num_warmup_steps=0,num_training_steps=total_steps )
total_acc = []
total_loss = []



In [25]:
def run_epoch(model, dataloader, epoch):
    model = model.train()
    losses = []
    acc = []
    ctr = 0
    batch_counter = 0
    for batch in tqdm(dataloader, desc = 'Running Epoch '):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        start_logits, end_logits = model(input_ids=input_ids, 
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)
        #loss = custom_loss_fn(start_logits, end_logits, start_positions, end_positions)  # <---BASELINE.  Cross Entropy Loss is returned by Default
        loss = custom_focal_loss_fn(start_logits, end_logits, start_positions, end_positions,1) #using gamma_value = 1
        losses.append(loss.item())
        loss.backward()
        optim.step()
        
        start_predictions = torch.argmax(start_logits, dim=1)
        end_predictions = torch.argmax(end_logits, dim=1)
            
        acc.append(((start_predictions == start_positions).sum()/len(start_predictions)).item())
        acc.append(((end_predictions == end_positions).sum()/len(end_predictions)).item())
        #ctr = ctr +1
        #if ctr==50:
        #    break
        batch_counter = batch_counter + 1
        if batch_counter==250 and epoch==1:
            total_acc.append(sum(acc))
            loss_avg = sum(losses)/len(losses)
            total_loss.append(loss_avg)
            batch_counter = 0
    scheduler.step()
    average_accuracy = sum(acc)/len(acc)
    average_loss = sum(losses)/len(losses)
    return(average_accuracy, average_loss)

In [28]:
from sklearn.metrics import f1_score

def eval_model(model, dataloader):
    model = model.eval()
    acc = []
    f1_scores = []
    answer_list = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Running Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)
            
            start_logits, end_logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            start_predictions = torch.argmax(start_logits, dim=1).item()  # Convert to scalar
            end_predictions = torch.argmax(end_logits, dim=1).item()      # Convert to scalar
            
            # Ensure valid slicing by adjusting end_predictions if it's before start_predictions
            if end_predictions < start_predictions:
                end_predictions = start_predictions

            # Collecting the answers
            answer = tokenizerFast.convert_tokens_to_string(
                tokenizerFast.convert_ids_to_tokens(input_ids[0][start_predictions:end_predictions + 1])
            )
            tanswer = tokenizerFast.convert_tokens_to_string(
                tokenizerFast.convert_ids_to_tokens(input_ids[0][start_true[0].item():end_true[0].item() + 1])
            )
            answer_list.append([answer, tanswer])
            
            # Calculate accuracy
            acc.append((start_predictions == start_true[0].item()) and (end_predictions == end_true[0].item()))
            
            # Calculate F1 score for this batch
            pred_span = set(range(start_predictions, end_predictions + 1))
            true_span = set(range(start_true[0].item(), end_true[0].item() + 1))
            overlap = len(pred_span & true_span)
            
            if len(pred_span) > 0 and len(true_span) > 0:
                precision = overlap / len(pred_span)
                recall = overlap / len(true_span)
                if precision + recall > 0:
                    f1 = 2 * (precision * recall) / (precision + recall)
                else:
                    f1 = 0
            else:
                f1 = 0

            f1_scores.append(f1)
    
    average_accuracy = sum(acc) / len(acc)
    average_f1 = sum(f1_scores) / len(f1_scores)
    print(f"Evaluation Accuracy: {average_accuracy}, F1 Score: {average_f1}")
    
    return answer_list, average_f1

# Main training loop with F1 score printing
wer_list = []
f1_list = []

for epoch in range(EPOCHS):
    train_acc, train_loss = run_epoch(model, train_data_loader, epoch + 1)
    print(f"Train Accuracy: {train_acc}      Train Loss: {train_loss}")
    
    answer_list, avg_f1 = eval_model(model, valid_data_loader)
    
    pred_answers = []
    true_answers = []
    for i in range(len(answer_list)):
        if len(answer_list[i][0]) == 0:
            answer_list[i][0] = "$"
        if len(answer_list[i][1]) == 0:
            answer_list[i][1] = "$"
        pred_answers.append(answer_list[i][0])
        true_answers.append(answer_list[i][1])
    
    wer_score = wer.compute(predictions=pred_answers, references=true_answers)
    print(f"Epoch {epoch}: WER Score: {wer_score}, F1 Score: {avg_f1}")
    
    wer_list.append(wer_score)
    f1_list.append(avg_f1)

# Save WER and F1 scores
with open("base_model_metrics.txt", 'w') as f:
    for wer, f1 in zip(wer_list, f1_list):
        f.write(f"WER: {wer}, F1: {f1}\n")

print("WER Scores:", wer_list)
print("F1 Scores:", f1_list)


Running Epoch : 100%|██████████| 4639/4639 [04:49<00:00, 16.00it/s]


Train Accuracy: 0.6905506112807319      Train Loss: 0.7980141290131251


Running Evaluation: 100%|██████████| 15875/15875 [02:33<00:00, 103.54it/s]


Evaluation Accuracy: 0.4409448818897638, F1 Score: 0.49185593482532547
Epoch 0: WER Score: 2.480515999329871, F1 Score: 0.49185593482532547


Running Epoch : 100%|██████████| 4639/4639 [04:51<00:00, 15.90it/s]


Train Accuracy: 0.8036580390009188      Train Loss: 0.4405247646554185


Running Evaluation: 100%|██████████| 15875/15875 [02:32<00:00, 103.94it/s]


Evaluation Accuracy: 0.43968503937007875, F1 Score: 0.48695535145640984
Epoch 1: WER Score: 2.5994303903501423, F1 Score: 0.48695535145640984


Running Epoch : 100%|██████████| 4639/4639 [04:51<00:00, 15.93it/s]


Train Accuracy: 0.8732620176762234      Train Loss: 0.25790816928034627


Running Evaluation: 100%|██████████| 15875/15875 [02:31<00:00, 104.49it/s]


Evaluation Accuracy: 0.4324409448818898, F1 Score: 0.48302312366909295
Epoch 2: WER Score: 2.160931479309767, F1 Score: 0.48302312366909295
WER Scores: [2.480515999329871, 2.5994303903501423, 2.160931479309767]
F1 Scores: [0.49185593482532547, 0.48695535145640984, 0.48302312366909295]
