In [1]:
# Tokenizer imports
from transformers import DistilBertTokenizerFast

# Data handling imports
from datasets import *
import numpy as np

# General imports
import torch
import ast

year = 2022

In [2]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-{year}-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=42)

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-e3048f1bd60b5c4e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e96c5bd318352c3f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3f782815aab69336.arrow


In [3]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                "text": example["text"], "answer_start": example["answer_start"]}})
data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(
    lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0e58ef57ff25bb58.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-944b27bfd97247d4.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-d92cbfe0185ed3c0.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e00fcfca42a8136e.arrow


Dataset({
    features: ['index', 'question', 'context', 'answers', 'id'],
    num_rows: 248
})

In [4]:
# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-cased-distilled-squad')

In [5]:
def tokenize_sample_data(data):
    # Tokenize the data
    tokenized_feature = tokenizer(
        data["question"],
        data["context"],
        max_length=384,
        return_overflowing_tokens=True,
        stride=128,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    # When it overflows, multiple rows will be returned for a single example.
    # The following then gets the array of corresponding the original sample index.
    sample_mapping = tokenized_feature.pop("overflow_to_sample_mapping")
    # Get the array of [start_char, end_char + 1] in each token.
    # The shape is [returned_row_size, max_length]
    offset_mapping = tokenized_feature.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        answers = data["answers"][sample_index]
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0]) - 1
        # The format of sequence_ids is [None, 0, ..., 0, None, None, 1, ..., 1, None, None, ...]
        # in which question's token is 0 and contex's token is 1
        sequence_ids = tokenized_feature.sequence_ids(i)
        # find the start and end index of context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # Set start positions and end positions in inputs_ids
        # Note: The second element in offset is end_char + 1
        # if offset[context_start][0] > end_char or offset[context_end][1] <= start_char:
        if not (offset[context_start][0] <= start_char and end_char < offset[context_end][1]):
            # The case that answer is not inside the context
            # Note : Some tokenizer (such as, tokenizer in rinna model) doesn't place CLS
            # for the first token in sequence, and I then set -1 as positions.
            # (Later I'll process rows with start_positions=-1.)
            start_positions.append(-1)
            end_positions.append(-1)
        else:
            # The case that answer is found in the context

            # Set start position
            idx = context_start
            while offset[idx][0] < start_char:
                idx += 1
            if offset[idx][0] == start_char:
                start_positions.append(idx)
            else:
                start_positions.append(idx - 1)

            # Set end position
            idx = context_end
            while offset[idx][1] > end_char + 1:
                idx -= 1
            if offset[idx][1] == end_char + 1:
                end_positions.append(idx)
            else:
                end_positions.append(idx + 1)

    # Build result
    tokenized_feature["start_positions"] = start_positions
    tokenized_feature["end_positions"] = end_positions
    return tokenized_feature


# Run conversion
tokenized_ds = data.map(
    tokenize_sample_data,
    remove_columns=["id", "context", "question", "answers"],
    batched=True,
    batch_size=128)

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-1e6f806c24a84f21.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-31a7f2451f59b3da.arrow


In [6]:
# Remove rows with no answer
tokenized_ds = tokenized_ds.filter(lambda x: x["start_positions"] != -1)

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-026b8683597cb820.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-57438fb66da9df89.arrow


In [7]:
# QA model import
from transformers import AutoModelForQuestionAnswering

# Set the device
device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the model
model = AutoModelForQuestionAnswering.from_pretrained(
    "distilbert-base-cased-distilled-squad").to(device)

In [8]:
# Optimizer import
from transformers import DefaultDataCollator

# Set the data collator
data_collator = DefaultDataCollator()

In [9]:
# Trainer import
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir=f"distilbert-nlb-qa-{year}",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    learning_rate=2e-6,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=True,
    push_to_hub=False
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
)

In [10]:
# Fine-tune the model
trainer.train()



  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8037017583847046, 'eval_runtime': 0.3056, 'eval_samples_per_second': 350.097, 'eval_steps_per_second': 22.904, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7687519192695618, 'eval_runtime': 0.282, 'eval_samples_per_second': 379.433, 'eval_steps_per_second': 24.823, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.742975652217865, 'eval_runtime': 0.2773, 'eval_samples_per_second': 385.899, 'eval_steps_per_second': 25.246, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7246463298797607, 'eval_runtime': 0.273, 'eval_samples_per_second': 391.944, 'eval_steps_per_second': 25.641, 'epoch': 4.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7081414461135864, 'eval_runtime': 0.2736, 'eval_samples_per_second': 391.079, 'eval_steps_per_second': 25.585, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6970767378807068, 'eval_runtime': 0.2715, 'eval_samples_per_second': 394.117, 'eval_steps_per_second': 25.783, 'epoch': 6.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.689233660697937, 'eval_runtime': 0.2739, 'eval_samples_per_second': 390.673, 'eval_steps_per_second': 25.558, 'epoch': 7.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6834876537322998, 'eval_runtime': 0.273, 'eval_samples_per_second': 391.941, 'eval_steps_per_second': 25.641, 'epoch': 8.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6807333827018738, 'eval_runtime': 0.274, 'eval_samples_per_second': 390.516, 'eval_steps_per_second': 25.548, 'epoch': 9.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6793057322502136, 'eval_runtime': 0.274, 'eval_samples_per_second': 390.514, 'eval_steps_per_second': 25.548, 'epoch': 10.0}
{'train_runtime': 26.0803, 'train_samples_per_second': 95.091, 'train_steps_per_second': 6.135, 'train_loss': 0.6065438747406006, 'epoch': 10.0}


TrainOutput(global_step=160, training_loss=0.6065438747406006, metrics={'train_runtime': 26.0803, 'train_samples_per_second': 95.091, 'train_steps_per_second': 6.135, 'train_loss': 0.6065438747406006, 'epoch': 10.0})

In [11]:
# Define the prediction function
def inference_answer(model, question, context):
    question = question
    context = context
    test_feature = tokenizer(
        question,
        context,
        max_length=318
    )
    with torch.no_grad():
        outputs = model(torch.tensor([test_feature["input_ids"]]).to(device))
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()
    answer_ids = test_feature["input_ids"][np.argmax(
        start_logits):np.argmax(end_logits)+1]
    return " ".join(tokenizer.batch_decode(answer_ids))

In [12]:
# Generate an array of predictions and an array of true answers
answer_pred = [inference_answer(model, data["test"]["question"][idx],
                                data["test"]["context"][idx]) for idx in range(data["test"].shape[0])]
answer_true = [data["test"]["answers"][idx]["text"][0]
               for idx in range(data["test"].shape[0])]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
# Importing the evaluation library
import evaluate
bertscore = evaluate.load("bertscore")

results = bertscore.compute(predictions=answer_pred,
                            references=answer_true, lang="en")
# Embeddings bases evaluation
print(
    f"F1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

F1: 0.8959363570837217, Precision: 0.8822237264330142, Recall: 0.9181021975579663




In [14]:
# SQuAD v2.0 evaluation
squad_v2_metric = evaluate.load("squad_v2")
references = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [
    answer["text"][0]]}, "id": str(id)} for id, answer in zip(data["test"]["id"], data["test"]["answers"])]
predictions = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.}
               for id, answer in zip(data["test"]["id"], answer_pred)]
results = squad_v2_metric.compute(
    predictions=predictions, references=references)
results

{'exact': 34.57943925233645,
 'f1': 48.64069612291783,
 'total': 107,
 'HasAns_exact': 34.57943925233645,
 'HasAns_f1': 48.64069612291783,
 'HasAns_total': 107,
 'best_exact': 34.57943925233645,
 'best_exact_thresh': 0.0,
 'best_f1': 48.64069612291783,
 'best_f1_thresh': 0.0}

In [15]:
bleu = evaluate.load("bleu")
references = [[answer] for answer in answer_true]
predictions = answer_pred
# N-Gram based evaluation
results = bleu.compute(predictions=predictions, references=references)
results

{'bleu': 0.1360610522652627,
 'precisions': [0.2412831241283124,
  0.15548281505728315,
  0.10775047258979206,
  0.08478260869565217],
 'brevity_penalty': 1.0,
 'length_ratio': 2.597826086956522,
 'translation_length': 717,
 'reference_length': 276}

In [16]:
# Load the same model without fine-tuning on our dataset
model_no_ft = AutoModelForQuestionAnswering.from_pretrained(
    "distilbert-base-cased-distilled-squad").to(device)

In [17]:
# Generate another set of predictions
answer_pred_no_ft = [inference_answer(model_no_ft, data["test"]["question"]
                                      [idx], data["test"]["context"][idx]) for idx in range(data["test"].shape[0])]

In [18]:
results = bertscore.compute(
    predictions=answer_pred_no_ft, references=answer_true, lang="en")
# Embeddings bases evaluation
print(
    f"F1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

F1: 0.8843503432853199, Precision: 0.869496658026615, Recall: 0.915677343016473




In [19]:
references = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [
    answer["text"][0]]}, "id": str(id)} for id, answer in zip(data["test"]["id"], data["test"]["answers"])]
predictions = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.}
               for id, answer in zip(data["test"]["id"], answer_pred_no_ft)]
# SQuAD v2.0 evaluation
results = squad_v2_metric.compute(
    predictions=predictions, references=references)
results

{'exact': 31.77570093457944,
 'f1': 46.81653897165788,
 'total': 107,
 'HasAns_exact': 31.77570093457944,
 'HasAns_f1': 46.81653897165788,
 'HasAns_total': 107,
 'best_exact': 31.77570093457944,
 'best_exact_thresh': 0.0,
 'best_f1': 46.81653897165788,
 'best_f1_thresh': 0.0}

In [20]:
references = [[answer] for answer in answer_true]
predictions = answer_pred_no_ft
# N-Gram based evaluation
results = bleu.compute(predictions=predictions, references=references)
results

{'bleu': 0.10917104830220783,
 'precisions': [0.22283356258596973,
  0.13344051446945338,
  0.08333333333333333,
  0.05732484076433121],
 'brevity_penalty': 1.0,
 'length_ratio': 2.6340579710144927,
 'translation_length': 727,
 'reference_length': 276}

In [21]:
# Locally save the model
trainer.save_model(f"./distilbert-nlb-qa-{year}")