In [1]:
from transformers import DistilBertTokenizerFast
from datasets import *
import torch
import ast
import numpy as np

In [2]:
data = load_dataset('csv', data_files="../data/clean/sustainability-report-2020-squad-format.csv", delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True)

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-6a9a3e730f68f403/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [3]:
data["test"] = data["test"].map(lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["train"].remove_columns(["text", "answer_start"])

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'context', 'answers', 'id'],
    num_rows: 129
})

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

In [5]:
def tokenize_sample_data(data):
  # Tokenize
  tokenized_feature = tokenizer(
    data["question"],
    data["context"],
    max_length = 384,
    return_overflowing_tokens=True,
    stride=128,
    truncation="only_second",
    padding = "max_length",
    return_offsets_mapping=True,
  )

  # When it overflows, multiple rows will be returned for a single example.
  # The following then gets the array of corresponding the original sample index.
  sample_mapping = tokenized_feature.pop("overflow_to_sample_mapping")
  # Get the array of [start_char, end_char + 1] in each token.
  # The shape is [returned_row_size, max_length]
  offset_mapping = tokenized_feature.pop("offset_mapping")

  start_positions = []
  end_positions = []
  for i, offset in enumerate(offset_mapping):
    sample_index = sample_mapping[i]
    answers = data["answers"][sample_index]
    start_char = answers["answer_start"][0]
    end_char = start_char + len(answers["text"][0]) - 1
    # The format of sequence_ids is [None, 0, ..., 0, None, None, 1, ..., 1, None, None, ...]
    # in which question's token is 0 and contex's token is 1
    sequence_ids = tokenized_feature.sequence_ids(i)
    # find the start and end index of context
    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1
    # Set start positions and end positions in inputs_ids
    # Note: The second element in offset is end_char + 1
    # if offset[context_start][0] > end_char or offset[context_end][1] <= start_char:
    if not (offset[context_start][0] <= start_char and end_char < offset[context_end][1]):
      # The case that answer is not inside the context
      ## Note : Some tokenizer (such as, tokenizer in rinna model) doesn't place CLS
      ## for the first token in sequence, and I then set -1 as positions.
      ## (Later I'll process rows with start_positions=-1.)
      start_positions.append(-1)
      end_positions.append(-1)
    else:
      # The case that answer is found in the context

      # Set start position
      idx = context_start
      while offset[idx][0] < start_char:
        idx += 1
      if offset[idx][0] == start_char:
        start_positions.append(idx)
      else:
        start_positions.append(idx - 1)

      # Set end position
      idx = context_end
      while offset[idx][1] > end_char + 1:
        idx -= 1
      if offset[idx][1] == end_char + 1:
        end_positions.append(idx)
      else:
        end_positions.append(idx + 1)

  # Build result
  tokenized_feature["start_positions"] = start_positions
  tokenized_feature["end_positions"] = end_positions   
  return tokenized_feature

# Run conversion
tokenized_ds = data.map(
  tokenize_sample_data,
  remove_columns=["id", "context", "question", "answers"],
  batched=True,
  batch_size=128)

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [6]:
tokenized_ds = tokenized_ds.filter(lambda x: x["start_positions"] != -1)

Filter:   0%|          | 0/129 [00:00<?, ? examples/s]

Filter:   0%|          | 0/56 [00:00<?, ? examples/s]

In [7]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'answer_start', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 129
    })
    test: Dataset({
        features: ['text', 'answer_start', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 56
    })
})

In [15]:
from transformers import AutoModelForQuestionAnswering

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad").to(device)

In [16]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir = "distilbert-nlb-qa",
  log_level = "error",
  num_train_epochs = 10,
  learning_rate = 2e-5,
  lr_scheduler_type = "linear",
  warmup_steps = 2,
  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 16,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 2,
  save_steps = 2,
  logging_steps = 2,
  push_to_hub = False
)

In [18]:
from transformers import Trainer

trainer = Trainer(
  model = model,
  args = training_args,
  data_collator = data_collator,
  train_dataset = tokenized_ds["train"],
  eval_dataset = tokenized_ds["test"],
  tokenizer = tokenizer,
)

trainer.train()



{'loss': 0.4372, 'learning_rate': 2e-05, 'epoch': 1.78}
{'eval_loss': 0.6099743247032166, 'eval_runtime': 0.2752, 'eval_samples_per_second': 203.451, 'eval_steps_per_second': 14.532, 'epoch': 1.78}
{'loss': 0.1798, 'learning_rate': 1.5000000000000002e-05, 'epoch': 3.0}
{'eval_loss': 0.5812585949897766, 'eval_runtime': 0.2742, 'eval_samples_per_second': 204.194, 'eval_steps_per_second': 14.585, 'epoch': 3.0}
{'loss': 0.1233, 'learning_rate': 1e-05, 'epoch': 4.0}
{'eval_loss': 0.5635116696357727, 'eval_runtime': 0.2752, 'eval_samples_per_second': 203.452, 'eval_steps_per_second': 14.532, 'epoch': 4.0}
{'loss': 0.1481, 'learning_rate': 5e-06, 'epoch': 5.33}
{'eval_loss': 0.5557196736335754, 'eval_runtime': 0.2758, 'eval_samples_per_second': 203.041, 'eval_steps_per_second': 14.503, 'epoch': 5.33}
{'loss': 0.1892, 'learning_rate': 0.0, 'epoch': 7.0}
{'eval_loss': 0.5533846616744995, 'eval_runtime': 0.2732, 'eval_samples_per_second': 204.942, 'eval_steps_per_second': 14.639, 'epoch': 7.0}
{

TrainOutput(global_step=10, training_loss=0.21552665084600447, metrics={'train_runtime': 44.7201, 'train_samples_per_second': 28.846, 'train_steps_per_second': 0.224, 'train_loss': 0.21552665084600447, 'epoch': 7.0})

In [19]:
trainer.evaluate()

{'eval_loss': 0.5533846616744995, 'eval_runtime': 0.5655, 'eval_samples_per_second': 99.026, 'eval_steps_per_second': 7.073, 'epoch': 7.0}


{'eval_loss': 0.5533846616744995,
 'eval_runtime': 0.5655,
 'eval_samples_per_second': 99.026,
 'eval_steps_per_second': 7.073,
 'epoch': 7.0}

In [20]:
import torch

def inference_answer(model, question, context):
  question = question
  context = context
  test_feature = tokenizer(
    question,
    context,
    max_length=318,
  )
  with torch.no_grad():
    outputs = model(torch.tensor([test_feature["input_ids"]]).to(device))
  start_logits = outputs.start_logits.cpu().numpy()
  end_logits = outputs.end_logits.cpu().numpy()
  answer_ids = test_feature["input_ids"][np.argmax(start_logits):np.argmax(end_logits)+1]
  return " ".join(tokenizer.batch_decode(answer_ids))


In [21]:
answer_pred = [inference_answer(model, data["test"]["question"][idx], data["test"]["context"][idx]) for idx in range(data["test"].shape[0])]
answer_true = [data["test"]["answers"][idx]["text"][0] for idx in range(data["test"].shape[0])]

In [22]:
import evaluate
bertscore = evaluate.load("bertscore")

results = bertscore.compute(predictions=answer_pred, references=answer_true, lang="en")
# Embeddings bases
print(f"F1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

F1: 0.8967132440635136, Precision: 0.8799982922417777, Recall: 0.9153822979756764




In [23]:
squad_v2_metric = evaluate.load("squad_v2")
references = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data["test"]["id"], data["test"]["answers"])]
predictions = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data["test"]["id"], answer_pred)]
results = squad_v2_metric.compute(predictions=predictions, references=references)
results

{'exact': 32.142857142857146,
 'f1': 51.01089152075996,
 'total': 56,
 'HasAns_exact': 32.142857142857146,
 'HasAns_f1': 51.01089152075996,
 'HasAns_total': 56,
 'best_exact': 32.142857142857146,
 'best_exact_thresh': 0.0,
 'best_f1': 51.01089152075996,
 'best_f1_thresh': 0.0}

In [35]:
bleu = evaluate.load("bleu")
references = [[answer] for answer in answer_true]
predictions = answer_pred
# N-Gram based
results = bleu.compute(predictions=predictions, references=references)
results

{'bleu': 0.12807288493596483,
 'precisions': [0.2621145374449339,
  0.17293233082706766,
  0.10057471264367816,
  0.05901639344262295],
 'brevity_penalty': 1.0,
 'length_ratio': 2.4148936170212765,
 'translation_length': 454,
 'reference_length': 188}

In [25]:
model_no_ft = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad").to(device)

In [31]:
answer_pred_no_ft = [inference_answer(model_no_ft, data["test"]["question"][idx], data["test"]["context"][idx]) for idx in range(data["test"].shape[0])]

In [32]:
results = bertscore.compute(predictions=answer_pred_no_ft, references=answer_true, lang="en")
# Embeddings bases
print(f"F1: {np.array(results['f1']).mean()}, Precision: {np.array(results['precision']).mean()}, Recall: {np.array(results['recall']).mean()}")

F1: 0.8821883414472852, Precision: 0.8647412029760224, Recall: 0.9015512753810201




In [33]:
references = [{"answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]}, "id": str(id)} for id, answer in zip(data["test"]["id"], data["test"]["answers"])]
predictions = [{"id": str(id), "prediction_text": answer, "no_answer_probability": 0.} for id, answer in zip(data["test"]["id"], answer_pred_no_ft)]
results = squad_v2_metric.compute(predictions=predictions, references=references)
results

{'exact': 30.357142857142858,
 'f1': 51.58764396086644,
 'total': 56,
 'HasAns_exact': 30.357142857142858,
 'HasAns_f1': 51.58764396086644,
 'HasAns_total': 56,
 'best_exact': 30.357142857142858,
 'best_exact_thresh': 0.0,
 'best_f1': 51.58764396086644,
 'best_f1_thresh': 0.0}

In [34]:
references = [[answer] for answer in answer_true]
predictions = answer_pred_no_ft
# N-Gram based
results = bleu.compute(predictions=predictions, references=references)
results

{'bleu': 0.1426860827382191,
 'precisions': [0.27901785714285715,
  0.18781725888324874,
  0.11337209302325581,
  0.06976744186046512],
 'brevity_penalty': 1.0,
 'length_ratio': 2.382978723404255,
 'translation_length': 448,
 'reference_length': 188}