In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, Trainer, TrainingArguments, set_seed
from datasets import *
import numpy as np
import torch
import ast

torch.cuda.empty_cache()

In [2]:
# Fix the random seed; the same value is reused for the train/test split.
SEED = 42
set_seed(SEED)

# --- Dataset selection ------------------------------------------------------
# Option 1: a single yearly report. Any combination of these years and
# dataset types can be used.
# year = 2020
year = 2022
# dataset_type = "full"
dataset_type = "smaller"

# Option 2: the combined report. Only this combination of year and dataset
# types can be used. The loading cell always reads the "2042" file for these
# types, so `year` only ends up in the output directory name.
# NOTE: the loading cell compares against the hyphenated spellings.
# year = 2042 # arbitrary placeholder year used in the combined file name
# dataset_type = "full-combined"
# dataset_type = "smaller-combined"

# Option 3: the handwritten (expert) QA pairs. Only this combination of year
# and dataset type can be used.
# year = 2022
# dataset_type = "handwritten"

# Directory under which the fine-tuned model is written.
local_models_path = '../../data/models/BERT'

# Pretrained checkpoint to fine-tune. The training cell only supports names
# containing "roberta" or "distilbert".
# model_name = 'distilbert-base-cased-distilled-squad'
model_name = "deepset/roberta-base-squad2"

### Load tokenizer and model

In [3]:
# Load the tokenizer and the question-answering model for the checkpoint
# selected by `model_name`. Both are module-level names used by later cells
# (`tokenizer` by the tokenization function, `model` by the Trainer).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

### Load the dataset

In [4]:
# Load the dataset from file and split it into train and test datasets.
#
# Supported values of `dataset_type`:
#   "full" / "smaller"                   -> report of the configured `year`
#   "full-combined" / "smaller-combined" -> combined report (file tagged 2042)
#   "handwritten"                        -> expert-written 2022 QA pairs
# The "smaller*" variants keep only the first half of the training split.
# Underscore spellings (e.g. "full_combined") are accepted as aliases so the
# options listed in the configuration cell work as written.
_clean_data_dir = "../../data/clean"
_dataset_kind = dataset_type.replace("_", "-")

if _dataset_kind in ("full", "smaller"):
    _csv_path = f"{_clean_data_dir}/sustainability-report-{year}-squad-format.csv"
elif _dataset_kind in ("full-combined", "smaller-combined"):
    _csv_path = f"{_clean_data_dir}/sustainability-report-2042-squad-format.csv"
elif _dataset_kind == "handwritten":
    _csv_path = f"{_clean_data_dir}/QA_SR_2022_Expert-squad-format.csv"
else:
    # ValueError is a subclass of Exception, so existing handlers still match.
    raise ValueError(f"Invalid dataset type: {dataset_type!r}")

# The CSV is exposed by `load_dataset` as a single "train" split; the actual
# 70/30 train/test split is made here, seeded for reproducibility.
_raw_dataset = load_dataset("csv", data_files=_csv_path, delimiter=";", split="train")
data = _raw_dataset.train_test_split(test_size=0.3, shuffle=True, seed=SEED)

if _dataset_kind.startswith("smaller"):
    # Keep only the first half of the (already shuffled) training examples.
    data["train"] = data["train"].select(range(len(data["train"]) // 2))

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-a8af1b4c8d81fb1c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-7d8841dd72615495.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8b1142ae44904f4e.arrow


In [5]:
# Reformat the train and test sets so that they adhere to the SQuAD format
# (reading from CSV loads the "answers" column as a string, not as the
# expected object).
def _to_squad_example(example):
    """Convert one raw row into a SQuAD-style example.

    The ``answers`` value is parsed with ``ast.literal_eval`` when it is still
    a string; an already-parsed dict is used as is, which makes this cell safe
    to re-run. Every ``"\\n"`` in the question, the context and the first
    answer text is replaced by a single space (same length, so the character
    offsets in ``answer_start`` stay valid).

    Args:
        example: mapping with ``question``, ``context`` and ``answers`` keys.

    Returns:
        dict with the cleaned ``question`` and ``context`` and an ``answers``
        dict holding a one-element ``text`` list and the original
        ``answer_start`` list.
    """
    answers = example["answers"]
    if isinstance(answers, str):
        answers = ast.literal_eval(answers)
    return {
        "question": example["question"].replace("\n", " "),
        "context": example["context"].replace("\n", " "),
        "answers": {
            # Only the first answer text is kept, as before.
            "text": [answers["text"][0].replace("\n", " ")],
            "answer_start": answers["answer_start"],
        },
    }


for _split in ("test", "train"):
    _converted = data[_split].map(_to_squad_example)
    # Drop flat "text"/"answer_start" columns if the dataset carries any, so
    # the resulting columns match what the rest of the notebook expects.
    _leftovers = [c for c in ("text", "answer_start") if c in _converted.column_names]
    data[_split] = _converted.remove_columns(_leftovers) if _leftovers else _converted

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-36096c12f970e2cd.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b621ad6a176502d0.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c2ee3b08607cceb8.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-53601432e8341005.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

In [6]:
def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map a character-level answer span onto token indices of one window.

    Args:
        offsets: per-token ``(start_char, end_char + 1)`` pairs of the window.
        sequence_ids: per-token sequence ids of the window; question tokens
            are 0, context tokens are 1, everything else is ``None``.
        start_char: index of the first answer character in the context.
        end_char: index of the last answer character (inclusive).

    Returns:
        ``(start_token, end_token)``, or ``(-1, -1)`` when the answer is not
        fully contained in the context part of this window.
    """
    n_tokens = len(sequence_ids)

    # Find the first and the last token that belong to the context.
    context_start = 0
    while context_start < n_tokens and sequence_ids[context_start] != 1:
        context_start += 1
    if context_start == n_tokens:
        # No context token at all in this window.
        return -1, -1
    context_end = context_start
    while context_end + 1 < n_tokens and sequence_ids[context_end + 1] == 1:
        context_end += 1

    # Note: the second element of an offset is end_char + 1.
    if not (offsets[context_start][0] <= start_char and end_char < offsets[context_end][1]):
        return -1, -1

    # Start token: first context token starting at start_char, otherwise the
    # token just before the first one that starts after it. The scan is
    # bounded by context_end: special/padding tokens have offset (0, 0), so an
    # unbounded scan would run off the end of the window whenever the answer
    # starts inside the last context token.
    idx = context_start
    while idx <= context_end and offsets[idx][0] < start_char:
        idx += 1
    if idx > context_end or offsets[idx][0] != start_char:
        idx -= 1
    start_token = idx

    # End token: last context token ending at end_char + 1, otherwise the
    # token just after the last one that ends before it.
    idx = context_end
    while idx >= context_start and offsets[idx][1] > end_char + 1:
        idx -= 1
    if idx < context_start or offsets[idx][1] != end_char + 1:
        idx += 1
    end_token = idx

    return start_token, end_token


def tokenize_sample_data(data, max_length=384, stride=128):
    """Tokenize a batch of QA examples and attach answer token positions.

    Uses the module-level ``tokenizer``. Contexts longer than ``max_length``
    are split into overlapping windows, so the result can contain more rows
    than the input batch.

    Args:
        data: batch with ``question`` and ``context`` lists and an ``answers``
            list of dicts holding ``text`` and ``answer_start`` lists.
        max_length: number of tokens per window (rows are padded to it).
        stride: number of overlapping tokens between consecutive windows.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions``. Both are -1 for a window that does not fully
        contain the answer, or for an example without any answer.
        NOTE(review): no cell in view post-processes the -1 rows; confirm
        how the model's loss treats them.
    """
    # Tokenize the data
    tokenized_feature = tokenizer(
        data["question"],
        data["context"],
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=stride,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    # When a context overflows, several rows are returned for one example;
    # this maps every returned row back to its original sample index.
    sample_mapping = tokenized_feature.pop("overflow_to_sample_mapping")
    # [start_char, end_char + 1] for every token of every returned row.
    offset_mapping = tokenized_feature.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        answers = data["answers"][sample_mapping[i]]
        if len(answers["text"]) == 0 or len(answers["answer_start"]) == 0:
            # Example without an answer: label like "answer not in window".
            start_token, end_token = -1, -1
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0]) - 1
            start_token, end_token = _locate_answer_tokens(
                offset, tokenized_feature.sequence_ids(i), start_char, end_char)
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Build result
    tokenized_feature["start_positions"] = start_positions
    tokenized_feature["end_positions"] = end_positions
    return tokenized_feature


# Run conversion.
# Tokenization can return several rows per example (overflowing windows), so
# none of the original columns can be carried over. Derive the list from the
# dataset itself instead of hard-coding it per dataset type (the handwritten
# set has no "id" column, for instance).
remove_columns = data["train"].column_names
tokenized_ds = data.map(
    tokenize_sample_data,
    remove_columns=remove_columns,
    batched=True,
    batch_size=128)

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-865809dec1721449.arrow


In [7]:
# Output directory: <models path>/<checkpoint name>-finetuned-NLB-QA-<year>-<dataset type>
name = model_name.split("/")[-1]
output_dir = f"{local_models_path}/{name}-finetuned-NLB-QA-{year}-{dataset_type}"

# Set the data collator
data_collator = DefaultDataCollator()

# The two supported model families share every training setting except the
# per-device batch size.
if "roberta" in model_name:
    per_device_batch_size = 8
elif "distilbert" in model_name:
    per_device_batch_size = 16
else:
    raise ValueError("Model name not supported")

# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead
    # of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    # Keep the trainer's seed tied to the notebook-wide SEED.
    seed=SEED,
    push_to_hub=False,
    load_best_model_at_end=True
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
)

In [8]:
# Fine-tune the model, then write the resulting model to `output_dir`.
# NOTE(review): in the recorded run below, eval_loss is lowest at epoch 3
# (~0.354) and rises afterwards, while the final train_loss is ~0.036. That
# looks like overfitting; consider fewer epochs or early stopping.
# NOTE(review): `load_best_model_at_end=True` is set, so presumably the best
# checkpoint rather than the last one is what gets saved. Confirm.
trainer.train()
trainer.save_model(output_dir)



  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.36988285183906555, 'eval_runtime': 0.6054, 'eval_samples_per_second': 176.753, 'eval_steps_per_second': 23.127, 'epoch': 1.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.36255428194999695, 'eval_runtime': 0.586, 'eval_samples_per_second': 182.605, 'eval_steps_per_second': 23.892, 'epoch': 2.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.3544045090675354, 'eval_runtime': 0.59, 'eval_samples_per_second': 181.342, 'eval_steps_per_second': 23.727, 'epoch': 3.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.4069691002368927, 'eval_runtime': 0.5772, 'eval_samples_per_second': 185.378, 'eval_steps_per_second': 24.255, 'epoch': 4.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.455201655626297, 'eval_runtime': 0.5825, 'eval_samples_per_second': 183.7, 'eval_steps_per_second': 24.036, 'epoch': 5.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5123716592788696, 'eval_runtime': 0.611, 'eval_samples_per_second': 175.134, 'eval_steps_per_second': 22.915, 'epoch': 6.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5021241903305054, 'eval_runtime': 0.5515, 'eval_samples_per_second': 194.003, 'eval_steps_per_second': 25.384, 'epoch': 7.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5525029897689819, 'eval_runtime': 0.604, 'eval_samples_per_second': 177.16, 'eval_steps_per_second': 23.18, 'epoch': 8.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5368020534515381, 'eval_runtime': 0.5712, 'eval_samples_per_second': 187.312, 'eval_steps_per_second': 24.508, 'epoch': 9.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5554725527763367, 'eval_runtime': 0.5734, 'eval_samples_per_second': 186.612, 'eval_steps_per_second': 24.416, 'epoch': 10.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5286325812339783, 'eval_runtime': 0.559, 'eval_samples_per_second': 191.407, 'eval_steps_per_second': 25.044, 'epoch': 11.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5565110445022583, 'eval_runtime': 0.5771, 'eval_samples_per_second': 185.41, 'eval_steps_per_second': 24.259, 'epoch': 12.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5881377458572388, 'eval_runtime': 0.8711, 'eval_samples_per_second': 122.83, 'eval_steps_per_second': 16.071, 'epoch': 13.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5846410393714905, 'eval_runtime': 0.801, 'eval_samples_per_second': 133.583, 'eval_steps_per_second': 17.478, 'epoch': 14.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5345399379730225, 'eval_runtime': 0.9946, 'eval_samples_per_second': 107.585, 'eval_steps_per_second': 14.077, 'epoch': 15.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5486778020858765, 'eval_runtime': 0.853, 'eval_samples_per_second': 125.439, 'eval_steps_per_second': 16.413, 'epoch': 16.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.6030770540237427, 'eval_runtime': 0.8116, 'eval_samples_per_second': 131.835, 'eval_steps_per_second': 17.25, 'epoch': 17.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5826606154441833, 'eval_runtime': 0.8536, 'eval_samples_per_second': 125.349, 'eval_steps_per_second': 16.401, 'epoch': 18.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5690815448760986, 'eval_runtime': 0.8159, 'eval_samples_per_second': 131.138, 'eval_steps_per_second': 17.158, 'epoch': 19.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5517743229866028, 'eval_runtime': 0.8179, 'eval_samples_per_second': 130.825, 'eval_steps_per_second': 17.117, 'epoch': 20.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5442247986793518, 'eval_runtime': 0.8145, 'eval_samples_per_second': 131.376, 'eval_steps_per_second': 17.189, 'epoch': 21.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5446170568466187, 'eval_runtime': 1.1067, 'eval_samples_per_second': 96.687, 'eval_steps_per_second': 12.651, 'epoch': 22.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5450494289398193, 'eval_runtime': 0.8261, 'eval_samples_per_second': 129.518, 'eval_steps_per_second': 16.946, 'epoch': 23.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5432375073432922, 'eval_runtime': 0.9543, 'eval_samples_per_second': 112.122, 'eval_steps_per_second': 14.67, 'epoch': 24.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5431069135665894, 'eval_runtime': 0.9449, 'eval_samples_per_second': 113.234, 'eval_steps_per_second': 14.816, 'epoch': 25.0}
{'train_runtime': 529.9185, 'train_samples_per_second': 5.85, 'train_steps_per_second': 0.755, 'train_loss': 0.036245877742767336, 'epoch': 25.0}
