In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

In [2]:
import collections
import evaluate

import pandas as pd
import numpy as np
import torch

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, create_optimizer
from transformers.keras_callbacks import PushToHubCallback
from transformers import TrainingArguments, Trainer

#from google.colab import drive
#drive.mount('/content/drive')

from huggingface_hub import notebook_login
notebook_login() # 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

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load Data and remove external Question Answer pairs

In [3]:
df = pd.read_excel('/content/WOT_internal_refined_4.xlsx')

df['comment'] = df['comment'].fillna('')

# remove external Question pairs
df['not external'] = df['comment'].apply(lambda x: 'external' not in x.split(' + '))
df = df[df['not external']]

df = df[['domain', 'data_split', 'question', 'history', 'Context', 'answer_extr', 'answer_start', 'answer_end', 'number_answer_elements', 'comment']].rename(columns={'Context': 'context', 'answer_extr': 'answers'})

df.to_csv('/content/temp.csv')
df = pd.read_csv('/content/temp.csv').rename(columns={'Unnamed: 0': 'id'})
df['id'] = df['id'].astype(str)

df['answers'] = df['answers'].apply(lambda x: x[2:-2].split("\', \'"))

df['answer_start'] = df['answer_start'].apply(lambda x: x[1:-1].split(", "))
df['answer_start'] = df['answer_start'].apply(lambda x: [-1] if x[0]=='' else x)
df['answer_start'] = df['answer_start'].apply(lambda x: [int(el) for el in x])

df['answer_end'] = df['answer_end'].apply(lambda x: x[1:-1].split(", "))
df['answer_end'] = df['answer_end'].apply(lambda x: [-1] if x[0]=='' else x)
df['answer_end'] = df['answer_end'].apply(lambda x: [int(el) for el in x])

df['answers'] = df.apply(lambda row: {'text': row['answers'], 'answer_start': row['answer_start']} , axis=1)

## Define Contexts to feed to the model


In [4]:
df['context'] = df.apply(lambda row: f"{row['context']}\nHistory: {row['history']}\n", axis=1)

## One span only

In [5]:
df_1_span = df[df['number_answer_elements'] == 1]

ds_1_span_train = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='train'].reset_index())
ds_1_span_test = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='test'].reset_index())
ds_1_span_val = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='validation'].reset_index())

ds_1_span = DatasetDict({'train': ds_1_span_train,
                         'test': ds_1_span_test,
                         'validation': ds_1_span_val})
print(ds_1_span)

DatasetDict({
    train: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'answer_end', 'number_answer_elements', 'comment'],
        num_rows: 661
    })
    test: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'answer_end', 'number_answer_elements', 'comment'],
        num_rows: 64
    })
    validation: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'answer_end', 'number_answer_elements', 'comment'],
        num_rows: 72
    })
})


In [6]:
df_1_span['control'] = df_1_span.apply(lambda row: row['context'][row['answer_start'][0]:row['answer_end'][0]], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1_span['control'] = df_1_span.apply(lambda row: row['context'][row['answer_start'][0]:row['answer_end'][0]], axis=1)


In [7]:
model_checkpoint = "distilbert-base-cased-distilled-squad" #"bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

### Prepare Training Data

In [8]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start a--nd end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
train_dataset = ds_1_span["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=ds_1_span["train"].column_names,
)
len(ds_1_span["train"]), len(train_dataset)

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

(661, 3110)

### Prepare Validation Data

In [10]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [11]:
validation_dataset = ds_1_span["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds_1_span["validation"].column_names,
)

test_dataset = ds_1_span["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds_1_span["test"].column_names,
)

train_test_dataset = ds_1_span["train"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds_1_span["train"].column_names,
)

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

### Evaluation Function

In [12]:
from tqdm.auto import tqdm

n_best = 20
max_answer_length = 128
predicted_answers = []
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    #return [predicted_answers, theoretical_answers]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

### Not Tuned

In [13]:
small_eval_set = ds_1_span["test"]
trained_checkpoint = "distilbert-base-cased-distilled-squad" #"bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds_1_span["test"].column_names,
)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)

start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

compute_metrics(start_logits, end_logits, eval_set, ds_1_span['test'])

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/261M [00:00<?, ?B/s]

  0%|          | 0/64 [00:00<?, ?it/s]

{'exact_match': 3.125, 'f1': 16.825755767918658}

### Fine Tune

In [37]:
per_device_eval_batch_size = 32     #64
per_device_train_batch_size = 16     #16
gradient_accumulation_steps = 2 

data_collator = DefaultDataCollator(return_tensors="pt")
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

args = TrainingArguments(
    "bert-finetuned-WoT",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/a8440a9ab1b3d7a603df3349516078434abade1e/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/a8440a9ab1b3d7a603df3349516078434abade1e/vocab.txt
loading file tokenizer.json from cac

Epoch,Training Loss,Validation Loss
0,No log,No log
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 289
  Batch size = 32
Saving model checkpoint to bert-finetuned-WoT/checkpoint-97
Configuration saved in bert-finetuned-WoT/checkpoint-97/config.json
Model weights saved in bert-finetuned-WoT/checkpoint-97/pytorch_model.bin
tokenizer config file saved in bert-finetuned-WoT/checkpoint-97/tokenizer_config.json
Special tokens file saved in bert-finetuned-WoT/checkpoint-97/special_tokens_map.json
tokenizer config file saved in bert-finetuned-WoT/tokenizer_config.json
Special tokens file saved in bert-finetuned-WoT/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForQuestionAns

TrainOutput(global_step=485, training_loss=0.6749360074701998, metrics={'train_runtime': 407.293, 'train_samples_per_second': 38.179, 'train_steps_per_second': 1.191, 'total_flos': 1523153840173056.0, 'train_loss': 0.6749360074701998, 'epoch': 4.99})

In [38]:
trainer.push_to_hub()

Saving model checkpoint to bert-finetuned-WoT
Configuration saved in bert-finetuned-WoT/config.json
Model weights saved in bert-finetuned-WoT/pytorch_model.bin
tokenizer config file saved in bert-finetuned-WoT/tokenizer_config.json
Special tokens file saved in bert-finetuned-WoT/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file runs/Feb27_23-29-57_512893f5f46d/events.out.tfevents.1677540603.512893f5f46d.1086.14: 100%|#######…

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/Niklas25/bert-finetuned-WoT
   d1bfc4e..d715dbc  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Niklas25/bert-finetuned-WoT
   d1bfc4e..d715dbc  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}}
To https://huggingface.co/Niklas25/bert-finetuned-WoT
   d715dbc..a11b924  main -> main

   d715dbc..a11b924  main -> main



'https://huggingface.co/Niklas25/bert-finetuned-WoT/commit/d715dbccb2ac919f701e48acc940572f608104bc'

In [39]:
predictions, _, _ = trainer.predict(test_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, test_dataset, ds_1_span["test"])

The following columns in the test set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 296
  Batch size = 32


  0%|          | 0/64 [00:00<?, ?it/s]

{'exact_match': 10.9375, 'f1': 26.272160858801136}