In [None]:
# download
!mkdir data
!wget -P data/ https://s3.amazonaws.com/my89public/quac/train_v0.2.json --no-check-certificate
!wget -P data/ https://s3.amazonaws.com/my89public/quac/val_v0.2.json --no-check-certificate

# preprocessing
!python toolbox/download_process_quac.py --quac_file data/train_v0.2.json --output_file data/quac_train.json
!python toolbox/download_process_quac.py --quac_file data/val_v0.2.json --output_file data/quac_dev.json

In [1]:
import pandas as pd
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

import torch.nn as nn
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, DataCollatorWithPadding
from transformers import BertTokenizerFast, LongformerTokenizerFast
from transformers import BertForQuestionAnswering, LongformerForQuestionAnswering

In [2]:
def add_end_pos(answers, docs):
    """Attach an exclusive character end offset to every answer.

    For each (answer, context) pair the end offset is computed as
    ``answer['answer_start'] + len(answer['text'])``, checked against the
    context, and stored under ``answer['answer_end']``.

    Args:
        answers: list of dicts holding at least ``'answer_start'`` and
            ``'text'``. The dicts are modified in place.
        docs: list of context strings, aligned index-by-index with ``answers``.

    Returns:
        The same ``answers`` list; every dict now carries ``'answer_end'``.

    Raises:
        ValueError: if the context slice at the recorded offsets differs from
            the answer text.
    """
    for index, (answer, context) in enumerate(zip(answers, docs)):
        start_pos = answer['answer_start']
        end_pos = start_pos + len(answer['text'])
        if context[start_pos:end_pos] != answer['text']:
            # A misaligned span must fail loudly: continuing would leave this
            # answer without 'answer_end' and break the later token mapping.
            raise ValueError(
                "answer {} does not match its context at [{}:{}]: expected {!r}, found {!r}".format(
                    index, start_pos, end_pos, answer['text'], context[start_pos:end_pos]
                )
            )
        answer['answer_end'] = end_pos
    return answers

def extract_info(data):
    """Flatten QuAC dialog records into parallel, column-oriented lists.

    Every item of ``data`` must be a one-entry mapping whose ``'paragraphs'``
    list holds exactly one paragraph carrying exactly one question; these
    expectations are enforced with assertions.

    Args:
        data: iterable of dialog records.

    Returns:
        dict with the keys ``'questions'``, ``'docs'``, ``'answers'``,
        ``'ids'``, ``'is_impossible'``, ``'yesno'``, ``'followups'`` and
        ``'answer_candidates'``, each mapping to a list with one element per
        dialog. Only the first answer of each question is kept, and the
        answers have been passed through ``add_end_pos``.
    """
    columns = {
        'questions': [],
        'docs': [],
        'answers': [],
        'ids': [],
        'is_impossible': [],
        'yesno': [],
        'followups': [],
        'answer_candidates': [],
    }

    for record in data:
        # Structural sanity checks on the preprocessed layout.
        assert len(record) == 1
        paragraphs = record['paragraphs']
        assert len(paragraphs) == 1
        paragraph = paragraphs[0]
        assert len(paragraph['qas']) == 1

        # document/context
        columns['docs'].append(paragraph['context'])

        # question-level fields
        question_entry = paragraph['qas'][0]
        columns['ids'].append(question_entry['id'])
        columns['questions'].append(question_entry['question'])
        columns['answers'].append(question_entry['answers'][0])
        columns['is_impossible'].append(question_entry['is_impossible'])
        columns['yesno'].append(question_entry['yesno'])
        columns['followups'].append(question_entry['followup'])
        columns['answer_candidates'].append(question_entry['answer_candidates'])

    # Add the character end offset to each collected answer.
    columns['answers'] = add_end_pos(columns['answers'], columns['docs'])
    return columns


def load_process_data(train_dir, val_dir):
    """Load the preprocessed train/validation JSON files into a ``DatasetDict``.

    Args:
        train_dir: path of the training JSON file (read with ``pd.read_json``).
        val_dir: path of the validation JSON file.

    Returns:
        ``DatasetDict`` with the splits ``'train'`` and ``'validation'``, each
        built from the columns returned by ``extract_info``.
    """
    split_paths = (('train', train_dir), ('validation', val_dir))

    # Stage 1: read the 'data' column of both files.
    raw_records = {split: pd.read_json(path)['data'] for split, path in split_paths}
    # Stage 2: flatten each split into parallel column lists.
    flat_columns = {split: extract_info(records) for split, records in raw_records.items()}
    # Stage 3: wrap the columns as datasets and bundle the splits.
    return DatasetDict({split: Dataset.from_dict(cols) for split, cols in flat_columns.items()})

In [3]:
def add_token_positions(val, answers):
    """Translate character-level answer spans into token indices.

    Args:
        val: batch encoding exposing
            ``char_to_token(batch_index, char_index, sequence_index=...)``.
        answers: list of dicts with ``'answer_start'`` and the exclusive
            ``'answer_end'`` character offsets, one per example in the batch.

    Returns:
        Tuple ``(start_positions, end_positions)``: two lists with exactly one
        entry per answer.

    Raises:
        ValueError: if an answer has a missing or non-positive
            ``'answer_end'``; such a span has no last character to map.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        end_char = answer['answer_end']
        # This should never happen. Fail before appending anything so the two
        # output lists can never drift out of alignment.
        if end_char is None or end_char <= 0:
            raise ValueError(
                "answer {} has invalid answer_end {!r}".format(i, end_char)
            )

        # sequence_index=1 restricts the lookup to the second tokenized
        # sequence (the document, given how encode() calls the tokenizer).
        start_token = val.char_to_token(i, answer['answer_start'], sequence_index=1)
        # answer_end is exclusive, so the last answer character is at end - 1.
        end_token = val.char_to_token(i, end_char - 1, sequence_index=1)

        # If None, the answer passage has been truncated. The placeholder
        # below is a stop-gap (flagged by the author as not a good approach);
        # it reads the module-level `tokenizer`.
        if start_token is None:
            print('start_positions[-1] is None')
            start_token = tokenizer.model_max_length

        if end_token is None:
            print('end_positions[-1] is None')
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    return start_positions, end_positions

def encode(examples):
    """Batched mapping function: tokenize question/document pairs and label them.

    Questions and documents are tokenized as sequence pairs with truncation
    and padding to a fixed length of 512. The answer spans are then converted
    to token indices and stored as ``start_positions`` / ``end_positions``.
    Relies on the module-level ``tokenizer``.

    Args:
        examples: batch mapping with the columns ``"questions"``, ``"docs"``
            and ``"answers"``.

    Returns:
        The tokenizer output extended with the two position columns.
    """
    tokenized = tokenizer(
        examples["questions"],
        examples["docs"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_special_tokens_mask=True,
    )

    starts, ends = add_token_positions(tokenized, examples["answers"])
    tokenized.update(start_positions=starts, end_positions=ends)
    return tokenized


In [4]:
# Preprocessed QuAC splits, as written by the preprocessing commands in the first cell.
train_dir = 'data/quac_train.json' 
val_dir = 'data/quac_dev.json' 
# Parse both files into a DatasetDict with 'train' and 'validation' splits.
dataset = load_process_data(train_dir, val_dir)

In [5]:
# Module-level tokenizer: `encode` and `add_token_positions` read this global,
# so this cell must run before the dataset is mapped.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', cache_dir="bert_base/")
# Alternative tokenizer, currently disabled:
#tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', cache_dir="longformer/")

In [None]:
# Tokenize each split in batches; `encode` adds the model inputs plus
# start_positions / end_positions columns.
train =  dataset["train"].map(encode, batched=True)
val =  dataset["validation"].map(encode, batched=True)

In [7]:
def prepare_model(trainSet,valSet,tokenizer):
    """Build a ``Trainer`` that fine-tunes BERT for question answering.

    Loads ``BertForQuestionAnswering`` from the ``bert-base-uncased``
    checkpoint, moves it to the GPU, and wires it to the given datasets with
    step-based evaluation and an early-stopping callback. Training itself is
    not started here.

    Args:
        trainSet: dataset passed to the trainer as ``train_dataset``.
        valSet: dataset passed to the trainer as ``eval_dataset``.
        tokenizer: tokenizer handed to the padding collator and the trainer.

    Returns:
        The configured ``Trainer`` instance.
    """
    # One label is used for both the output directory and the wandb run name.
    # NOTE(review): the label says "cased" while the checkpoint loaded below
    # is 'bert-base-uncased' -- confirm which one is intended.
    run_label = "bert-base-cased-finetuned-quac"

    qa_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', cache_dir="bert_base/")
    #qa_model = LongformerForQuestionAnswering.from_pretrained('allenai/longformer-base-4096', cache_dir="longformer/")
    qa_model.cuda()

    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print('preparing model...')

    # NOTE(review): evaluation runs every 10 steps and early stopping has a
    # patience of 3 evaluations -- confirm this cadence is intentional.
    training_options = dict(
        evaluation_strategy="steps",
        eval_steps=10,
        save_strategy="steps",
        learning_rate=2e-5,
        adafactor=True,
        per_device_train_batch_size=32,
        # gradient_accumulation_steps=4,  (disabled)
        per_device_eval_batch_size=20,
        logging_steps=50,
        num_train_epochs=50,
        group_by_length=True,
        weight_decay=0.01,
        fp16=True,
        # warmup_ratio=0.02,  (disabled)
        save_total_limit=3,
        load_best_model_at_end=True,
        report_to="wandb",
        run_name=run_label,
    )
    args = TrainingArguments(run_label, **training_options)

    stop_early = EarlyStoppingCallback(early_stopping_patience=3)
    trainer_quac = Trainer(
        model=qa_model,
        args=args,
        train_dataset=trainSet,
        eval_dataset=valSet,
        tokenizer=tokenizer,
        data_collator=padding_collator,
        callbacks=[stop_early],
    )

    print('ready to train!')
    return trainer_quac

In [8]:
# Build the trainer from the encoded splits and the module-level tokenizer.
trainer_quac = prepare_model(train,val,tokenizer)

# Start fine-tuning; the captured output of this call follows the cell.
trainer_quac.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

preparing model...


Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates. If questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.


ready to train!


***** Running training *****
  Num examples = 83568
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 130600
  Number of trainable parameters = 108893186
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtan3[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/130600 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates. If questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7354
  Batch size = 20


  0%|          | 0/368 [00:00<?, ?it/s]

{'eval_loss': 5.388714790344238, 'eval_runtime': 25.3004, 'eval_samples_per_second': 290.668, 'eval_steps_per_second': 14.545, 'epoch': 0.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates. If questions, followups, ids, answers, is_impossible, yesno, special_tokens_mask, docs, answer_candidates are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7354
  Batch size = 20


  0%|          | 0/368 [00:00<?, ?it/s]