In [None]:
# download
!mkdir data
!wget -P data/ https://s3.amazonaws.com/my89public/quac/train_v0.2.json --no-check-certificate
!wget -P data/ https://s3.amazonaws.com/my89public/quac/val_v0.2.json --no-check-certificate

# preprocessing
!python toolbox/download_process_quac.py --quac_file data/train_v0.2.json --output_file data/quac_train.json
!python toolbox/download_process_quac.py --quac_file data/val_v0.2.json --output_file data/quac_dev.json

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

import torch.nn as nn
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, DataCollatorWithPadding
from transformers import BertTokenizerFast, LongformerTokenizerFast
from transformers import BertForQuestionAnswering, LongformerForQuestionAnswering

In [2]:
def add_end_pos(answers, docs):
    """Attach an exclusive ``answer_end`` character offset to every answer.

    Args:
        answers: list of dicts, each holding ``'text'`` and ``'answer_start'``.
            The dicts are updated in place.
        docs: list of context strings aligned index-by-index with ``answers``.

    Returns:
        The same ``answers`` list; every dict now carries ``'answer_end'``
        equal to ``answer_start + len(text)``.

    Raises:
        ValueError: if the context slice ``[answer_start:answer_end]`` does not
            equal the answer text (the annotation is misaligned).
    """
    for idx, (answer, context) in enumerate(zip(answers, docs)):
        start = answer['answer_start']
        end_pos = start + len(answer['text'])
        if context[start:end_pos] != answer['text']:
            # The original built an Exception object without raising it, so a
            # misaligned answer silently ended up without 'answer_end'.
            raise ValueError(
                f"answer {idx}: text {answer['text']!r} not found at "
                f"context[{start}:{end_pos}]"
            )
        answer['answer_end'] = end_pos
    return answers

def extract_info(data):
    """Flatten dialog records into a dict of parallel, column-wise lists.

    Every record in ``data`` is asserted to contain exactly one top-level
    entry, one paragraph and one ``qas`` item. Only the first entry of
    ``qas['answers']`` is kept. After collection, the answers are passed
    through ``add_end_pos`` together with their contexts.

    Returns:
        dict with the keys ``questions``, ``docs``, ``answers``, ``ids``,
        ``is_impossible``, ``yesno``, ``followups`` and ``answer_candidates``,
        each mapping to a list with one element per record.
    """
    column_names = ('questions', 'docs', 'answers', 'ids',
                    'is_impossible', 'yesno', 'followups', 'answer_candidates')
    columns = {name: [] for name in column_names}

    for dialog in data:
        assert len(dialog)==1
        paragraphs = dialog['paragraphs']
        assert len(paragraphs)==1
        paragraph = paragraphs[0]
        assert len(paragraph['qas'])==1

        # document/context
        columns['docs'].append(paragraph['context'])

        # the single question/answer turn and its metadata
        turn = paragraph['qas'][0]
        columns['ids'].append(turn['id'])
        columns['questions'].append(turn['question'])
        columns['answers'].append(turn['answers'][0])
        columns['is_impossible'].append(turn['is_impossible'])
        columns['yesno'].append(turn['yesno'])
        columns['followups'].append(turn['followup'])
        columns['answer_candidates'].append(turn['answer_candidates'])

    columns['answers'] = add_end_pos(columns['answers'], columns['docs'])
    return columns

def load_process_data(train_dir, val_dir):
    """Load both QuAC splits and return them as a ``DatasetDict``.

    Args:
        train_dir: path of the training JSON file (read with ``pd.read_json``;
            its ``'data'`` column is used).
        val_dir: path of the validation JSON file, handled the same way.

    Returns:
        ``DatasetDict`` with the keys ``'train'`` and ``'validation'``.
    """
    # Each stage runs over train first, then validation, before the next stage starts.
    raw = {
        'train': pd.read_json(train_dir)['data'],
        'validation': pd.read_json(val_dir)['data'],
    }
    extracted = {split: extract_info(records) for split, records in raw.items()}
    cleaned = {split: process_cannoranswer(columns) for split, columns in extracted.items()}
    splits = {split: Dataset.from_dict(columns) for split, columns in cleaned.items()}
    return DatasetDict(splits)

def process_cannoranswer(dataset):
    """Remove the CANNOTANSWER sentinel from contexts and answers.

    Args:
        dataset: dict with at least ``'docs'`` (list of context strings) and
            ``'answers'`` (list of dicts with a ``'text'`` key). It is updated
            in place.

    Returns:
        The same dict. Contexts ending in the sentinel lose their last 13
        characters (presumably ``' CANNOTANSWER'``: one separator character
        plus the 12-character sentinel); answers whose text is the sentinel
        are replaced by an empty answer with ``answer_start == answer_end == 0``.
    """
    sentinel = 'CANNOTANSWER'
    tail_len = len(sentinel) + 1  # sentinel plus the separator before it (13)

    # Strip only when the sentinel is actually there. The original cut 13
    # characters unconditionally, which damaged contexts without the tail and
    # kept shrinking the text if the function was applied twice.
    dataset['docs'] = [
        doc[:-tail_len] if doc.endswith(sentinel) else doc
        for doc in dataset['docs']
    ]
    dataset['answers'] = [
        {'text': '', 'answer_start': 0, 'answer_end': 0}
        if answer['text'] == sentinel else answer
        for answer in dataset['answers']
    ]
    return dataset

In [3]:
def add_token_positions(val, answers):
    """Convert character-level answer spans into token indices.

    Args:
        val: encoding object exposing
            ``char_to_token(batch_index, char_index, sequence_index=...)``
            (presumably a fast-tokenizer ``BatchEncoding``).
        answers: list of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets; entry ``i`` belongs to encoded row ``i``.

    Returns:
        ``(start_positions, end_positions)``: two lists with one token index
        per answer. Unanswerable entries (start == end == 0) map to ``(0, 0)``,
        and so does any position for which ``char_to_token`` returns ``None``
        (the answer was truncated out of the encoded row).

    Raises:
        ValueError: if ``answer_end`` is 0 while ``answer_start`` is not.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']

        if start_char == 0 and end_char == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        if end_char == 0:
            # The original created an Exception here without raising it and
            # skipped the end-position append, leaving the two lists misaligned.
            raise ValueError(
                f"answer {i}: answer_end is 0 but answer_start is {start_char}"
            )

        # sequence_index=1 restricts the lookup to the second sequence of the pair.
        start_token = val.char_to_token(i, start_char, sequence_index=1)
        # answer_end is exclusive, so the last answer character is at end_char - 1.
        end_token = val.char_to_token(i, end_char - 1, sequence_index=1)

        # None means the character was truncated away; fall back to index 0.
        start_positions.append(0 if start_token is None else start_token)
        end_positions.append(0 if end_token is None else end_token)

    return start_positions, end_positions

max_length = 512 # Maximum length of one feature (question plus context chunk); passed to the tokenizer as `max_length`.
doc_stride = 64 # Overlap between two consecutive chunks when a long context has to be split; passed to the tokenizer as `stride`.

def add_token_positions_chunking(tokenized_examples, all_answers, cls_token_id=None):
    """Label every chunk of a chunked encoding with answer token positions.

    Args:
        tokenized_examples: encoding that provides the keys
            ``"overflow_to_sample_mapping"``, ``"offset_mapping"`` and
            ``"input_ids"`` and the method ``sequence_ids(i)``.
        all_answers: sequence of answer dicts with ``'answer_start'`` and
            ``'answer_end'`` character offsets; it is indexed with the values
            of ``overflow_to_sample_mapping``.
        cls_token_id: id of the CLS token. When ``None`` (the default) the
            notebook-level ``tokenizer.cls_token_id`` is used, as before.

    Returns:
        ``(start_positions, end_positions)``: one token index per chunk.
        Chunks of unanswerable questions (``answer_end == 0``) and chunks that
        do not fully contain the answer are labelled with the CLS token index.

    Raises:
        ValueError: if an answer has ``answer_end`` set to ``None``.
    """
    start_positions = []
    end_positions = []

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]

        # Impossible answers are labelled with the index of the CLS token.
        cls_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id
        cls_index = input_ids.index(cls_id)

        # Sequence ids tell question tokens (0) from context tokens (1).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can yield several chunks; find the answer of the example
        # this chunk was cut from.
        sample_index = sample_mapping[i]
        answers = all_answers[sample_index]

        if answers["answer_end"] is None:
            # The original built an Exception without raising it and then failed
            # later with an unrelated TypeError while comparing offsets to None.
            raise ValueError(
                f"chunk {i} (sample {sample_index}): answer_end is None"
            )

        # CANNOTANSWER questions are encoded upstream with answer_end == 0.
        if answers["answer_end"] == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"]
        end_char = answers["answer_end"]

        # First token of the context (sequence id 1) in this chunk.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # Last token of the context in this chunk.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # The answer must lie completely inside this chunk's context window.
        inside_chunk = (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char)
        if not inside_chunk:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance past the last token starting at or before start_char, then step back one.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk back past every token ending at or after end_char, then step forward one.
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    return start_positions, end_positions

def chunking_encode(examples):
    """Batched ``map`` function: tokenize question/context pairs into overlapping chunks.

    Contexts longer than ``max_length`` are split into several features that
    overlap by ``doc_stride`` (only the context, the second sequence, is
    truncated). The per-example columns are repeated so that every chunk
    carries the metadata of the example it came from, and each chunk is
    labelled with ``start_positions`` / ``end_positions``.

    Args:
        examples: batch dict with the columns ``questions``, ``docs``,
            ``answers``, ``ids``, ``is_impossible``, ``yesno``, ``followups``
            and ``answer_candidates``.

    Returns:
        The tokenizer encoding extended with the repeated columns and the two
        position columns.
    """
    encoding = tokenizer(
        examples["questions"],
        examples["docs"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    # For every produced chunk, the index of the example it was cut from.
    overflow_mapping = np.array(encoding['overflow_to_sample_mapping'])

    for column in ['answers', 'ids', 'is_impossible', 'yesno', 'followups', 'answer_candidates']:
        encoding.update({column: np.array(examples[column])[overflow_mapping]})

    overflow_answers = np.array(examples['answers'])[overflow_mapping].tolist()

    # The original also called add_token_positions here, but its result was
    # overwritten on the next line; that dead call has been removed.
    start_positions, end_positions = add_token_positions_chunking(encoding, overflow_answers)
    encoding.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encoding

def synchronize(dataset, chunked):
    """Copy per-example columns from ``dataset`` onto the chunked dataset.

    For each of ``answers``, ``ids``, ``is_impossible``, ``yesno`` and
    ``followups``, the column of ``dataset`` is re-indexed with
    ``chunked['overflow_to_sample_mapping']`` and attached to ``chunked`` via
    ``add_column``.

    Returns:
        The object returned by the last ``add_column`` call.
    """
    carried_columns = ('answers', 'ids', 'is_impossible', 'yesno', 'followups')
    for column in carried_columns:
        # Source-example index of every chunk, re-read from the current object.
        sample_index = np.array(chunked['overflow_to_sample_mapping'])
        repeated = np.array(dataset[column])[sample_index]
        chunked = chunked.add_column(column, repeated)
    return chunked


In [4]:
# Paths of the preprocessed QuAC splits (despite the `_dir` suffix these are files).
train_dir = 'data/quac_train.json' 
val_dir = 'data/quac_dev.json' 
# Build the DatasetDict with the 'train' and 'validation' splits.
dataset = load_process_data(train_dir, val_dir)

In [5]:
# Module-level tokenizer: chunking_encode and add_token_positions_chunking read
# this global, so this cell must run before the mapping cell.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', cache_dir="bert_base/")
# Alternative long-context tokenizer, kept for reference:
#tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', cache_dir="longformer/")

In [6]:
# Tokenize and chunk both splits. The original columns are removed because one
# example can produce several chunks; chunking_encode re-attaches them per chunk.
train =  dataset["train"].map(chunking_encode, batched=True, remove_columns=dataset["train"].column_names)
val =  dataset["validation"].map(chunking_encode, batched=True, remove_columns=dataset["validation"].column_names)

  0%|          | 0/84 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [7]:
def prepare_model(trainSet,valSet,tokenizer):
    """Build a ``Trainer`` that fine-tunes ``bert-base-uncased`` for question answering.

    Args:
        trainSet: dataset passed to the trainer as ``train_dataset``.
        valSet: dataset passed to the trainer as ``eval_dataset``.
        tokenizer: tokenizer handed to the padding collator and to the trainer.

    Returns:
        The configured ``Trainer``; training is not started here.
    """

    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', cache_dir="bert_base/")
    #model = LongformerForQuestionAnswering.from_pretrained('allenai/longformer-base-4096', cache_dir="longformer/")
    # Moves the model to the GPU; this call fails on a machine without CUDA.
    model.cuda()

    # Pads every batch dynamically using the tokenizer's padding settings.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print('preparing model...')

    # NOTE(review): the output directory and run name say "cased" although the
    # checkpoint loaded above is 'bert-base-uncased' — confirm the intended label.
    # NOTE(review): eval_steps=10 evaluates the whole validation set every 10
    # optimizer steps, and early_stopping_patience=3 is presumably counted in
    # evaluations — confirm this cadence is intended.
    args = TrainingArguments(
        "bert-base-cased-finetuned-quac",
        evaluation_strategy = "steps",
        eval_steps=10,
        save_strategy = "steps",
        learning_rate=2e-5,
        adafactor=True,
        per_device_train_batch_size=32,
        #gradient_accumulation_steps=4,
        per_device_eval_batch_size=20,
        logging_steps = 50,
        num_train_epochs=50,
        group_by_length=True,
        weight_decay=0.01,
        fp16=True,
        #warmup_ratio=0.02,
        save_total_limit = 3,
        load_best_model_at_end=True,
        report_to="wandb",
        run_name="bert-base-cased-finetuned-quac",
      )

    trainer_quac = Trainer(
        model=model,
        args=args,
        train_dataset=trainSet,
        eval_dataset=valSet,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
      )

    print('ready to train!')
    return trainer_quac

In [8]:
# Build the trainer from the chunked splits and start fine-tuning.
trainer_quac = prepare_model(train,val,tokenizer)
trainer_quac.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

preparing model...


Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping. If yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.


ready to train!


***** Running training *****
  Num examples = 123459
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 192950
  Number of trainable parameters = 108893186
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtan3[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/192950 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping. If yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11909
  Batch size = 20


  0%|          | 0/596 [00:00<?, ?it/s]

{'eval_loss': 4.989104747772217, 'eval_runtime': 40.1085, 'eval_samples_per_second': 296.92, 'eval_steps_per_second': 14.86, 'epoch': 0.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping. If yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11909
  Batch size = 20


  0%|          | 0/596 [00:00<?, ?it/s]

{'eval_loss': 3.8072826862335205, 'eval_runtime': 39.9033, 'eval_samples_per_second': 298.447, 'eval_steps_per_second': 14.936, 'epoch': 0.01}


The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping. If yesno, followups, is_impossible, answers, offset_mapping, ids, answer_candidates, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11909
  Batch size = 20


  0%|          | 0/596 [00:00<?, ?it/s]