In [2]:
from pathlib import Path
import json
import random
from sklearn.model_selection import train_test_split
import pandas as pd

In [16]:

def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    The file is expected to have the nesting
    ``data -> paragraphs -> qas -> answers``. One entry is produced per
    *answer*, so a question with several answers appears several times and a
    question whose ``answers`` list is empty contributes nothing.

    Args:
        path: Location of the JSON file (str or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        ``answers`` holds the answer objects exactly as they were decoded
        from the file.
    """
    with open(Path(path), 'rb') as handle:
        squad = json.load(handle)

    # Collect (context, question, answer) rows first, then split into columns.
    rows = []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                rows.extend((passage_text, prompt, ans) for ans in qa['answers'])

    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

# Load every (context, question, answer) triple from the SQuAD v2.0 training file
# and put them in a DataFrame so the three columns are split together.
contexts, questions, answers = read_squad('train-v2.0.json')
data={"contexts":contexts,"ques":questions,"ans":answers}
df = pd.DataFrame.from_dict(data)

# Hold out 20% of the rows. random_state pins the shuffle so that a
# Restart & Run All reproduces exactly the same train/test partition.
# NOTE(review): rows are split per answer, so the same context passage can
# appear in both splits -- confirm that this is acceptable for evaluation.
train,test= train_test_split(df, test_size=0.2, random_state=42)

# Back to plain Python lists, which is what the tokenizer cell consumes.
train_contexts=train["contexts"].values.tolist()
train_questions=train["ques"].values.tolist()
train_answers=train["ans"].values.tolist()


test_contexts=test["contexts"].values.tolist()
test_questions=test["ques"].values.tolist()
test_answers=test["ans"].values.tolist()

In [17]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character index to every answer, in place.

    For each ``(answer, context)`` pair the answer text is looked up in the
    context at the annotated ``answer_start``. Because the annotation is
    sometimes off by a character or two, the positions one and two characters
    to the left are tried as well; on a shifted match ``answer_start`` is
    corrected too.

    Shifts that would move the start below 0 are skipped: a negative slice
    index would wrap around to the end of the context and could "match"
    there, writing negative indices into the answer.

    If no alignment matches, ``answer_start`` is left untouched and
    ``answer_end`` is still set to ``answer_start + len(text)`` so that later
    code can always rely on the key being present.

    Args:
        answers: Sequence of dicts with ``'text'`` and ``'answer_start'`` keys.
            Each dict is mutated.
        contexts: Sequence of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Try shift 0, 1, 2 -- but never further left than character 0.
        for shift in range(min(2, start_idx) + 1):
            if context[start_idx - shift:end_idx - shift] == gold_text:
                answer['answer_start'] = start_idx - shift
                answer['answer_end'] = end_idx - shift
                break
        else:
            # No alignment found: fall back to the annotated span.
            answer['answer_end'] = end_idx

# Annotate both splits: the held-out answers need 'answer_end' as well before
# they can be mapped to token positions for evaluation.
add_end_idx(train_answers, train_contexts)
add_end_idx(test_answers, test_contexts)

In [18]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode (context, question) pairs for the train split and then the test split
# with identical settings: truncation and padding are both enabled.
train_encodings, test_encodings = (
    tokenizer(split_contexts, split_questions, truncation=True, padding=True)
    for split_contexts, split_questions in (
        (train_contexts, train_questions),
        (test_contexts, test_questions),
    )
)

In [19]:
def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans into token-level labels, in place.

    For example ``i`` the start label is the token covering
    ``answers[i]['answer_start']`` and the end label is the token covering the
    *last character* of the answer. ``answer_end`` is exclusive, so the lookup
    starts at ``answer_end - 1``; starting at ``answer_end`` itself would
    select the following token whenever the answer is directly followed by
    e.g. punctuation.

    ``char_to_token`` returns ``None`` for characters that map to no token.
    For the end label the search walks backwards, but only within the answer
    span, so it always terminates and never picks up a token from before the
    answer. When no token is found (start or end), ``max_length`` is stored
    as the sentinel instead, as the original code did for the start label.

    Args:
        encodings: Batch encoding offering ``char_to_token(batch_index,
            char_index)`` and a dict-style ``update``. Receives the new
            ``'start_positions'`` and ``'end_positions'`` entries.
        answers: Sequence of dicts with ``'answer_start'`` and, normally,
            ``'answer_end'``. If ``'answer_end'`` is absent it is derived as
            ``answer_start + len(answer['text'])``.
        max_length: Sentinel stored for positions that have no token.
            Defaults to ``tokenizer.model_max_length`` of the module-level
            ``tokenizer``.

    Returns:
        None.
    """
    if max_length is None:
        # Original behaviour: read the sentinel from the notebook-level tokenizer.
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        answer_start = answer['answer_start']
        if 'answer_end' in answer:
            answer_end = answer['answer_end']
        else:
            answer_end = answer_start + len(answer['text'])

        # if start position is None, the answer passage has been truncated
        start_token = encodings.char_to_token(i, answer_start)
        if start_token is None:
            start_token = max_length

        # Walk back from the last answer character until a character that is
        # covered by a token is found; stop at the start of the answer.
        end_token = None
        for char_idx in range(answer_end - 1, max(answer_start, 0) - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break
        if end_token is None:
            end_token = max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach 'start_positions' / 'end_positions' to the training encodings
# (train_encodings is updated in place; nothing is returned).
add_token_positions(train_encodings, train_answers)

In [29]:
#train_dataset

<TensorSliceDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, {start_positions: (), end_positions: ()}), types: ({input_ids: tf.int32, attention_mask: tf.int32}, {start_positions: tf.int32, end_positions: tf.int32})>

# Keras did not work for me because there are some issues with the data input types

In [23]:

#from transformers import TFDistilBertForQuestionAnswering
#model1 = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_39', 'qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import tensorflow as tf

# Build a tf.data pipeline of (features, labels) pairs: the features dict
# carries the model inputs, the labels dict carries the answer-span targets.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict((name, train_encodings[name]) for name in ('input_ids', 'attention_mask')),
    dict((name, train_encodings[name]) for name in ('start_positions', 'end_positions')),
))


In [None]:
# Keras will expect a tuple when dealing with labels
train_dataset = train_dataset.map(lambda x, y: (x, (y['start_positions'], y['end_positions'])))

# Keras will assign a separate loss for each output and add them together. So we'll just use the standard CE loss
# instead of using the built-in model.compute_loss, which expects a dict of outputs and averages the two terms.
# Note that this means the loss will be 2x of when using TFTrainer since we're adding instead of averaging them.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# NOTE(review): model1 is only created by the TFDistilBertForQuestionAnswering
# cell, which is currently commented out -- re-enable it before running this cell.
model1.distilbert.return_dict = False  # if using 🤗 Transformers >3.02, make sure outputs are tuples

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model1.compile(optimizer=optimizer, loss=loss) # can also use any keras loss fn

# The dataset is already batched with .batch(16). Keras raises an error when
# batch_size is passed together with a tf.data.Dataset, so it is not given here.
model1.fit(train_dataset.shuffle(1000).batch(16), epochs=3)

# PyTorch worked

In [13]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained 'distilbert-base-uncased' checkpoint into the
# question-answering model class. As the warning printed below reports, the
# qa_outputs weights are newly initialised, so the model has to be trained
# before its predictions are meaningful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [31]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenized QA examples as tensors.

    ``encodings`` is any mapping from field name to a per-example sequence
    that supports ``.items()`` and key lookup and contains an
    ``'input_ids'`` entry. A plain ``dict`` of lists works as well as a
    tokenizer batch encoding.
    """

    def __init__(self, encodings):
        """Keep a reference to ``encodings``; no data is copied."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of field name -> ``torch.Tensor``."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the length of ``'input_ids'``."""
        # Key lookup (rather than attribute access) also works for plain dicts.
        return len(self.encodings['input_ids'])

# Build the PyTorch dataset for the training split. Only the training set is
# wrapped here; no dataset is created for the test/validation split.
# Note: this rebinds the name train_dataset, which earlier cells used for the
# tf.data pipeline.
train_dataset = SquadDataset(train_encodings)

In [32]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# AdamW over all model parameters with a learning rate of 5e-5.
optim = AdamW(model.parameters(), lr=5e-5)

# Shuffled mini-batches of 16 examples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # Training mode is (re-)enabled at the start of every epoch.
    model.train()
    # tqdm wraps the loader so each batch advances the progress bar.
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Clear the gradients left over from the previous step.
        optim.zero_grad()

        # Move the four tensors the model needs onto the chosen device.
        input_ids, attention_mask, start_positions, end_positions = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        )

        # Forward pass; because the span labels are supplied, the first
        # element of the output is the loss.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs[0]

        # Backpropagate and apply one optimizer step.
        loss.backward()
        optim.step()

        # Show the epoch number and the current batch loss on the progress bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


  0%|                                                                                         | 0/4341 [00:00<?, ?it/s][A
Epoch 0:   0%|                                                                                | 0/4341 [00:47<?, ?it/s][A
Epoch 0:   0%|                                                                     | 0/4341 [00:47<?, ?it/s, loss=6.07][A
Epoch 0:   0%|                                                          | 1/4341 [00:47<57:08:26, 47.40s/it, loss=6.07][A
Epoch 0:   0%|                                                          | 1/4341 [01:32<57:08:26, 47.40s/it, loss=6.07][A
Epoch 0:   0%|                                                          | 1/4341 [01:32<57:08:26, 47.40s/it, loss=5.95][A
Epoch 0:   0%|                                                          | 2/4341 [01:32<56:15:41, 46.68s/it, loss=5.95][A
Epoch 0:   0%|                                                          | 2/4341 [02:17<56:15:41, 46.68s/it, loss=5.95][A
Epoch 0:   0%| 

Epoch 0:   1%|▌                                                        | 44/4341 [30:17<48:03:39, 40.27s/it, loss=3.83][A
Epoch 0:   1%|▌                                                        | 44/4341 [31:05<48:03:39, 40.27s/it, loss=3.83][A
Epoch 0:   1%|▌                                                        | 44/4341 [31:05<48:03:39, 40.27s/it, loss=4.04][A
Epoch 0:   1%|▌                                                        | 45/4341 [31:05<50:48:36, 42.58s/it, loss=4.04][A
Epoch 0:   1%|▌                                                        | 45/4341 [31:48<50:48:36, 42.58s/it, loss=4.04][A
Epoch 0:   1%|▌                                                        | 45/4341 [31:48<50:48:36, 42.58s/it, loss=3.36][A
Epoch 0:   1%|▌                                                        | 46/4341 [31:48<51:07:58, 42.86s/it, loss=3.36][A
Epoch 0:   1%|▌                                                        | 46/4341 [43:11<51:07:58, 42.86s/it, loss=3.36][A
Epoch 0:   1%|▌ 

Epoch 0:   2%|█                                                     | 88/4341 [1:44:58<150:56:58, 127.77s/it, loss=3.8][A
Epoch 0:   2%|█                                                     | 88/4341 [1:45:39<150:56:58, 127.77s/it, loss=3.8][A
Epoch 0:   2%|█                                                     | 88/4341 [1:45:39<150:56:58, 127.77s/it, loss=3.6][A
Epoch 0:   2%|█                                                     | 89/4341 [1:45:39<120:18:54, 101.87s/it, loss=3.6][A
Epoch 0:   2%|█                                                     | 89/4341 [1:46:20<120:18:54, 101.87s/it, loss=3.6][A
Epoch 0:   2%|█                                                    | 89/4341 [1:46:20<120:18:54, 101.87s/it, loss=3.36][A
Epoch 0:   2%|█▏                                                     | 90/4341 [1:46:20<98:26:57, 83.37s/it, loss=3.36][A
Epoch 0:   2%|█▏                                                     | 90/4341 [1:47:02<98:26:57, 83.37s/it, loss=3.36][A
Epoch 0:   2%|█▏

Epoch 0:   3%|█▋                                                    | 132/4341 [2:16:38<48:44:37, 41.69s/it, loss=3.76][A
Epoch 0:   3%|█▋                                                    | 132/4341 [2:17:25<48:44:37, 41.69s/it, loss=3.76][A
Epoch 0:   3%|█▋                                                     | 132/4341 [2:17:25<48:44:37, 41.69s/it, loss=3.4][A
Epoch 0:   3%|█▋                                                     | 133/4341 [2:17:25<50:27:32, 43.17s/it, loss=3.4][A
Epoch 0:   3%|█▋                                                     | 133/4341 [2:18:09<50:27:32, 43.17s/it, loss=3.4][A
Epoch 0:   3%|█▋                                                    | 133/4341 [2:18:09<50:27:32, 43.17s/it, loss=3.48][A
Epoch 0:   3%|█▋                                                    | 134/4341 [2:18:09<50:44:58, 43.43s/it, loss=3.48][A
Epoch 0:   3%|█▋                                                    | 134/4341 [2:18:52<50:44:58, 43.43s/it, loss=3.48][A
Epoch 0:   3%|█▋

KeyboardInterrupt: 