In [3]:
# importing the json file to read the data in order to finetune the model
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:  # this is done according to Squad dataset
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    #print(questions[:5])
    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('more_answers_squad_data.json')
#val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [2]:
from transformers import pipeline

pipe = pipeline("question-answering", model="twmkn9/distilbert-base-uncased-squad2")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_answers[:2]

[{'answer_id': 951969,
  'document_id': 1582518,
  'question_id': 1064369,
  'text': 'Being able to assist customers remotely, service technicians could support their customers at any time and from anywhere, without necessarily having to visit the customer site. Remote access could also help to better prepare service technicians with the necessary information for their tasks. Knowing about the machine or system error in more detail beforehand saves valuable time because required spare parts or other equipment can ordered, prepared and brought along. Moreover, due to a shorter reaction time, the customer�s machines are up and running much faster, saving them from costly downtime.\n \nFor a machine and equipment manufacturer, using remote access also means that the same number of service technicians could support more customers or offer additional services.',
  'answer_start': 0,
  'answer_end': 779,
  'answer_category': None},
 {'answer_id': 976983,
  'document_id': 1582518,
  'question

In [4]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
#add_end_idx(val_answers, val_contexts)

In [5]:
train_answers[:5]

[{'answer_id': 951969,
  'document_id': 1582518,
  'question_id': 1064369,
  'text': 'Being able to assist customers remotely, service technicians could support their customers at any time and from anywhere, without necessarily having to visit the customer site. Remote access could also help to better prepare service technicians with the necessary information for their tasks. Knowing about the machine or system error in more detail beforehand saves valuable time because required spare parts or other equipment can ordered, prepared and brought along. Moreover, due to a shorter reaction time, the customer�s machines are up and running much faster, saving them from costly downtime.\n \nFor a machine and equipment manufacturer, using remote access also means that the same number of service technicians could support more customers or offer additional services.',
  'answer_start': 0,
  'answer_end': 779,
  'answer_category': None},
 {'answer_id': 976983,
  'document_id': 1582518,
  'question

In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("twmkn9/distilbert-base-uncased-squad2")


train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
#val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [5]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

In [7]:
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# import torch
# model_path = 'model/distilbert-custom'

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # Load the model
# model = AutoModelForQuestionAnswering.from_pretrained(model_path)
# train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

In [9]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [9]:
len(train_encodings['input_ids'])

163

In [10]:
train_encodings['input_ids'][0]

[101,
 2108,
 2583,
 2000,
 6509,
 6304,
 19512,
 1010,
 2326,
 20202,
 2071,
 2490,
 2037,
 6304,
 2012,
 2151,
 2051,
 1998,
 2013,
 5973,
 1010,
 2302,
 9352,
 2383,
 2000,
 3942,
 1996,
 8013,
 2609,
 1012,
 6556,
 3229,
 2071,
 2036,
 2393,
 2000,
 2488,
 7374,
 2326,
 20202,
 2007,
 1996,
 4072,
 2592,
 2005,
 2037,
 8518,
 1012,
 4209,
 2055,
 1996,
 3698,
 2030,
 2291,
 7561,
 1999,
 2062,
 6987,
 25828,
 13169,
 7070,
 2051,
 2138,
 3223,
 8622,
 3033,
 2030,
 2060,
 3941,
 2064,
 3641,
 1010,
 4810,
 1998,
 2716,
 2247,
 1012,
 9308,
 1010,
 2349,
 2000,
 1037,
 7820,
 4668,
 2051,
 1010,
 1996,
 6304,
 6681,
 2024,
 2039,
 1998,
 2770,
 2172,
 5514,
 1010,
 7494,
 2068,
 2013,
 17047,
 2091,
 7292,
 1012,
 2005,
 1037,
 3698,
 1998,
 3941,
 7751,
 1010,
 2478,
 6556,
 3229,
 2036,
 2965,
 2008,
 1996,
 2168,
 2193,
 1997,
 2326,
 20202,
 2071,
 2490,
 2062,
 6304,
 2030,
 3749,
 3176,
 2578,
 1012,
 1037,
 5851,
 6556,
 4434,
 2000,
 5500,
 10394,
 1998,
 3941,
 2003,
 2036,

In [10]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] being able to assist customers remotely, service technicians could support their customers at any time and from anywhere, without necessarily having to visit the customer site. remote access could also help to better prepare service technicians with the necessary information for their tasks. knowing about the machine or system error in more detail beforehand saves valuable time because required spare parts or other equipment can ordered, prepared and brought along. moreover, due to a shorter reaction time, the customers machines are up and running much faster, saving them from costly downtime. for a machine and equipment manufacturer, using remote access also means that the same number of service technicians could support more customers or offer additional services. a secure remote connection to distributed machinery and equipment is also the basis for many new concepts and services within industry 4. 0 such as predictive maintenance, where a secure connection is established to 

In [11]:
train_encodings.char_to_token(0, train_answers[0]['answer_start'])

1

In [7]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        go_back=1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - go_back)
            go_back+=1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
#add_token_positions(val_encodings, val_answers)

In [13]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [14]:
train_encodings['start_positions'][:10]

[1, 157, 181, 276, 40, 331, 1, 146, 1, 148]

In [8]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
#val_dataset = SquadDataset(val_encodings)

In [17]:
print(type(train_dataset))

<class '__main__.SquadDataset'>


In [18]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("twmkn9/distilbert-base-uncased-squad2")

In [9]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)



In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
model_path = 'model/distilbert-custom'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=1e-4)

for epoch in range(3):
    loop=tqdm(train_loader)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.eval()

Epoch 0: 100%|██████████| 11/11 [35:44<00:00, 194.95s/it, loss=1.4] 
Epoch 1: 100%|██████████| 11/11 [45:32<00:00, 248.38s/it, loss=0.603] 
Epoch 2: 100%|██████████| 11/11 [30:21<00:00, 165.62s/it, loss=0.0496]


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [14]:
model_path='model/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('model/distilbert-custom\\tokenizer_config.json',
 'model/distilbert-custom\\special_tokens_map.json',
 'model/distilbert-custom\\vocab.txt',
 'model/distilbert-custom\\added_tokens.json',
 'model/distilbert-custom\\tokenizer.json')