In [None]:
import os
import requests
import json
# Create the relative 'data' directory that the download and load cells use.
# (The original tested for 'data' but created '/data' at the filesystem root.)
# exist_ok=True makes re-running this cell a no-op.
os.makedirs('data', exist_ok=True)

In [None]:
# Base URL of the SQuAD dataset files; file names are appended to it.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
# The training file is fetched (and actually written to disk) by the download
# loop in the next cell, so the extra request that used to live here - whose
# response was never read - has been dropped.

In [None]:
# Download the SQuAD v2.0 train and dev files into the local data/ directory.
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # stream=True lets iter_content pull the body piece by piece instead of
    # buffering the whole response in memory first
    res = requests.get(f'{url}{file}', stream=True)
    # fail loudly on an HTTP error instead of saving an error page as JSON
    res.raise_for_status()
    # write to file
    with open(f'data/{file}', 'wb') as f:
        # the original used 4-byte chunks, i.e. one write call per four bytes
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)

---

# Data Prep

In [None]:
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: path of a JSON file laid out as
            ``data -> paragraphs -> (context, qas -> (question, answers))``.

    Returns:
        ``(contexts, questions, answers)``: equal-length lists holding one
        entry per answer. ``answers`` contains the answer dicts exactly as
        loaded from the file, so later in-place edits are visible through it.
    """
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # walk every article -> paragraph -> question/answer entry
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # entries that carry 'plausible_answers' keep their spans
                # under that key; all others use 'answers'
                if 'plausible_answers' in qa:
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                # read from the key selected above (the original always read
                # qa['answers'], which left `access` unused)
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [None]:
# Flatten both downloaded splits into parallel (contexts, questions, answers) lists.
train_contexts, train_questions, train_answers = read_squad('data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('data/dev-v2.0.json')

# Adding the answer end index for BERT tokenization

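Here we want to find the end character index of each answer. Ideally it is simply the answer start plus the length of the gold answer text, so that `context[answer_start:answer_end]` equals the answer text. However, this is not always the case, because sometimes the annotated answer start is off by 1-2 indices.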

In [None]:
def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character index to every answer dict, in place.

    Args:
        answers: list of dicts with ``'text'`` and ``'answer_start'`` keys.
        contexts: list of context strings, parallel to ``answers``.

    For each pair the nominal end is ``answer_start + len(text)``. If the
    context does not contain the text at that span, the span shifted one or
    two characters to the left is tried and ``'answer_start'`` is corrected
    when it matches. If nothing matches, the nominal end is stored so that
    ``'answer_end'`` is always present afterwards.
    """
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
            continue

        for n in (1, 2):
            shifted_start = start_idx - n
            # a negative start would make the slice wrap around from the end
            if shifted_start >= 0 and context[shifted_start:end_idx - n] == gold_text:
                # this means the answer is off by 'n' characters
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                break
        else:
            # no alignment found: keep the nominal span so that downstream
            # code reading answer['answer_end'] does not hit a KeyError
            answer['answer_end'] = end_idx

In [None]:
# Add the 'answer_end' index to the answers of the train and validation sets.
# add_end_idx edits the answer dicts in place, so nothing is returned.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

# **Tokenize/Encode**

In [None]:
from transformers import DistilBertTokenizerFast #(smaller faster version of bert)
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode every (context, question) pair as one input sequence for the model.
# The two texts are joined by the tokenizer's separator token (not a pad token),
# and truncation/padding are requested so the encoded rows can be batched together.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


BERT accepts at most 512 tokens per sample, so longer inputs are truncated and shorter ones are padded.

In [None]:
def add_token_positions(encodings, answers, rng=None):
    """Convert character-level answer spans into token-level positions.

    Adds ``'start_positions'`` and ``'end_positions'`` lists (one entry per
    answer) to ``encodings`` in place.

    Args:
        encodings: tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and a dict-style ``update``.
        answers: list of dicts with ``'answer_start'`` and ``'answer_end'``
            character indices, parallel to the rows of ``encodings``.
        rng: unused; kept (now optional) so existing three-argument calls
            keep working.

    A position that cannot be mapped to a token is stored as
    ``tokenizer.model_max_length``, the sentinel this notebook uses for
    answers that were truncated away.
    """
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    # sentinel for positions that have no token in the encoded sequence
    truncated = tokenizer.model_max_length

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        start_token = encodings.char_to_token(i, start_char)
        end_token = encodings.char_to_token(i, end_char)

        # answer_end may land on a character with no token (e.g. a space), so
        # step back one character at a time. The walk stops at the answer
        # start (and never goes below 0): the original kept walking left,
        # which assigned a fully truncated answer the last surviving token
        # and could probe negative offsets.
        probe = end_char - 1
        while end_token is None and probe >= start_char and probe >= 0:
            end_token = encodings.char_to_token(i, probe)
            probe -= 1

        start_positions.append(truncated if start_token is None else start_token)
        end_positions.append(truncated if end_token is None else end_token)

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
# NOTE(review): the third argument is never read inside add_token_positions as
# written, so 5000 / 3000 do not limit how many samples are processed.
add_token_positions(train_encodings, train_answers,5000) ##change this back to len(train_answer) if you use full dataset
add_token_positions(val_encodings, val_answers, 3000)

In [None]:
#we want to create a pytorch dataset object
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings as tensors.

    ``encodings`` must support ``.items()`` (field name -> per-sample values)
    and expose an ``input_ids`` attribute, whose length is the dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # one sample per encoded input row
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # build one sample: every field's idx-th row, converted to a tensor
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

# Wrap the train and validation encodings so a DataLoader can batch them.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

---

# Fine Tuning

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm #Traning bar


In [None]:
from transformers import DistilBertForQuestionAnswering
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (a hard-coded 'cuda' fails once tensors are moved).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from transformers import DistilBertForQuestionAnswering
# Prefer the GPU, but fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Resume from the checkpoint this notebook saves under 'model1' when it exists.
# On a fresh run that directory has not been written yet, so start from the
# base 'distilbert-base-uncased' weights instead of failing.
checkpoint = 'model1' if os.path.isdir('model1') else 'distilbert-base-uncased'
modelp1 = DistilBertForQuestionAnswering.from_pretrained(checkpoint)
modelp1.to(device)
# put the model in training mode for fine-tuning
modelp1.train()
optim = AdamW(modelp1.parameters(), lr=5e-5)



In [None]:
# Serve the training samples in shuffled mini-batches of 20.
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)

In [None]:
# Fine-tune modelp1 for six passes over train_loader.
for epoch in range(6):
    loop = tqdm(train_loader)
    for batch in loop:
        # gradients accumulate across backward() calls, so clear them first
        optim.zero_grad()

        # move the tensors the model consumes onto the training device
        input_ids, attention_mask, start_positions, end_positions = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        )

        # the gold positions are passed along with the inputs; the loss is
        # then read from the first element of the output
        outputs = modelp1(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs[0]

        # back-propagate, then apply the parameter update
        loss.backward()
        optim.step()

        # show the epoch and the current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0:   1%|          | 36/5000 [00:02<06:00, 13.76it/s, loss=3.82]


KeyboardInterrupt: ignored

In [None]:
# Save the fine-tuned model and the tokenizer files into the same directory.
model_path = 'model1'
modelp1.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('model1/tokenizer_config.json',
 'model1/special_tokens_map.json',
 'model1/vocab.txt',
 'model1/added_tokens.json',
 'model1/tokenizer.json')

# To test BERT model (Optional Step)

In [None]:
# Evaluate modelp1 on the validation set: the share of predicted start and
# end token positions that exactly match the gold positions.
modelp1.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

# per-batch accuracies: one entry for the starts and one for the ends
acc = []

loop = tqdm(val_loader)
for batch in loop:
    # inference only, so no gradient tracking is needed
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = modelp1(input_ids, attention_mask=attention_mask)

        # most likely position along dim 1 for each sample
        start_pred = outputs['start_logits'].argmax(dim=1)
        end_pred = outputs['end_logits'].argmax(dim=1)

        # accuracy of this batch, starts first and ends second
        for pred, true in ((start_pred, start_true), (end_pred, end_true)):
            acc.append(((pred == true).sum()/len(pred)).item())

# mean of the per-batch accuracies
acc = sum(acc)/len(acc)
print(acc)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
# Rebuild the training loader with batch_size=1 for the LSTM/GRU cells.
# NOTE(review): this rebinds train_loader, replacing the batch_size=20 loader
# defined earlier. The recurrent cells feed the logits through unsqueeze(0),
# which is presumably why only batch_size=1 works there - TODO confirm.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
def cls_pooling(model_output):
    """Return a list holding ``item[0][:, 0]`` for every item of ``model_output``."""
    return [item[0][:, 0] for item in model_output]



# Using LSTM

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from transformers import BertForQuestionAnswering

from transformers import DistilBertForQuestionAnswering

class BERTLSTMModel(nn.Module):
    """Bidirectional LSTM followed by two linear heads (start and end).

    Args:
        input_len: feature size of every time step fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_labels: output size of each linear head.

    Every layer is moved to the module-level ``device`` on construction.
    """

    def __init__(self, input_len, hidden_size, num_labels):
        super().__init__()
        recurrent = nn.LSTM(
            input_size=input_len,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.lstm = recurrent.to(device)
        # both directions are concatenated, so the heads see 2 * hidden_size
        head_width = hidden_size * 2
        self.fc_start = nn.Linear(head_width, num_labels).to(device)
        self.fc_end = nn.Linear(head_width, num_labels).to(device)

    def forward(self, end_logit):
        """Run the LSTM and return ``(logits_start, logits_end)``.

        Only the LSTM output at the last time step feeds the two heads.
        """
        sequence_states, _ = self.lstm(end_logit)
        final_step = sequence_states[:, -1, :]
        return self.fc_start(final_step), self.fc_end(final_step)

# Train the LSTM head on top of the end logits of the fine-tuned DistilBERT.
#
# NOTE: this cell no longer rebinds the global `tokenizer` to a BertTokenizer.
# Nothing here used it, and the rebinding replaced the DistilBERT tokenizer
# that the data-preparation and checkpoint-saving cells rely on.

# Number of token positions: LSTM input width and output classes of each head.
max_positions = 512

# Feature extractor. Load the checkpoint this notebook writes ('model1'); the
# original pointed at 'modelp1', a directory this notebook never saves to.
distilmodel = DistilBertForQuestionAnswering.from_pretrained('model1')
distilmodel.to(device)
# only the recurrent head is optimised, so keep DistilBERT in inference mode
distilmodel.eval()

model = BERTLSTMModel(input_len=max_positions, hidden_size=5, num_labels=max_positions)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
model.train()

for epoch in range(1):
    loop = tqdm(train_loader)
    total_loss = 0
    trained_steps = 0
    for batch in loop:
        # gradients accumulate across backward() calls, so reset them first
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # add_token_positions marks truncated answers with an index equal to
        # the model's maximum length. That is not a valid class for a
        # max_positions-way CrossEntropyLoss and would raise, so skip them.
        if (start_positions >= max_positions).any() or (end_positions >= max_positions).any():
            continue

        # DistilBERT's weights are not in the optimizer: skip the autograd
        # graph, and skip its own loss by not passing the gold positions
        with torch.no_grad():
            outputs = distilmodel(input_ids, attention_mask=attention_mask)
        end_logits = outputs.end_logits

        # unsqueeze(0) presents the logits as a sequence of length one
        logits_start, logits_end = model(end_logits.unsqueeze(0))

        # total loss is the sum of the start and end losses
        loss_start = criterion(logits_start, start_positions)
        loss_end = criterion(logits_end, end_positions)
        loss = loss_start + loss_end

        # backpropagation and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        trained_steps += 1

    # average over the batches that contributed a loss
    average_loss = total_loss / max(trained_steps, 1)
    print(f"Loss: {average_loss}")

# model = BERTLSTMModel(hidden_size=128, num_labels=768)  # Replace num_labels with the actual number of classes
# logits = model(train_loader['input_ids'], train_loader['attention_mask'])
# print(logits)

 10%|█         | 520/5000 [00:30<04:22, 17.07it/s]


RuntimeError: ignored

# Using GRU

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from transformers import BertForQuestionAnswering

from transformers import DistilBertForQuestionAnswering

class BERTGRUModel(nn.Module):
    """Bidirectional GRU followed by two linear heads (start and end).

    Args:
        input_len: feature size of every time step fed to the GRU.
        hidden_size: GRU hidden size per direction.
        num_labels: output size of each linear head.

    Every layer is moved to the module-level ``device`` on construction. The
    recurrent layer is stored under the attribute name ``lstm`` even though
    it is a GRU; the name is kept so parameter names stay unchanged.
    """

    def __init__(self, input_len, hidden_size, num_labels):
        super().__init__()
        recurrent = nn.GRU(
            input_size=input_len,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.lstm = recurrent.to(device)
        # both directions are concatenated, so the heads see 2 * hidden_size
        head_width = hidden_size * 2
        self.fc_start = nn.Linear(head_width, num_labels).to(device)
        self.fc_end = nn.Linear(head_width, num_labels).to(device)

    def forward(self, end_logit):
        """Run the GRU and return ``(logits_start, logits_end)``.

        Only the GRU output at the last time step feeds the two heads.
        """
        sequence_states, _ = self.lstm(end_logit)
        final_step = sequence_states[:, -1, :]
        return self.fc_start(final_step), self.fc_end(final_step)

# Train the GRU head on top of the end logits of the fine-tuned DistilBERT.
#
# NOTE: this cell no longer rebinds the global `tokenizer` to a BertTokenizer.
# Nothing here used it, and the rebinding replaced the DistilBERT tokenizer
# that the data-preparation and checkpoint-saving cells rely on.

# Number of token positions: GRU input width and output classes of each head.
max_positions = 512

# Feature extractor. Load the checkpoint this notebook writes ('model1'); the
# original pointed at 'modelp1', a directory this notebook never saves to.
distilmodel = DistilBertForQuestionAnswering.from_pretrained('model1')
distilmodel.to(device)
# only the recurrent head is optimised, so keep DistilBERT in inference mode
distilmodel.eval()

model = BERTGRUModel(input_len=max_positions, hidden_size=50, num_labels=max_positions)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
model.train()

for epoch in range(1):
    loop = tqdm(train_loader)
    total_loss = 0
    trained_steps = 0
    for batch in loop:
        # gradients accumulate across backward() calls, so reset them first
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # add_token_positions marks truncated answers with an index equal to
        # the model's maximum length. That is not a valid class for a
        # max_positions-way CrossEntropyLoss and would raise, so skip them.
        if (start_positions >= max_positions).any() or (end_positions >= max_positions).any():
            continue

        # DistilBERT's weights are not in the optimizer: skip the autograd
        # graph, and skip its own loss by not passing the gold positions
        with torch.no_grad():
            outputs = distilmodel(input_ids, attention_mask=attention_mask)
        end_logits = outputs.end_logits

        # unsqueeze(0) presents the logits as a sequence of length one
        logits_start, logits_end = model(end_logits.unsqueeze(0))

        # total loss is the sum of the start and end losses
        loss_start = criterion(logits_start, start_positions)
        loss_end = criterion(logits_end, end_positions)
        loss = loss_start + loss_end

        # backpropagation and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        trained_steps += 1

    # average over the batches that contributed a loss
    average_loss = total_loss / max(trained_steps, 1)
    print(f"Loss: {average_loss}")

# model = BERTLSTMModel(hidden_size=128, num_labels=768)  # Replace num_labels with the actual number of classes
# logits = model(train_loader['input_ids'], train_loader['attention_mask'])
# print(logits)

In [None]:
# Evaluate the recurrent head: accuracy of the predicted start position on
# the validation set.
# switch both networks out of training mode
model.eval()
distilmodel.eval()

val_loader = DataLoader(val_dataset, batch_size=1)

# per-batch start-position accuracies
acc = []

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # Use the network the head was trained on (distilmodel, defined in
        # the same cell as `model`) and request logits only. The original
        # called modelp1 and passed start_positions/end_positions, which are
        # stale tensors left over from the last training batch.
        outputs = distilmodel(input_ids, attention_mask=attention_mask)

        end_logits = outputs.end_logits
        logits_start, logits_end = model(end_logits.unsqueeze(0))
        # pull preds out
        start_pred = torch.argmax(logits_start, dim=1)
        # share of correct start positions in this batch
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())

# calculate average accuracy in total
acc = sum(acc)/len(acc)

print(acc)


100%|██████████| 1269/1269 [06:02<00:00,  3.50it/s]

0.6432335641326117





In [None]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()

In [None]:
# Record the installed package versions of this runtime for reproducibility.
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.8.6
aiosignal                        1.3.1
alabaster                        0.7.13
albumentations                   1.3.1
altair                           4.2.2
anyio                            3.7.1
appdirs                          1.4.4
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array-record                     0.5.0
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.0
attrs                            23.1.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.13.1
backcall                         0.2.0
beautifulsoup4                   4.11.2
bidict                           0.22.1
b