In [1]:
import os
import json
import timeit
import random
import requests
from tqdm import tqdm

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from IPython.display import display, HTML

## Download the data

In [2]:
os.makedirs('./data/', exist_ok=True)

In [59]:
URL = 'https://raw.githubusercontent.com/Popescu-PfeifferMarc/ir-bioasq/refs/heads/master/dataset/training12b_new.json?token=GHSAT0AAAAAAC2O7C4H2NXRKDBW252PUUMKZ2CHVLA'
DATA_PATH = './data/training12B.json'

In [60]:
res = requests.get(URL)
try:
  data_dict = res.json()
except json.JSONDecodeError as e:
  print("Error parsing JSON: ", e)
  raise

with open(DATA_PATH, 'wb') as f:
  f.write(res.content)

with open(DATA_PATH, 'rb') as f:
  data_dict = json.load(f)

## Reading the data

In [61]:
data_dict.keys()

dict_keys(['questions'])

In [62]:
questions = []
answers = []
context = []

for question in data_dict['questions']:
    questions.append(question['body'])

    if question['snippets']:
        longest_entry = max(question['snippets'], key=lambda x: len(x["text"]))
        context.append(longest_entry['text'])

        answers.append(dict(
            text=question.get('ideal_answer', [None])[0],
            start_idx=longest_entry['offsetInBeginSection'],
            end_idx=longest_entry['offsetInEndSection']))

In [63]:
# questions = []
# answers = []
# context = []

# for q in data_dict['questions']:
#   questions.append(q['body'])
#   context.append(q['snippets'][0]['text'])

#   answers.append(dict(text=q['ideal_answer'][0],
#                       start_idx=q['snippets'][0]['offsetInBeginSection'],
#                       end_idx= q['snippets'][0]['offsetInEndSection']))

In [64]:
len(questions), len(answers), len(context)

(5049, 5049, 5049)

In [65]:
train_questions, val_questions, train_answers, val_answers, train_context, val_context = train_test_split(questions,
                                                                                                          answers,
                                                                                                          context,
                                                                                                          test_size=0.2,
                                                                                                          random_state=42)

In [66]:
test_questions, val_questions, test_answers, val_answers, test_context, val_context = train_test_split(val_questions,
                                                                                                       val_answers,
                                                                                                       val_context,
                                                                                                       test_size=0.5,
                                                                                                       random_state=42)

In [67]:
len(train_questions), len(val_questions), len(test_questions)

(4039, 505, 505)

In [68]:
len(train_answers), len(val_answers), len(test_answers)

(4039, 505, 505)

In [69]:
len(train_context), len(val_context), len(test_context)

(4039, 505, 505)

In [70]:
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

MODEL_NAME = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [71]:
train_encodings = tokenizer(train_context, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_context, val_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_context, test_questions, truncation=True, padding=True)

In [72]:
def add_token_positions(encodings, answers):
  # initialize lists to contain the token indices of answer start/end
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    # append start/end token position using char_to_token method
    # Check if start_idx is non-negative before calling char_to_token
    start_idx = answers[i]['start_idx']
    if start_idx >= 0:
      start_positions.append(encodings.char_to_token(i, start_idx))
    else:
      # Handle negative start_idx, e.g., set to 0 or skip
      start_positions.append(0)  # or None, depending on your logic

    # Check if end_idx is non-negative before calling char_to_token
    end_idx = answers[i]['end_idx']
    if end_idx >= 0:
      end_positions.append(encodings.char_to_token(i, end_idx))
    else:
      # Handle negative end_idx, e.g., set to 0 or skip
      end_positions.append(0)  # or None, depending on your logic

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    # end position cannot be found, char_to_token found space, so shift one token forward
    go_back = 1
    while end_positions[-1] is None:
      end_pos_idx = answers[i]['end_idx'] - go_back
      # Check if end_pos_idx is non-negative before calling char_to_token
      if end_pos_idx >= 0:
        end_positions[-1] = encodings.char_to_token(i, end_pos_idx)
      else:
        # Handle negative end_pos_idx, e.g., set to 0 or break
        end_positions[-1] = 0  # or break the loop
        break
      go_back += 1
  # update our encodings object with the new token-based start/end positions
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [73]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
add_token_positions(test_encodings, test_answers)

In [74]:
class BioASQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = BioASQDataset(train_encodings)
val_dataset = BioASQDataset(val_encodings)
test_dataset = BioASQDataset(test_encodings)

In [75]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [76]:
# !rm -rf /content/drive/MyDrive/models/deepset

In [78]:
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
EPOCHS = 3
MODEL_SAVE_PATH = f"/content/drive/MyDrive/models/{MODEL_NAME}-lr{LEARNING_RATE}-epoch{EPOCHS}-v2/"

In [79]:
transformers.utils.logging.set_verbosity_error()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [80]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

print(model.num_parameters())

124056578


In [81]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

In [82]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

start = timeit.default_timer()
for epoch in range(EPOCHS):
  model.train()
  train_running_loss = 0
  for idx, sample in enumerate(tqdm(train_dataloader, leave=True)):
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)
    start_positions = sample['start_positions'].to(device)
    end_positions = sample['end_positions'].to(device)
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions)

    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_running_loss += loss.item()

  train_loss = train_running_loss / (idx + 1)

  model.eval()
  val_running_loss = 0
  with torch.inference_mode():
    for idx, sample in enumerate(tqdm(val_dataloader)):
      input_ids = sample['input_ids'].to(device)
      attention_mask = sample['attention_mask'].to(device)
      start_positions = sample['start_positions'].to(device)
      end_positions = sample['end_positions'].to(device)
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      start_positions=start_positions,
                      end_positions=end_positions)

      val_running_loss += outputs.loss.item()
    val_loss = val_running_loss / (idx + 1)

  print("-"*30)
  print(f"EPOCH: {epoch+1:02d} | Train Loss: {train_loss:.4f}")
  print(f"EPOCH: {epoch+1:02d} | Valid Loss: {val_loss:.4f}")
  print("-"*30)
  stop = timeit.default_timer()
  print(f"Training Time: {stop-start:.2f}s")

  model.save_pretrained(MODEL_SAVE_PATH)
  tokenizer.save_pretrained(MODEL_SAVE_PATH)

  torch.cuda.empty_cache()

100%|██████████| 253/253 [06:34<00:00,  1.56s/it]
100%|██████████| 32/32 [00:13<00:00,  2.38it/s]


------------------------------
EPOCH: 01 | Train Loss: 1.4969
EPOCH: 01 | Valid Loss: 1.1446
------------------------------
Training Time: 408.44s


100%|██████████| 253/253 [06:33<00:00,  1.56s/it]
100%|██████████| 32/32 [00:13<00:00,  2.39it/s]


------------------------------
EPOCH: 02 | Train Loss: 1.2079
EPOCH: 02 | Valid Loss: 1.1976
------------------------------
Training Time: 818.79s


100%|██████████| 253/253 [06:33<00:00,  1.56s/it]
100%|██████████| 32/32 [00:13<00:00,  2.39it/s]


------------------------------
EPOCH: 03 | Train Loss: 1.0674
EPOCH: 03 | Valid Loss: 1.1376
------------------------------
Training Time: 1229.30s


In [28]:
preds = []
true = []
running_accuracy = []

model.eval()
with torch.inference_mode():
  for idx, sample in enumerate(tqdm(test_dataloader, leave=True)):
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)
    start_positions = sample['start_positions']
    end_positions = sample['end_positions']

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1).cpu().detach()
    end_pred = torch.argmax(outputs['end_logits'], dim=1).cpu().detach()

    preds.extend([[int(i), int(j)] for i, j in zip(start_pred, end_pred)])
    true.extend([[int(i), int(j)] for i, j in zip(start_positions, end_positions)])

    running_accuracy.append(((start_pred == start_positions).sum()/len(start_positions)).item())
    running_accuracy.append(((end_pred == end_positions).sum()/len(end_positions)).item())

preds = [item for sublist in preds for item in sublist]
true = [item for sublist in true for item in sublist]

accuracy = sum(running_accuracy)/len(running_accuracy) # average accuracy
f1_value = f1_score(true, preds, average="macro")
print(f"\nAccuracy: {accuracy*100:.2f}% | F1 Score: {f1_value*100:.2f}%")

100%|██████████| 32/32 [00:10<00:00,  3.14it/s]


Accuracy: 69.01% | F1 Score: 79.90%





In [29]:
def model_inference(question, context, model_path=MODEL_SAVE_PATH):
  model = AutoModelForQuestionAnswering.from_pretrained(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  start = timeit.default_timer()
  qa_model = pipeline(task="question-answering", model=model, tokenizer=tokenizer)
  stop = timeit.default_timer()
  print(f"Inference Time: {stop-start:.2f}s")
  return qa_model(question=question, context=context)

In [30]:
def display_result(context, result):
  before_text = context[:result['start']]
  colored_text = context[result['start']:result['end']+1]
  after_text = context[result['end']+1:]
  # print(f"Answer: {result['answer']}")
  display(HTML(f"""<p style='font-size: 16px; width: 50%;'>{before_text}
    <span style='background-color: #33447f; color: white; width: {len(result["answer"])}em;'>{colored_text}</span>
    {after_text}</p>"""))

In [56]:
random_idx = random.randint(0, len(test_questions)-1)
question = test_questions[random_idx]
context = test_context[random_idx]
print(question)
result = model_inference(question, context)
result

What is Jackhammer esophagus?
Inference Time: 0.00s


{'score': 0.009454484097659588,
 'start': 122,
 'end': 134,
 'answer': 'peristalsis.'}

In [57]:
display_result(context, result)