In [3]:
import os
import glob
import json
import timeit
import random
import requests
import string, re
from tqdm import tqdm

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from IPython.display import display, HTML

## Download dataset

### Dataset reference: https://github.com/dmis-lab/biobert

In [16]:
!wget http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz

--2024-11-25 21:21:06--  http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz
Resolving nlp.dmis.korea.edu (nlp.dmis.korea.edu)... 163.152.163.168
Connecting to nlp.dmis.korea.edu (nlp.dmis.korea.edu)|163.152.163.168|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29610233 (28M) [application/x-gzip]
Saving to: ‘datasets.tar.gz’


2024-11-25 21:21:48 (724 KB/s) - ‘datasets.tar.gz’ saved [29610233/29610233]



In [17]:
!tar -xzvf datasets.tar.gz

datasets/
datasets/RE/
datasets/RE/GAD/
datasets/RE/GAD/6/
datasets/RE/GAD/6/test.tsv
datasets/RE/GAD/6/dev.tsv
datasets/RE/GAD/6/train.tsv
datasets/RE/GAD/7/
datasets/RE/GAD/7/test.tsv
datasets/RE/GAD/7/dev.tsv
datasets/RE/GAD/7/train.tsv
datasets/RE/GAD/5/
datasets/RE/GAD/5/test.tsv
datasets/RE/GAD/5/dev.tsv
datasets/RE/GAD/5/train.tsv
datasets/RE/GAD/8/
datasets/RE/GAD/8/test.tsv
datasets/RE/GAD/8/dev.tsv
datasets/RE/GAD/8/train.tsv
datasets/RE/GAD/4/
datasets/RE/GAD/4/test.tsv
datasets/RE/GAD/4/dev.tsv
datasets/RE/GAD/4/train.tsv
datasets/RE/GAD/1/
datasets/RE/GAD/1/test.tsv
datasets/RE/GAD/1/dev.tsv
datasets/RE/GAD/1/train.tsv
datasets/RE/GAD/2/
datasets/RE/GAD/2/test.tsv
datasets/RE/GAD/2/dev.tsv
datasets/RE/GAD/2/train.tsv
datasets/RE/GAD/3/
datasets/RE/GAD/3/test.tsv
datasets/RE/GAD/3/dev.tsv
datasets/RE/GAD/3/train.tsv
datasets/RE/GAD/9/
datasets/RE/GAD/9/test.tsv
datasets/RE/GAD/9/dev.tsv
datasets/RE/GAD/9/train.tsv
datasets/RE/GAD/10/
datasets/RE/GAD/10/test.tsv
datasets/RE/

In [18]:
!rm -rf datasets.tar.gz

In [19]:
# with open('/content/datasets/QA/BioASQ/BioASQ-train-factoid-4b.json') as f:
#   data = json.load(f)

In [20]:
# data.keys()

In [21]:
# data['data'][0].keys()

In [22]:
# data['data'][0]['paragraphs'][0]

In [23]:
def extract_data(file_path):

  questions_list = []
  context_list = []
  answers_list = []

  with open(file_path) as f:
    data = json.load(f)

  questions = data['data'][0]['paragraphs']

  for question in questions:
    context = question['context']
    for qna in question['qas']:
      question = qna['question']
      answers = qna['answers']
      for ans in answers:
        answer = ans['text']
        start_idx = ans['answer_start']
        end_idx = start_idx + len(answer)
        questions_list.append(question)
        context_list.append(context)
        answers_list.append(dict(text=answer, start_idx=start_idx, end_idx=end_idx))

  return questions_list, context_list, answers_list

In [24]:
for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-train-factoid-*'):
  print(path)

/content/datasets/QA/BioASQ/BioASQ-train-factoid-6b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-4b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-5b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json


In [25]:
questions = []
context = []
answers = []

for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-train-factoid-*'):
  questions, context, answers = extract_data(path)
  questions.extend(questions)
  context.extend(context)
  answers.extend(answers)

In [26]:
len(questions), len(context), len(answers)

(8462, 8462, 8462)

In [27]:
questions[0], context[0], answers[0]

('Name synonym of Acrokeratosis paraneoplastica.',
 'Acrokeratosis paraneoplastica (Bazex syndrome): report of a case associated with small cell lung carcinoma and review of the literature.',
 {'text': 'Bazex syndrome', 'start_idx': 31, 'end_idx': 45})

In [28]:
train_questions, val_questions, train_answers, val_answers, train_context, val_context = train_test_split(questions,
                                                                                                          answers,
                                                                                                          context,
                                                                                                          test_size=0.1,
                                                                                                          random_state=42)

In [29]:
len(train_questions), len(val_questions), len(train_answers), len(val_answers), len(train_context), len(val_context)

(7615, 847, 7615, 847, 7615, 847)

In [30]:
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

MODEL_NAME = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [32]:
train_encodings = tokenizer(train_context, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_context, val_questions, truncation=True, padding=True)
# test_encodings = tokenizer(test_context, test_questions, truncation=True, padding=True)

In [33]:
def add_token_positions(encodings, answers):
  # initialize lists to contain the token indices of answer start/end
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    # append start/end token position using char_to_token method
    # Check if start_idx is non-negative before calling char_to_token
    start_idx = answers[i]['start_idx']
    if start_idx >= 0:
      start_positions.append(encodings.char_to_token(i, start_idx))
    else:
      # Handle negative start_idx, e.g., set to 0 or skip
      start_positions.append(0)  # or None, depending on your logic

    # Check if end_idx is non-negative before calling char_to_token
    end_idx = answers[i]['end_idx']
    if end_idx >= 0:
      end_positions.append(encodings.char_to_token(i, end_idx))
    else:
      # Handle negative end_idx, e.g., set to 0 or skip
      end_positions.append(0)  # or None, depending on your logic

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    # end position cannot be found, char_to_token found space, so shift one token forward
    go_back = 1
    while end_positions[-1] is None:
      end_pos_idx = answers[i]['end_idx'] - go_back
      # Check if end_pos_idx is non-negative before calling char_to_token
      if end_pos_idx >= 0:
        end_positions[-1] = encodings.char_to_token(i, end_pos_idx)
      else:
        # Handle negative end_pos_idx, e.g., set to 0 or break
        end_positions[-1] = 0  # or break the loop
        break
      go_back += 1
  # update our encodings object with the new token-based start/end positions
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [34]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
# add_token_positions(test_encodings, test_answers)

In [35]:
class BioASQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = BioASQDataset(train_encodings)
val_dataset = BioASQDataset(val_encodings)
# test_dataset = BioASQDataset(test_encodings)

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# !rm -rf /content/drive/MyDrive/models/deepset

In [33]:
VERSION = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
EPOCHS = 5
MODEL_SAVE_PATH = f"/content/drive/MyDrive/models/{MODEL_NAME}-lr{LEARNING_RATE}-epoch{EPOCHS}-v{VERSION}/"

In [34]:
transformers.utils.logging.set_verbosity_error()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [35]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
print(model.num_parameters())

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

124056578


In [41]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

# test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                               batch_size=BATCH_SIZE,
#                                               shuffle=False)

In [45]:
device

device(type='cuda')

In [46]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

start = timeit.default_timer()
for epoch in range(EPOCHS):
  model.train()
  train_running_loss = 0
  for idx, sample in enumerate(tqdm(train_dataloader, leave=True)):
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)
    start_positions = sample['start_positions'].to(device)
    end_positions = sample['end_positions'].to(device)
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions)

    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_running_loss += loss.item()

  train_loss = train_running_loss / (idx + 1)

  model.eval()
  val_running_loss = 0
  with torch.inference_mode():
    for idx, sample in enumerate(tqdm(val_dataloader)):
      input_ids = sample['input_ids'].to(device)
      attention_mask = sample['attention_mask'].to(device)
      start_positions = sample['start_positions'].to(device)
      end_positions = sample['end_positions'].to(device)
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      start_positions=start_positions,
                      end_positions=end_positions)

      val_running_loss += outputs.loss.item()
    val_loss = val_running_loss / (idx + 1)

  print("-"*30)
  print(f"EPOCH: {epoch+1:02d} | Train Loss: {train_loss:.4f}")
  print(f"EPOCH: {epoch+1:02d} | Valid Loss: {val_loss:.4f}")
  print("-"*30)
  stop = timeit.default_timer()
  print(f"Training Time: {stop-start:.2f}s")

  model.save_pretrained(MODEL_SAVE_PATH)
  tokenizer.save_pretrained(MODEL_SAVE_PATH)

  torch.cuda.empty_cache()

100%|██████████| 537/537 [13:25<00:00,  1.50s/it]
100%|██████████| 60/60 [00:27<00:00,  2.20it/s]


------------------------------
EPOCH: 01 | Train Loss: 1.6552
EPOCH: 01 | Valid Loss: 1.1573
------------------------------
Training Time: 832.87s


100%|██████████| 537/537 [13:36<00:00,  1.52s/it]
100%|██████████| 60/60 [00:27<00:00,  2.21it/s]


------------------------------
EPOCH: 02 | Train Loss: 1.0782
EPOCH: 02 | Valid Loss: 1.0027
------------------------------
Training Time: 1680.22s


100%|██████████| 537/537 [13:36<00:00,  1.52s/it]
100%|██████████| 60/60 [00:27<00:00,  2.20it/s]


------------------------------
EPOCH: 03 | Train Loss: 0.9433
EPOCH: 03 | Valid Loss: 0.8851
------------------------------
Training Time: 2526.01s


100%|██████████| 537/537 [13:35<00:00,  1.52s/it]
100%|██████████| 60/60 [00:27<00:00,  2.21it/s]


------------------------------
EPOCH: 04 | Train Loss: 0.8578
EPOCH: 04 | Valid Loss: 0.8831
------------------------------
Training Time: 3372.41s


100%|██████████| 537/537 [13:36<00:00,  1.52s/it]
100%|██████████| 60/60 [00:27<00:00,  2.21it/s]


------------------------------
EPOCH: 05 | Train Loss: 0.8081
EPOCH: 05 | Valid Loss: 0.9109
------------------------------
Training Time: 4220.35s


## Evalation Exact Match and F1-score

In [53]:
def get_answers(index):
  question = val_questions[index]
  context = val_context[index]
  true_answer = val_answers[index]['text']

  tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
  inputs = tokenizer.encode_plus(question, context, return_tensors='pt')

  model = AutoModelForQuestionAnswering.from_pretrained(MODEL_SAVE_PATH)
  outputs = model(**inputs)
  answer_start = torch.argmax(outputs[0])
  answer_end = torch.argmax(outputs[1]) + 1

  pred_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
  return true_answer, pred_answer

In [54]:
def normalize_text(text):
  regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) # remove articles
  text = " ".join(re.sub(regex, " ", text).split()) # fix white spase
  text = "".join(ch for ch in text if ch not in set(string.punctuation)) # remove punctuations
  return text.lower()

In [56]:
def compute_exact_match(prediction, truth):
  return int(normalize_text(prediction) == normalize_text(truth))

In [57]:
def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  common_tokens = set(pred_tokens) & set(truth_tokens)

  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0

  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [65]:
random_idx = random.randint(0, len(val_questions)-1)
true_ans, pred_ans = get_answers(random_idx)
exact_match = compute_exact_match(pred_ans, true_ans)
f1 = compute_f1(pred_ans, true_ans)

print(f"Question: {val_questions[random_idx]}")
print(f"True Answer: {true_ans}")
print(f"Predicted Answer: {pred_ans}")
print(f"Exact Match: {exact_match}")
print(f"F1 Score: {f1}")

Question: Which kinase is inhibited by Tripolin A?
True Answer: Aurora A
Predicted Answer:  Aurora A
Exact Match: 1
F1 Score: 1.0


In [59]:
exact_match_list = []
f1_list = []
for i in range(len(val_questions)-1):
  true_ans, pred_ans = get_answers(i)
  exact_match = compute_exact_match(pred_ans, true_ans)
  f1 = compute_f1(pred_ans, true_ans)
  exact_match_list.append(exact_match)
  f1_list.append(f1)

In [60]:
avg_em = sum(exact_match_list) / len(exact_match_list)
avg_f1 = sum(f1_list) / len(f1_list)

print(f"Average Exact Match: {avg_em}")
print(f"Average F1 Score: {avg_f1}")

Average Exact Match: 0.8167848699763594
Average F1 Score: 0.8560528322123386


## Inference with test data

In [66]:
test_questions = []
test_context = []

for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-test-factoid-*'):
  with open(path) as f:
    test_data = json.load(f)
    for q in test_data['data'][0]['paragraphs']:
      test_questions.append(q['qas'][0]['question'])
      test_context.append(q['context'])

In [67]:
len(test_questions), len(test_context)

(2184, 2184)

In [69]:
test_questions[0], test_context[0]

('What is the combined effect of Nfat and miR-25?',
 'Nfat and miR-25 cooperate to reactivate the transcription factor Hand2 in heart failure.')

In [70]:
def model_inference(question, context, model_path=MODEL_SAVE_PATH):
  model = AutoModelForQuestionAnswering.from_pretrained(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  start = timeit.default_timer()
  qa_model = pipeline(task="question-answering", model=model, tokenizer=tokenizer)
  stop = timeit.default_timer()
  print(f"Inference Time: {stop-start:.2f}s")
  return qa_model(question=question, context=context)

In [71]:
def display_result(context, result):
  before_text = context[:result['start']]
  colored_text = context[result['start']:result['end']+1]
  after_text = context[result['end']+1:]
  # print(f"Answer: {result['answer']}")
  display(HTML(f"""<p style='font-size: 16px; width: 50%;'>{before_text}
    <span style='background-color: #33447f; color: white; width: {len(result["answer"])}em;'>{colored_text}</span>
    {after_text}</p>"""))

In [111]:
random_idx = random.randint(0, len(test_questions)-1)
question = test_questions[random_idx]
context = test_context[random_idx]
print(question)
result = model_inference(question, context)
result

What distinguishes lantibiotics from antibiotics?
Inference Time: 0.00s


{'score': 0.17087984085083008,
 'start': 109,
 'end': 154,
 'answer': 'Lantibiotics are biologically active peptides'}

In [112]:
display_result(context, result)