In [1]:
import os
import glob
import json
import timeit
import random
import requests
from tqdm import tqdm

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from IPython.display import display, HTML

## Download the data

In [2]:
# os.makedirs('./data/', exist_ok=True)

In [3]:
# URL = 'https://raw.githubusercontent.com/Popescu-PfeifferMarc/ir-bioasq/refs/heads/master/dataset/training12b_new.json?token=GHSAT0AAAAAAC2O7C4GR5LW5AMT3KY4M6M4Z2CJTGQ'
# DATA_PATH = './data/training12B.json'

In [4]:
# res = requests.get(URL)
# try:
#   data_dict = res.json()
# except json.JSONDecodeError as e:
#   print("Error parsing JSON: ", e)
#   raise

# with open(DATA_PATH, 'wb') as f:
#   f.write(res.content)

# with open(DATA_PATH, 'rb') as f:
#   data_dict = json.load(f)

## Reading the data

In [5]:
# data_dict.keys()

In [6]:
# questions = []
# answers = []
# context = []

# for question in data_dict['questions']:
#     questions.append(question['body'])

#     if question['snippets']:
#         longest_entry = max(question['snippets'], key=lambda x: len(x["text"]))
#         context.append(longest_entry['text'])

#         answers.append(dict(
#             text=question.get('ideal_answer', [None])[0],
#             start_idx=longest_entry['offsetInBeginSection'],
#             end_idx=longest_entry['offsetInEndSection']))

In [7]:
# questions = []
# answers = []
# context = []

# for q in data_dict['questions']:
#   questions.append(q['body'])
#   context.append(q['snippets'][0]['text'])

#   answers.append(dict(text=q['ideal_answer'][0],
#                       start_idx=q['snippets'][0]['offsetInBeginSection'],
#                       end_idx= q['snippets'][0]['offsetInEndSection']))

In [8]:
# len(questions), len(answers), len(context)

In [9]:
# train_questions, val_questions, train_answers, val_answers, train_context, val_context = train_test_split(questions,
#                                                                                                           answers,
#                                                                                                           context,
#                                                                                                           test_size=0.2,
#                                                                                                           random_state=42)

In [10]:
# test_questions, val_questions, test_answers, val_answers, test_context, val_context = train_test_split(val_questions,
#                                                                                                        val_answers,
#                                                                                                        val_context,
#                                                                                                        test_size=0.5,
#                                                                                                        random_state=42)

In [11]:
# len(train_questions), len(val_questions), len(test_questions)

In [12]:
# len(train_answers), len(val_answers), len(test_answers)

In [13]:
# len(train_context), len(val_context), len(test_context)

## Download

### Dataset reference: https://github.com/dmis-lab/biobert

In [14]:
!wget http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz

--2024-11-24 16:48:31--  http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz
Resolving nlp.dmis.korea.edu (nlp.dmis.korea.edu)... 163.152.163.168
Connecting to nlp.dmis.korea.edu (nlp.dmis.korea.edu)|163.152.163.168|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29610233 (28M) [application/x-gzip]
Saving to: ‘datasets.tar.gz’


2024-11-24 16:49:28 (520 KB/s) - ‘datasets.tar.gz’ saved [29610233/29610233]



In [15]:
!tar -xzvf datasets.tar.gz

datasets/
datasets/RE/
datasets/RE/GAD/
datasets/RE/GAD/6/
datasets/RE/GAD/6/test.tsv
datasets/RE/GAD/6/dev.tsv
datasets/RE/GAD/6/train.tsv
datasets/RE/GAD/7/
datasets/RE/GAD/7/test.tsv
datasets/RE/GAD/7/dev.tsv
datasets/RE/GAD/7/train.tsv
datasets/RE/GAD/5/
datasets/RE/GAD/5/test.tsv
datasets/RE/GAD/5/dev.tsv
datasets/RE/GAD/5/train.tsv
datasets/RE/GAD/8/
datasets/RE/GAD/8/test.tsv
datasets/RE/GAD/8/dev.tsv
datasets/RE/GAD/8/train.tsv
datasets/RE/GAD/4/
datasets/RE/GAD/4/test.tsv
datasets/RE/GAD/4/dev.tsv
datasets/RE/GAD/4/train.tsv
datasets/RE/GAD/1/
datasets/RE/GAD/1/test.tsv
datasets/RE/GAD/1/dev.tsv
datasets/RE/GAD/1/train.tsv
datasets/RE/GAD/2/
datasets/RE/GAD/2/test.tsv
datasets/RE/GAD/2/dev.tsv
datasets/RE/GAD/2/train.tsv
datasets/RE/GAD/3/
datasets/RE/GAD/3/test.tsv
datasets/RE/GAD/3/dev.tsv
datasets/RE/GAD/3/train.tsv
datasets/RE/GAD/9/
datasets/RE/GAD/9/test.tsv
datasets/RE/GAD/9/dev.tsv
datasets/RE/GAD/9/train.tsv
datasets/RE/GAD/10/
datasets/RE/GAD/10/test.tsv
datasets/RE/

In [16]:
!rm -rf datasets.tar.gz

In [17]:
# with open('/content/datasets/QA/BioASQ/BioASQ-train-factoid-4b.json') as f:
#   data = json.load(f)

In [18]:
# data.keys()

In [19]:
# data['data'][0].keys()

In [20]:
# data['data'][0]['paragraphs'][0]

In [24]:
def extract_data(file_path):

  questions_list = []
  context_list = []
  answers_list = []

  with open(file_path) as f:
    data = json.load(f)

  questions = data['data'][0]['paragraphs']

  for question in questions:
    context = question['context']
    for qna in question['qas']:
      question = qna['question']
      answers = qna['answers']
      for ans in answers:
        answer = ans['text']
        start_idx = ans['answer_start']
        end_idx = start_idx + len(answer)
        questions_list.append(question)
        context_list.append(context)
        answers_list.append(dict(text=answer, start_idx=start_idx, end_idx=end_idx))

  return questions_list, context_list, answers_list

In [25]:
for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-train-factoid*'):
  print(path)

/content/datasets/QA/BioASQ/BioASQ-train-factoid-4b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-6b.json
/content/datasets/QA/BioASQ/BioASQ-train-factoid-5b.json


In [26]:
questions = []
context = []
answers = []

for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-train-factoid-*'):
  questions, context, answers = extract_data(path)
  questions.extend(questions)
  context.extend(context)
  answers.extend(answers)

In [27]:
len(questions), len(context), len(answers)

(9900, 9900, 9900)

In [28]:
questions[0], context[0], answers[0]

('Name synonym of Acrokeratosis paraneoplastica.',
 'Acrokeratosis paraneoplastica (Bazex syndrome): report of a case associated with small cell lung carcinoma and review of the literature. Acrokeratosis paraneoplastic (Bazex syndrome) is a rare, but distinctive paraneoplastic dermatosis characterized by erythematosquamous lesions located at the acral sites and is most commonly associated with carcinomas of the upper aerodigestive tract. We report a 58-year-old female with a history of a pigmented rash on her extremities, thick keratotic plaques on her hands, and brittle nails. Chest imaging revealed a right upper lobe mass that was proven to be small cell lung carcinoma. While Bazex syndrome has been described in the dermatology literature, it is also important for the radiologist to be aware of this entity and its common presentations.',
 {'text': 'Bazex syndrome', 'start_idx': 31, 'end_idx': 45})

In [29]:
train_questions, val_questions, train_answers, val_answers, train_context, val_context = train_test_split(questions,
                                                                                                          answers,
                                                                                                          context,
                                                                                                          test_size=0.1,
                                                                                                          random_state=42)

In [30]:
len(train_questions), len(val_questions), len(train_answers), len(val_answers), len(train_context), len(val_context)

(8910, 990, 8910, 990, 8910, 990)

In [31]:
# test_questions, val_questions, test_answers, val_answers, test_context, val_context = train_test_split(val_questions,
#                                                                                                        val_answers,
#                                                                                                        val_context,
#                                                                                                        test_size=0.5,
#                                                                                                        random_state=42)

In [32]:
# len(train_questions), len(test_questions), len(val_questions)

In [33]:
# test_questions = []
# test_context = []
# test_answers = []

# for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-test-factoid-*'):
#   questions, context, answers = extract_data(path)
#   test_questions.extend(questions)
#   test_context.extend(context)
#   test_answers.extend(answers)

In [34]:
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

MODEL_NAME = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [35]:
train_encodings = tokenizer(train_context, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_context, val_questions, truncation=True, padding=True)
# test_encodings = tokenizer(test_context, test_questions, truncation=True, padding=True)

In [36]:
def add_token_positions(encodings, answers):
  # initialize lists to contain the token indices of answer start/end
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    # append start/end token position using char_to_token method
    # Check if start_idx is non-negative before calling char_to_token
    start_idx = answers[i]['start_idx']
    if start_idx >= 0:
      start_positions.append(encodings.char_to_token(i, start_idx))
    else:
      # Handle negative start_idx, e.g., set to 0 or skip
      start_positions.append(0)  # or None, depending on your logic

    # Check if end_idx is non-negative before calling char_to_token
    end_idx = answers[i]['end_idx']
    if end_idx >= 0:
      end_positions.append(encodings.char_to_token(i, end_idx))
    else:
      # Handle negative end_idx, e.g., set to 0 or skip
      end_positions.append(0)  # or None, depending on your logic

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    # end position cannot be found, char_to_token found space, so shift one token forward
    go_back = 1
    while end_positions[-1] is None:
      end_pos_idx = answers[i]['end_idx'] - go_back
      # Check if end_pos_idx is non-negative before calling char_to_token
      if end_pos_idx >= 0:
        end_positions[-1] = encodings.char_to_token(i, end_pos_idx)
      else:
        # Handle negative end_pos_idx, e.g., set to 0 or break
        end_positions[-1] = 0  # or break the loop
        break
      go_back += 1
  # update our encodings object with the new token-based start/end positions
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [37]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
# add_token_positions(test_encodings, test_answers)

In [38]:
class BioASQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = BioASQDataset(train_encodings)
val_dataset = BioASQDataset(val_encodings)
# test_dataset = BioASQDataset(test_encodings)

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# !rm -rf /content/drive/MyDrive/models/deepset

In [41]:
VERSION = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
EPOCHS = 3
MODEL_SAVE_PATH = f"/content/drive/MyDrive/models/{MODEL_NAME}-lr{LEARNING_RATE}-epoch{EPOCHS}-v{VERSION}/"

In [42]:
transformers.utils.logging.set_verbosity_error()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [43]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
print(model.num_parameters())

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

124056578


In [44]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

# test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                               batch_size=BATCH_SIZE,
#                                               shuffle=False)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

start = timeit.default_timer()
for epoch in range(EPOCHS):
  model.train()
  train_running_loss = 0
  for idx, sample in enumerate(tqdm(train_dataloader, leave=True)):
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)
    start_positions = sample['start_positions'].to(device)
    end_positions = sample['end_positions'].to(device)
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions)

    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_running_loss += loss.item()

  train_loss = train_running_loss / (idx + 1)

  model.eval()
  val_running_loss = 0
  with torch.inference_mode():
    for idx, sample in enumerate(tqdm(val_dataloader)):
      input_ids = sample['input_ids'].to(device)
      attention_mask = sample['attention_mask'].to(device)
      start_positions = sample['start_positions'].to(device)
      end_positions = sample['end_positions'].to(device)
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      start_positions=start_positions,
                      end_positions=end_positions)

      val_running_loss += outputs.loss.item()
    val_loss = val_running_loss / (idx + 1)

  print("-"*30)
  print(f"EPOCH: {epoch+1:02d} | Train Loss: {train_loss:.4f}")
  print(f"EPOCH: {epoch+1:02d} | Valid Loss: {val_loss:.4f}")
  print("-"*30)
  stop = timeit.default_timer()
  print(f"Training Time: {stop-start:.2f}s")

  model.save_pretrained(MODEL_SAVE_PATH)
  tokenizer.save_pretrained(MODEL_SAVE_PATH)

  torch.cuda.empty_cache()

100%|██████████| 557/557 [14:22<00:00,  1.55s/it]
100%|██████████| 62/62 [00:28<00:00,  2.15it/s]


------------------------------
EPOCH: 01 | Train Loss: 0.8564
EPOCH: 01 | Valid Loss: 0.8830
------------------------------
Training Time: 891.62s


100%|██████████| 557/557 [14:22<00:00,  1.55s/it]
100%|██████████| 62/62 [00:28<00:00,  2.15it/s]


------------------------------
EPOCH: 02 | Train Loss: 0.8083
EPOCH: 02 | Valid Loss: 0.8104
------------------------------
Training Time: 1785.62s


  9%|▉         | 49/557 [01:15<13:01,  1.54s/it]

In [42]:
preds = []
true = []
running_accuracy = []

model.eval()
with torch.inference_mode():
  for idx, sample in enumerate(tqdm(val_dataloader, leave=True)):
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)
    start_positions = sample['start_positions']
    end_positions = sample['end_positions']

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1).cpu().detach()
    end_pred = torch.argmax(outputs['end_logits'], dim=1).cpu().detach()

    preds.extend([[int(i), int(j)] for i, j in zip(start_pred, end_pred)])
    true.extend([[int(i), int(j)] for i, j in zip(start_positions, end_positions)])

    running_accuracy.append(((start_pred == start_positions).sum()/len(start_positions)).item())
    running_accuracy.append(((end_pred == end_positions).sum()/len(end_positions)).item())

preds = [item for sublist in preds for item in sublist]
true = [item for sublist in true for item in sublist]

accuracy = sum(running_accuracy)/len(running_accuracy) # average accuracy
f1_value = f1_score(true, preds, average="macro")
print(f"\nAccuracy: {accuracy*100:.2f}% | F1 Score: {f1_value*100:.2f}%")

100%|██████████| 62/62 [00:28<00:00,  2.14it/s]


Accuracy: 62.28% | F1 Score: 43.94%





In [45]:
test_questions = []
test_context = []

for path in glob.glob('/content/datasets/QA/BioASQ/BioASQ-test-factoid-*'):
  with open(path) as f:
    test_data = json.load(f)
    for q in test_data['data'][0]['paragraphs']:
      test_questions.append(q['qas'][0]['question'])
      test_context.append(q['context'])

In [47]:
len(test_questions), len(test_context)

(2184, 2184)

In [50]:
test_questions[0], test_context[0]

('Which type of genes are modulated by SATB1?',
 "Repression of the genome organizer SATB1 in regulatory T cells is required for suppressive function and inhibition of effector differentiation. Regulatory T cells (T(reg) cells) are essential for self-tolerance and immune homeostasis. Lack of effector T cell (T(eff) cell) function and gain of suppressive activity by T(reg) cells are dependent on the transcriptional program induced by Foxp3. Here we report that repression of SATB1, a genome organizer that regulates chromatin structure and gene expression, was crucial for the phenotype and function of T(reg) cells. Foxp3, acting as a transcriptional repressor, directly suppressed the SATB1 locus and indirectly suppressed it through the induction of microRNAs that bound the SATB1 3' untranslated region. Release of SATB1 from the control of Foxp3 in T(reg) cells caused loss of suppressive function, establishment of transcriptional T(eff) cell programs and induction of T(eff) cell cytokines.

In [48]:
def model_inference(question, context, model_path=MODEL_SAVE_PATH):
  model = AutoModelForQuestionAnswering.from_pretrained(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  start = timeit.default_timer()
  qa_model = pipeline(task="question-answering", model=model, tokenizer=tokenizer)
  stop = timeit.default_timer()
  print(f"Inference Time: {stop-start:.2f}s")
  return qa_model(question=question, context=context)

In [49]:
def display_result(context, result):
  before_text = context[:result['start']]
  colored_text = context[result['start']:result['end']+1]
  after_text = context[result['end']+1:]
  # print(f"Answer: {result['answer']}")
  display(HTML(f"""<p style='font-size: 16px; width: 50%;'>{before_text}
    <span style='background-color: #33447f; color: white; width: {len(result["answer"])}em;'>{colored_text}</span>
    {after_text}</p>"""))

In [67]:
random_idx = random.randint(0, len(test_questions)-1)
question = test_questions[random_idx]
context = test_context[random_idx]
print(question)
result = model_inference(question, context)
result

Which disease is treated with Fexinidazole?
Inference Time: 0.00s


{'score': 0.23135346174240112,
 'start': 113,
 'end': 129,
 'answer': 'trypanosomiasis.'}

In [68]:
display_result(context, result)