<a href="https://colab.research.google.com/github/RaIvPa/Improving-BERT-for-Biomedical-QA/blob/main/BERT_base_sentence_classification_with_BioASQ_preprocessing_BioBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets transformers[sentencepiece]
!pip install py-rouge
!pip install torch

import transformers
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from datasets import load_dataset, load_metric
import numpy as np
import csv
from csv import reader
import json
from rouge import Rouge
import nltk
from torch import nn
from transformers.modeling_tf_utils import get_initializer
from tensorflow.keras.layers import Dense, Activation

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [33]:
metric = Rouge(metrics=["rouge-n"], max_n=2)

checkpoint = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
up_model = TFAutoModel.from_pretrained(checkpoint, from_pt=True, num_labels=2, output_hidden_states=True)
input_layer = tf.keras.layers.Input(shape=(up_model.config.hidden_size,), dtype=tf.int64)
up_model = up_model(input_layer)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint

In [34]:
# model.layers[0].pooler.dense.activation = tf.keras.activations.linear

output = Dense(2, activation=tf.keras.activations.sigmoid)(up_model["hidden_states"][0])
model = tf.keras.Model(inputs=input_layer, outputs=output)

In [4]:
def create_training_dataset(csv_name, data_file_name, data_num):  
  csv_file = open(csv_name, 'w')
  data_file = open(data_file_name, 'r')

  data_load = json.load(data_file)
  csv_write = csv.writer(csv_file)
  csv_write.writerow(["question", "snippet", "rouge-2", "label"])

  rouge_score_get = Rouge(metrics=["rouge-n"], max_n=2)
    
  rows = []
  top_n = 5
  question_num = 0
  for question_set in data_load["questions"]:
    if question_num <= data_num:
      sub_rows = []
      question = question_set["body"]
      top = []
      top_id = {}
      snip_num = 0
      chosen = 0
      for snippet in question_set["snippets"]:
          snip_text = snippet["text"]
          rouge_score = rouge_score_get.get_scores(question, snip_text)
          rouge_2_f = rouge_score["rouge-2"]["f"]
          top.append(rouge_2_f)
          sub_rows.append([question, snip_text, rouge_2_f])
        
      top_indx = sorted(range(len(top)), key=lambda i: top[i], reverse=True)[:5]

      for indx in top_indx:
        sub_rows[indx].append(1)

      for row in sub_rows:
        if len(row) < 4:
          row.append(0)

      rows.append(sub_rows)
    question_num += 1

  for final_row in rows:
    csv_write.writerows(final_row)

  csv_file.close()
  data_file.close()

In [5]:
def create_validation_dataset(csv_name, data_file_name, data_num):  
  csv_file = open(csv_name, 'w')
  data_file = open(data_file_name, 'r')

  data_load = json.load(data_file)
  csv_write = csv.writer(csv_file)
  csv_write.writerow(["question", "snippet", "rouge-2", "label"])

  rouge_score_get = Rouge(metrics=["rouge-n"], max_n=2)
    
  rows = []
  top_n = 5
  question_num = 0
  for question_set in data_load["questions"]:
    if question_num > data_num and question_num <= data_num*2:
      sub_rows = []
      question = question_set["body"]
      top = []
      top_id = {}
      snip_num = 0
      chosen = 0
      for snippet in question_set["snippets"]:
          snip_text = snippet["text"]
          rouge_score = rouge_score_get.get_scores(question, snip_text)
          rouge_2_f = rouge_score["rouge-2"]["f"]
          top.append(rouge_2_f)
          sub_rows.append([question, snip_text, rouge_2_f])
        
      top_indx = sorted(range(len(top)), key=lambda i: top[i], reverse=True)[:5]

      for indx in top_indx:
        sub_rows[indx].append(1)

      for row in sub_rows:
        if len(row) < 4:
          row.append(0)

      rows.append(sub_rows)
    question_num += 1

  for final_row in rows:
    csv_write.writerows(final_row)

  csv_file.close()
  data_file.close()

In [6]:
def create_test_dataset(csv_name, data_file_name):  
  csv_file = open(csv_name, 'w')
  data_file = open(data_file_name, 'r')

  data_load = json.load(data_file)
  csv_write = csv.writer(csv_file)
  csv_write.writerow(["question", "snippet", "rouge-2", "label"])

  rouge_score_get = Rouge(metrics=["rouge-n"], max_n=2)
    
  rows = []
  top_n = 5
  for question_set in data_load["questions"]:
    sub_rows = []
    question = question_set["body"]
    top = []
    top_id = {}
    snip_num = 0
    chosen = 0
    for snippet in question_set["snippets"]:
        snip_text = snippet["text"]
        rouge_score = rouge_score_get.get_scores(question, snip_text)
        rouge_2_f = rouge_score["rouge-2"]["f"]
        top.append(rouge_2_f)
        sub_rows.append([question, snip_text, rouge_2_f])
      
    top_indx = sorted(range(len(top)), key=lambda i: top[i], reverse=True)[:5]

    for indx in top_indx:
      sub_rows[indx].append(1)

    for row in sub_rows:
      if len(row) < 4:
        row.append(0)

    rows.append(sub_rows)

  for final_row in rows:
    csv_write.writerows(final_row)

  csv_file.close()
  data_file.close()

In [7]:
def create_ideal_answer_training(data_file_name, data_num):  
  data_file = open(data_file_name, 'r')

  data_load = json.load(data_file)

  question_answers = []
  question_num = 0
  
  for question_set in data_load["questions"]:
    if question_num <= data_num:
      question_answers.append(question_set["ideal_answer"])
    question_num += 1

  data_file.close()
  return question_answers

In [8]:
def create_ideal_answer_validation(data_file_name, data_num):  
  data_file = open(data_file_name, 'r')

  data_load = json.load(data_file)
    
  question_answers = []
  question_num = 0

  for question_set in data_load["questions"]:
    if question_num > data_num:
      question_answers.append(question_set["ideal_answer"])
    question_num += 1

  data_file.close()
  return question_answers

CSV column format is


*   Question
*   Snippet
*   ROUGE score
*   Label




In [9]:

nltk.download("punkt")

dataset_split_num = 250
train_path = '/content/drive/MyDrive/Thesis/training8b.json'
validation_path = '/content/drive/MyDrive/Thesis/training8b.json'
test_path = "/content/drive/MyDrive/Thesis/8B3_golden.json"
create_training_dataset('training_data.csv', train_path, dataset_split_num)
create_validation_dataset('validation_data.csv', validation_path, dataset_split_num)
create_test_dataset('test_data.csv', test_path)
ideal_train = create_ideal_answer_training(train_path, dataset_split_num)
ideal_val = create_ideal_answer_validation(validation_path, dataset_split_num)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
datasets = load_dataset('csv', data_files={"train": 'training_data.csv', 
                                           "validation": "validation_data.csv",
                                           "test": "test_data.csv"
                                           })
print(datasets)
print(ideal_train)
print(ideal_val)
print(datasets["train"]["label"])

Using custom data configuration default-9d5f625382208792


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-9d5f625382208792/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-9d5f625382208792/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'snippet', 'rouge-2', 'label'],
        num_rows: 3829
    })
    validation: Dataset({
        features: ['question', 'snippet', 'rouge-2', 'label'],
        num_rows: 3510
    })
    test: Dataset({
        features: ['question', 'snippet', 'rouge-2', 'label'],
        num_rows: 975
    })
})
[["Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model."], ['The 7 known EGFR ligands  are: epidermal growth factor (EGF), betacellulin (BTC), epiregulin (EPR), heparin-binding EGF (HB-EGF), transforming growth factor-α [TGF-α], amphiregulin (AREG) and epigen (EPG).'], ['Yes,  papil

In [11]:
def tokenize_dataset(dataset):
  encoded = tokenizer(
      dataset["question"],
      dataset["snippet"],
      padding=True,
      truncation=True,
      return_tensors="np",
  )
  return encoded.data

In [12]:
tok_data = {
    split: tokenize_dataset(datasets[split]) for split in datasets.keys()
}

# tok_data = tokenize_dataset(datasets["train"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
print(tok_data)

{'train': {'input_ids': array([[  101,  1110, 20844, ...,     0,     0,     0],
       [  101,  1110, 20844, ...,     0,     0,     0],
       [  101,  1110, 20844, ...,     0,     0,     0],
       ...,
       [  101,  1674,  4968, ...,     0,     0,     0],
       [  101,  1134, 25128, ...,     0,     0,     0],
       [  101,  1134, 25128, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}, 'validation': {'input_ids': array([[ 101, 5250, 1566, ...,    0,    0,    0],
       [ 101, 5250, 1566, ...,    0,    0,    0],
       [ 101, 5250, 1566, ...,    0,    0,    0],

In [14]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

In [15]:
bat_size = 8
epoch_num = 1

train_steps = (len(tok_data["train"]["input_ids"]) // bat_size) * epoch_num
lr_scheduler = PolynomialDecay(
    initial_learning_rate=1e-4,
    end_learning_rate=0.,
    decay_steps=train_steps
    )

In [16]:
from tensorflow.keras.optimizers import Adam

In [17]:
opt = Adam(learning_rate=lr_scheduler)

In [18]:
from tensorflow.keras.losses import BinaryCrossentropy, SparseCategoricalCrossentropy, MeanSquaredError
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras import Input

In [35]:
from tqdm.keras import TqdmCallback

model.compile(optimizer=opt, loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
# model.compile(optimizer=opt, loss=MeanSquaredError(), metrics=["accuracy"])

model.fit(
    tok_data["train"]["input_ids"],
    np.array(datasets["train"]["label"]),
    validation_data=(
        tok_data["validation"]["input_ids"],
        np.array(datasets["validation"]["label"])
    ),
    validation_batch_size=bat_size,
    batch_size=bat_size,
    verbose=0,
    callbacks=[TqdmCallback(verbose=2)],
    epochs=epoch_num,
    # steps_per_epoch=1 # for debugging
)

0epoch [00:00, ?epoch/s]

  0%|          | 0.00/479 [00:00<?, ?batch/s]



  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '




InvalidArgumentError: ignored

Linear Regression - BERT base model (with only training data)

479/479 batches

loss: 4.4516

accuracy: 0.5377

Logistic Regression - BERT base model (with only training data) (accuracy starts to go down past 370)


479/479 batches

loss: 0.6272

accuracy: 0.6929

adding validation data seems to cause problems with the input layer for linear approach (not the case, see below)

loss seems to go nan when adding validation data (problem fixed; len function in training steps wasn't accounting the fact that there were now two datasets in tok_data)

fitting code seems to keep executing past the first 479 steps, but doesn't show more fitting output (might be because of epoch number discrepancies between the scheduler and fit function)

In [None]:
# for each question:
  # count snippets
  # for each snippet:
    # check if it is in top 3 likelihood
  # concatenate top 3 snippets together to make summary
  # measure rouge-2 f score between summary and ideal answer
# show overall rouge-2 f score between summaries and ideal answers

# for each question:
  # ideal answer, snippets, snippet rouge-2 scores


#todo - make ideal answer scraper

# model.predict

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
# data = "validation"
# predictions = model.predict(tok_data[data])
train_predictions = model.predict(tok_data["train"])


ResourceExhaustedError: ignored

In [None]:
validation_predictions = model.predict(tok_data["validation"])

In [None]:
def combine_for_ranking(dataset, set_predictions):
  curr_q = ""
  q_lists = {}
  track = 0
  for q, snippet in zip(datasets[dataset]["question"], datasets[dataset]["snippet"]):
      logits = set_predictions["logits"][track]
      # print(track)
      track+=1
      if q != curr_q:
        curr_q = q
        q_lists[q] = []
        q_lists[q].append([snippet])
        q_lists[q].append([logits])
      elif q == curr_q:
        q_lists[q][0].append(snippet)
        q_lists[q][1].append(logits)

  track = 0
  return q_lists

In [None]:
def get_top_n(q_list, top_n):
  curr_top_3_indicies = []
  summaries = []

  for question in q_list:
    for idx, logits in enumerate(q_list[question][1], 0):
      if len(curr_top_3_indicies) < top_n:
        curr_top_3_indicies.append(idx)
      else:
        lowest = 0
        for search_idx, val in enumerate(curr_top_3_indicies):
          if q_list[question][1][val][1] > q_list[question][1][curr_top_3_indicies[lowest]][1]:
            lowest = search_idx
        if logits[1] > q_list[question][1][curr_top_3_indicies[lowest]][1]:
          curr_top_3_indicies[lowest] = idx
    
    to_append = ""
    while len(curr_top_3_indicies) != 0:
      highest_idx = 0
      for idx_idx, item in enumerate(curr_top_3_indicies):
        if q_list[question][1][item][1] > q_list[question][1][curr_top_3_indicies[highest_idx]][1]:
          highest_idx = idx_idx
      to_append = to_append + q_list[question][0][curr_top_3_indicies[highest_idx]] + " "
      curr_top_3_indicies.pop(highest_idx)
    summaries.append(to_append)
    curr_top_3_indicies.clear()

  return summaries
  # remove abs

In [None]:
train_rank = combine_for_ranking("train", train_predictions)
validation_rank = combine_for_ranking("validation", validation_predictions)
train_hypothesis = get_top_n(train_rank, 3)
validation_hypothesis = get_top_n(validation_rank, 3)

In [None]:
results = metric._get_scores_rouge_n(validation_hypothesis, ideal_val)
print(results)

In [None]:
def get_results_json(initial_json_file, results_json_name, results):
  data_load = json.load(open(initial_json_file, "r"))
  results_json = open(results_json_name, "w")

  for question, result in zip(data_load["questions"], results):
    question["ideal_answer"] = result

  json.dump(obj=data_load, fp=results_json, indent=4)
  results_json.close()

Training and validation

In [None]:
all_hypothesis = []
for item in train_hypothesis:
  all_hypothesis.append(item)

for item in validation_hypothesis:
  all_hypothesis.append(item)

get_results_json(train_path, "results.json", all_hypothesis)

Training and validation

In [None]:
test_predictions = model.predict(tok_data["test"])



In [None]:
test_rank = combine_for_ranking("test", test_predictions)

In [None]:
test_hypothesis = get_top_n(test_rank, 3)

In [None]:
def get_test_results_json(initial_json_file, results_json_name, results):
  data_load = json.load(open(initial_json_file, "r"))
  results_json = open(results_json_name, "w")

  for question, result in zip(data_load["questions"], results):
    if question["type"] == "yesno":
      question["exact_answer"] = "yes" 
      question["ideal_answer"] = result
    elif question["type"] != "summary":
      question["exact_answer"] = "" 
      question["ideal_answer"] = result
    elif question["type"] == "summary":
      question["ideal_answer"] = result
  
  json.dump(obj=data_load, fp=results_json, indent=4)
  results_json.close()

In [None]:
get_test_results_json(test_path, "test_results.json", test_hypothesis)

issue with compute function not taking the validation label list due to type mismatch: expected bytes, got int