In [1]:
!pip install -q transformers

[0m

In [2]:
from transformers import AutoModel

# Identifier of the pretrained checkpoint handed to `from_pretrained`.
pretrained = 'dmis-lab/biobert-large-cased-v1.1-squad'
# AutoModel resolves this checkpoint to a BertModel (see the warning printed below).
# That warning also reports that the pooler weights are missing from the checkpoint
# and are newly initialized, i.e. the pooler is untrained.
model = AutoModel.from_pretrained(pretrained)


Some weights of BertModel were not initialized from the model checkpoint at dmis-lab/biobert-large-cased-v1.1-squad and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from absl import flags

# Command-line style configuration, registered on absl's global flag registry.
# NOTE: absl rejects a second definition of the same flag name, so re-running this
# cell in the same kernel raises a duplicate-flag error; restart the kernel first.

# --- File and directory paths (all default to None) ---
flags.DEFINE_string("bert_config_file", None, "config json file corresponding to the pretrained BERT model")
flags.DEFINE_string("vocab_file", None, "vocab file on which BERT was trained")
# The flag above was originally registered as "vocab_File" (capital F), unlike every
# other flag here. The old spelling is kept as an alias so existing references to it
# keep working.
flags.DEFINE_alias("vocab_File", "vocab_file")
flags.DEFINE_string("output_dir", None, "output dir for model checkpoints")
flags.DEFINE_string("train_file", None, "squad-formatted json for training. E.g., train-v1.1.json")
flags.DEFINE_string("predict_file", None, "squad-formatted json for predictions. E.g., dev(test)-v1.1.json")
flags.DEFINE_string("init_checkpoint", None, "initial checkpoint (usually from a pretrained BERT model)")

# --- Boolean switches ---
# NOTE(review): the checkpoint loaded earlier in this notebook has "cased" in its
# name; confirm that lower-casing by default is really intended.
flags.DEFINE_bool("do_lower_case", True, "whether to lower-case input text. True for lower case.")
flags.DEFINE_bool("do_train", True, "whether to run training")
flags.DEFINE_bool("do_predict", True, "whether to run eval on the dev set")
flags.DEFINE_bool("use_tpu", False, "whether to use TPU or GPU/CPU")
flags.DEFINE_bool("verbose_logging", False, "whether to print all warnings during data processing. Other warnings are printed by default.")

# --- Sequence-length, batching and checkpointing sizes ---
flags.DEFINE_integer("max_seq_len", 384, "maximum input sequence after WordPiece tokenization. Will be padded if shorter, truncated if longer.")
flags.DEFINE_integer("doc_stride", 128, "when splitting up a long document into chunks, how much stride to take between chunks")
flags.DEFINE_integer("max_query_len", 64, "maximum query sequence after WordPiece tokenization. Will be truncated if longer.")
flags.DEFINE_integer("predict_batch_size", 8, "total batch size for predictions")
flags.DEFINE_integer("save_checkpoint_step", 1000, "how often to save model checkpoints")
flags.DEFINE_integer("iterations_per_loop", 1000, "how many steps to make in each estimator call")
flags.DEFINE_integer("n_best_size", 20, "how many n-best predictions to generate in nbest_predictions.json output file")
flags.DEFINE_integer("max_answer_len", 30, "maximum length of generated answer")

# --- Optimisation hyper-parameters ---
flags.DEFINE_float("learning_rate", 5e-5, "initial learning rate for Adam optimizer")
flags.DEFINE_float("num_train_epochs", 3.0, "number of training epochs")
flags.DEFINE_float("warmup_proportion", 0.1, "proportion of training for linear lr warmup. E.g., 0.1 refers to 10% of training")
flags.DEFINE_float("null_score_diff_threshold", 0.0, "to predict null if (null_score - best_non_null) > threshold")

<absl.flags._flagvalues.FlagHolder at 0x7acdaff90e20>

In [6]:
import sys
# Put this folder on the import path at position 1 (right after the first entry).
# NOTE(review): presumably this is where the `tokenization` module imported in the
# next cell lives; it is an absolute Colab/Drive path, so confirm it exists (Drive
# mounted) before running elsewhere.
sys.path.insert(1, '/content/drive/MyDrive/biobert')

In [10]:
import tokenization

class SquadExample(object):
  """One question about one context paragraph, optionally with its answer span.

  Args:
    qas_id: identifier of the question.
    question_text: the question string.
    doc_tokens: list of context tokens (strings); joined with spaces in the repr.
    origin_answer_text: answer text, or None when not provided.
    start_pos: index of the first answer token, or None when not provided.
    end_pos: index of the last answer token, or None when not provided.
    is_impossible: flag stored as given (defaults to False).
  """

  def __init__(self, qas_id, question_text, doc_tokens, origin_answer_text=None, start_pos=None, end_pos=None, is_impossible=False):
    self.qas_id = qas_id
    self.question_text = question_text
    self.doc_tokens = doc_tokens
    self.origin_answer_text = origin_answer_text
    self.start_pos = start_pos
    self.end_pos = end_pos
    self.is_impossible = is_impossible

  def __str__(self):
    """Same text as `repr(self)`."""
    return self.__repr__()

  def __repr__(self):
    """Build a one-line, human-readable summary of the example."""
    # printable_text is used to obtain a printable form of the id and the question.
    s = f"qas_id: {tokenization.printable_text(self.qas_id)}"
    s += f", question_text: {tokenization.printable_text(self.question_text)}"
    s += f", doc_tokens: {[' '.join(self.doc_tokens)]}"

    # Compare against None rather than relying on truthiness: an answer that starts
    # at token 0 is a valid span and must still be shown.
    if self.start_pos is not None:
      s += f", start_pos: {self.start_pos}, end_pos: {self.end_pos}, is_impossible: {self.is_impossible}"

    return s



In [23]:
import json
import tensorflow as tf

class InputFeatures(object):
  """Plain container for the features of one document span of one example.

  The constructor stores every argument unchanged on the instance under the same
  name. `start_pos`, `end_pos` and `is_impossible` are optional and default to None.
  """

  # Was spelled `__init`, so Python never ran it as the constructor and
  # `InputFeatures(unique_id=...)` failed with a TypeError.
  def __init__(self, unique_id, example_index, doc_span_index, tokens, token_to_origin_map, token_is_max_context,
               input_ids, input_mask, segment_ids, start_pos=None, end_pos=None, is_impossible=None):
    self.unique_id = unique_id
    self.example_index = example_index
    self.doc_span_index = doc_span_index
    self.tokens = tokens
    self.token_to_origin_map = token_to_origin_map
    self.token_is_max_context = token_is_max_context
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.start_pos = start_pos
    self.end_pos = end_pos
    self.is_impossible = is_impossible

  # The function takes no `self`; declaring it a staticmethod keeps
  # `InputFeatures.read_squad_examples(...)` working and makes instance calls safe.
  @staticmethod
  def read_squad_examples(input_file, is_training):
    """Read a squad json file into a list of SquadExample.

    Args:
      input_file: path to a JSON file with a top-level "data" list; each entry has
        "paragraphs", each paragraph a "context" string and a "qas" list.
      is_training: when True every question must carry exactly one answer, which is
        converted to word indices; when False the positions are set to -1 and the
        answer text to "".

    Returns:
      A list of SquadExample, in file order. Questions of one paragraph share the
      same `doc_tokens` list object.

    Raises:
      ValueError: in training mode, if a question does not have exactly one answer.
    """

    def is_whitespace(c):
      # 0x202F is the narrow no-break space.
      return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    with open(input_file, "r", encoding="utf-8") as rf:
      articles = json.load(rf)["data"]

    examples = []
    # Walk every entry of "data" (previously only the first one was read, which
    # silently dropped the rest of a multi-article file).
    for article in articles:
      for entry in article["paragraphs"]:
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        context = entry["context"]
        for c in context:
          if is_whitespace(c):
            prev_is_whitespace = True
          else:
            if prev_is_whitespace:  # c comes after whitespace -> add as new token
              doc_tokens.append(c)
            else:
              doc_tokens[-1] += c  # c comes after c -> attach to recent token
            prev_is_whitespace = False
          char_to_word_offset.append(len(doc_tokens) - 1)  # "I went home" -> [0,0,1,1,1,1,1,2,2,2,2]

        for qa in entry["qas"]:
          qas_id = qa["id"]
          question_text = qa["question"]
          start_pos = None
          end_pos = None
          origin_answer_text = None
          is_impossible = False

          if is_training:
            if len(qa["answers"]) != 1:
              raise ValueError("Each question should have exactly 1 answer.")

            answer = qa["answers"][0]
            origin_answer_text = answer["text"]  # e.g. "Bazex syndrome"
            answer_offset = answer["answer_start"]  # character offset, e.g. 93
            answer_length = len(origin_answer_text)
            start_pos = char_to_word_offset[answer_offset]  # index of the first answer word
            end_pos = char_to_word_offset[answer_offset + answer_length - 1]  # index of the last answer word

            # Check that the answer really was extracted from the context (skip the question otherwise).
            answer_from_context = " ".join(doc_tokens[start_pos:end_pos + 1])  # answer recovered by slicing the context
            cleaned_answer = " ".join(tokenization.whitespace_tokenize(origin_answer_text))  # the given answer

            if answer_from_context.find(cleaned_answer) == -1:  # the context span does not contain the answer
              # `tf.logging` no longer exists in TensorFlow 2.x; the compat.v1 path
              # is available in both 1.x (>= 1.13) and 2.x.
              tf.compat.v1.logging.warning(f"Could not find answer from context. {answer_from_context} vs {cleaned_answer}")
              continue

          else:  # inference: no gold answer available
            start_pos = -1
            end_pos = -1
            origin_answer_text = ""

          example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens,
                                 origin_answer_text=origin_answer_text, start_pos=start_pos, end_pos=end_pos,
                                 is_impossible=is_impossible)
          examples.append(example)

    return examples


# Also expose the reader as a plain module-level function.
read_squad_examples = InputFeatures.read_squad_examples






In [21]:
def is_whitespace(c):
  """Tell whether `c` is a space, tab, CR, LF or U+202F (narrow no-break space)."""
  return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F


# Sample context used to try out the whitespace tokenisation by hand.
ex = "Psoriasiform dermatitis in a case of newly diagnosed locally advanced pyriform sinus tumour: Bazex syndrome revisited. Acrokeratosis paraneoplastica of Bazex is a rare but important paraneoplastic dermatosis, usually manifesting as psoriasiform rashes over the acral sites. It often precedes diagnosis of the associated malignancy, usually that of upper aerodigestive tract squamous cell carcinoma. We present the case of a patient with a newly diagnosed pyriform sinus tumour and associated acrokeratosis paraneoplastica. To the best of our knowledge, this is the first reported case in the local literature."

doc_tokens = []            # whitespace-delimited tokens, in order
char_to_word_offset = []   # for each character of `ex`, the index of the latest token
prev_is_whitespace = True  # start as if preceded by whitespace so the first char opens a token

for c in ex:
  if is_whitespace(c):
    prev_is_whitespace = True
  elif prev_is_whitespace:
    # First character after whitespace: open a new token.
    doc_tokens.append(c)
    prev_is_whitespace = False
  else:
    # Still inside a token: extend the most recent one.
    doc_tokens[-1] += c
    prev_is_whitespace = False
  char_to_word_offset.append(len(doc_tokens) - 1)

In [22]:
# Display the whitespace-delimited tokens collected into `doc_tokens` by the loop in the cell above.
doc_tokens

['Psoriasiform',
 'dermatitis',
 'in',
 'a',
 'case',
 'of',
 'newly',
 'diagnosed',
 'locally',
 'advanced',
 'pyriform',
 'sinus',
 'tumour:',
 'Bazex',
 'syndrome',
 'revisited.',
 'Acrokeratosis',
 'paraneoplastica',
 'of',
 'Bazex',
 'is',
 'a',
 'rare',
 'but',
 'important',
 'paraneoplastic',
 'dermatosis,',
 'usually',
 'manifesting',
 'as',
 'psoriasiform',
 'rashes',
 'over',
 'the',
 'acral',
 'sites.',
 'It',
 'often',
 'precedes',
 'diagnosis',
 'of',
 'the',
 'associated',
 'malignancy,',
 'usually',
 'that',
 'of',
 'upper',
 'aerodigestive',
 'tract',
 'squamous',
 'cell',
 'carcinoma.',
 'We',
 'present',
 'the',
 'case',
 'of',
 'a',
 'patient',
 'with',
 'a',
 'newly',
 'diagnosed',
 'pyriform',
 'sinus',
 'tumour',
 'and',
 'associated',
 'acrokeratosis',
 'paraneoplastica.',
 'To',
 'the',
 'best',
 'of',
 'our',
 'knowledge,',
 'this',
 'is',
 'the',
 'first',
 'reported',
 'case',
 'in',
 'the',
 'local',
 'literature.']