In [1]:
import os
import shutil
import logging
import nltk
import collections
import unicodedata

import sys
import json
import random
import sentencepiece as spm

import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
from tensorflow.keras.utils import Progbar

import tensorflow_hub as hub
from transformers import BertTokenizer

# Build the console handler first: INFO and above, timestamp-prefixed lines.
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.INFO)

# Route the 'tensorflow' logger through that handler only, replacing whatever
# handlers were attached to it before.
log = logging.getLogger('tensorflow')
log.handlers = [sh]
log.setLevel(logging.INFO)

In [2]:
# Root logger: let every record from DEBUG upwards through.
logger = logging.getLogger()
logger.setLevel(level=logging.DEBUG)

In [3]:
# Seed for the `random.Random` generator that drives instance creation
# (document shuffling, sentence-pair sampling and masking).
RANDOM_SEED = 100

In [5]:
# LANG_CODE = "en" #@param {type:"string"}

# !wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.'$LANG_CODE'.gz -O dataset.txt.gz
# !gzip -d dataset.txt.gz
# !tail dataset.txt

'wget' is not recognized as an internal or external command,
operable program or batch file.
'gzip' is not recognized as an internal or external command,
operable program or batch file.
'tail' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# DEMO_MODE = True #@param {type:"boolean"}

# if DEMO_MODE:
#   CORPUS_SIZE = 1000000
# else:
#   CORPUS_SIZE = 100000000 #@param {type: "integer"}
  
# !(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
# !mv subdataset.txt dataset.txt

In [34]:
# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence, which newer Python versions warn about and plan to reject.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Lowercase a line and reduce it to space-separated word tokens.

  The input is coerced with `str()`, lowercased, round-tripped through UTF-8
  with encode errors ignored, and finally rebuilt from the matches of the
  module-level `regex_tokenizer`, which drops punctuation symbols.
  """
  lowered = str(text).lower()
  # Encode errors are ignored, so anything that cannot be encoded as UTF-8
  # is silently dropped.
  cleaned = lowered.encode("utf-8", "ignore").decode()
  # Keep only the tokenizer's matches, separated by single spaces.
  words = regex_tokenizer.tokenize(cleaned)
  return " ".join(words)

def count_lines(filename, encoding="utf-8"):
  """Return the number of lines in a text file.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8, like
      the other corpus readers in this notebook, so the result does not
      depend on the platform's locale encoding.

  Returns:
    The number of lines obtained by iterating over the file (0 when empty).
  """
  with open(filename, encoding=encoding) as fi:
    return sum(1 for _ in fi)

In [36]:
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"}

# apply normalization to the dataset
# this will take a minute or two
total_line = 5000000
# total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_line)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi:
  with open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
    for i, raw_line in enumerate(fi):
      # Stop *before* handling line number `total_line`: checking after the
      # write produced total_line + 1 lines and pushed the progress bar past
      # its target.
      if i >= total_line:
        break
      fo.write(normalize_text(raw_line) + "\n")
      bar.add(1)



In [37]:
# Prefix of the files written by the SentencePiece trainer; the vocabulary is
# later read back from "<MODEL_PREFIX>.vocab".
MODEL_PREFIX = "tokenizer"
# Size of the final BERT vocabulary, control and [UNUSED_i] tokens included.
VOC_SIZE = 32000
# Passed to the trainer as --input_sentence_size.
SUBSAMPLE_SIZE = 12800000
# Number of ids withheld from SentencePiece: it is trained with
# VOC_SIZE - NUM_PLACEHOLDERS entries.
NUM_PLACEHOLDERS = 256




In [38]:
# Assemble the SentencePiece trainer flags, then train the tokenizer model.
# The placeholder ids are subtracted from the requested vocabulary size and
# the BOS/EOS ids are both set to -1.
SPM_COMMAND = " ".join([
    f"--input={PRC_DATA_FPATH}",
    f"--model_prefix={MODEL_PREFIX}",
    f"--vocab_size={VOC_SIZE - NUM_PLACEHOLDERS}",
    f"--input_sentence_size={SUBSAMPLE_SIZE}",
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
])
spm.SentencePieceTrainer.Train(SPM_COMMAND)

In [39]:
def read_sentencepiece_vocab(filepath):
  """Load the token column of a SentencePiece `.vocab` file.

  Every line is split on tabs and only its first field is kept. The first
  entry of the file (the <unk> token) is left out of the returned list.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    tokens = [row.split("\t")[0] for row in vocab_file]
  # Drop the leading <unk> entry.
  return tokens[1:]

# Load the learnt pieces and show their count plus ten random samples.
snt_vocab = read_sentencepiece_vocab(f"{MODEL_PREFIX}.vocab")
print(f"Learnt vocab size: {len(snt_vocab)}")
print(f"Sample tokens: {random.sample(snt_vocab, 10)}")

Learnt vocab size: 31743
Sample tokens: ['▁beginner', '▁fault', '▁experiment', 'inian', '▁clumsi', 'uigg', '▁measure', 'dark', '▁southwest', '▁rapper']


In [40]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece piece to WordPiece notation.

    A piece that starts with the "▁" marker has that marker stripped; any
    other piece gets a "##" prefix.
    """
    return token[1:] if token.startswith("▁") else "##" + token

In [41]:
# Control symbols come first, followed by the converted SentencePiece pieces.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + [
    parse_sentencepiece_token(piece) for piece in snt_vocab
]

In [42]:
# Fill the vocabulary up to VOC_SIZE entries with [UNUSED_i] placeholders.
num_unused = VOC_SIZE - len(bert_vocab)
bert_vocab.extend(f"[UNUSED_{i}]" for i in range(num_unused))
print(len(bert_vocab))

32000


In [43]:
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Write one token per line. The encoding is pinned to UTF-8 (the same encoding
# the SentencePiece vocabulary was read with) so that non-ASCII tokens are
# written identically regardless of the platform's default encoding.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

# 학습 데이터 쪼개기

In [16]:
import tokenization

In [None]:
# Create the directory that will hold the shards.
!mkdir ./shards
# Split the normalized corpus into files of 256000 lines each, named
# "shard_" plus a 4-character numeric suffix (-a 4 -d).
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
# List the shard files that were produced.
!ls ./shards/

# 데이터 전처리 (create_pretraining_data)

# Shell pipeline that runs bert/create_pretraining_data.py once per shard file,
# with up to PROCESSES invocations in parallel (xargs -P).
# NOTE: this cell reads PROCESSES, PRETRAINING_DIR, DO_LOWER_CASE,
# MAX_PREDICTIONS, MAX_SEQ_LENGTH and MASKED_LM_PROB, so the cell defining
# those parameters has to be run first.
XARGS_CMD = ("ls ./shards/ | "
             "xargs -n 1 -P {} -I{} "
             "python3 bert/create_pretraining_data.py "
             "--input_file=./shards/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=34 "
             "--dupe_factor=5")

# The literal '{}' arguments survive formatting so that xargs (-I{}) can
# substitute the shard file name when the command runs.
XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

# `tf.gfile.MkDir` is the TF1 spelling and is not available in TF2, which the
# rest of this notebook uses (`tf.io.gfile`). `makedirs` also succeeds when the
# directory already exists, so the cell can be re-run safely.
tf.io.gfile.makedirs(PRETRAINING_DIR)


In [14]:
# Maximum number of tokens per example, [CLS]/[SEP] included.
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
# Fraction of tokens selected for the masked-LM objective.
MASKED_LM_PROB = 0.15 #@param
# Upper bound on masked positions per example.
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
# Number of parallel processes used by the xargs preprocessing command.
PROCESSES = 2 #@param {type:"integer"}
# Directory the TFRecord files are written to.
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}
# How many times the corpus is re-sampled into instances.
DUPE_FACTOR = 10
# Probability of targeting a shorter-than-maximum sequence.
SHORT_SEQ_PROB = 0.1
# Misspelled name kept as an alias so existing references keep working.
SHROT_SEQ_PROB = SHORT_SEQ_PROB

In [17]:
# Use the shared DO_LOWER_CASE setting rather than a second hard-coded `True`,
# so the tokenizer cannot drift from the rest of the preprocessing config.
tokenizer = tokenization.FullTokenizer(
    vocab_file=VOC_FNAME, do_lower_case=DO_LOWER_CASE)

In [18]:
# Collect every file found in the shard directory.
input_pattern = "./shards/*"
input_files = list(tf.io.gfile.glob(input_pattern))

In [19]:
# Log the list of shard files that will be read. The header line goes through
# the root logger, the individual paths through TensorFlow's logging wrapper.
logging.info("*** Reading from input files ***")
for input_file in input_files:
    tf.compat.v1.logging.info("  %s", input_file)


INFO:root:*** Reading from input files ***
2021-03-24 06:41:33,224 :    .\shards\shard_0000
INFO:tensorflow:  .\shards\shard_0000
2021-03-24 06:41:33,225 :    .\shards\shard_0001
INFO:tensorflow:  .\shards\shard_0001
2021-03-24 06:41:33,226 :    .\shards\shard_0002
INFO:tensorflow:  .\shards\shard_0002
2021-03-24 06:41:33,226 :    .\shards\shard_0003
INFO:tensorflow:  .\shards\shard_0003
2021-03-24 06:41:33,227 :    .\shards\shard_0004
INFO:tensorflow:  .\shards\shard_0004
2021-03-24 06:41:33,228 :    .\shards\shard_0005
INFO:tensorflow:  .\shards\shard_0005
2021-03-24 06:41:33,229 :    .\shards\shard_0006
INFO:tensorflow:  .\shards\shard_0006
2021-03-24 06:41:33,231 :    .\shards\shard_0007
INFO:tensorflow:  .\shards\shard_0007
2021-03-24 06:41:33,232 :    .\shards\shard_0008
INFO:tensorflow:  .\shards\shard_0008
2021-03-24 06:41:33,233 :    .\shards\shard_0009
INFO:tensorflow:  .\shards\shard_0009
2021-03-24 06:41:33,234 :    .\shards\shard_0010
INFO:tensorflow:  .\shards\shard_0010


In [21]:
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Build shuffled `TrainingInstance`s from line-oriented text files.

  Expected input format:
    * One sentence per line. These should ideally be real sentences rather
      than paragraphs or arbitrary spans, because sentence boundaries drive
      the "next sentence prediction" task.
    * A blank line between documents, so that next-sentence pairs never span
      two documents.

  Every file is tokenized line by line into a list of documents; empty
  documents are dropped and the remainder is shuffled with `rng`. Instances
  are then generated `dupe_factor` times over all documents and shuffled once
  more before being returned.
  """
  documents = [[]]

  for path in input_files:
    with tf.io.gfile.GFile(path, "r") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          # Nothing left to read in this file.
          break
        line = line.strip()

        if not line:
          # A blank line closes the current document.
          documents.append([])
        pieces = tokenizer.tokenize(line)
        if pieces:
          documents[-1].append(pieces)

  # Discard documents that ended up without any sentence.
  documents = [doc for doc in documents if doc]
  rng.shuffle(documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = [
      instance
      for _ in range(dupe_factor)
      for document_index in range(len(documents))
      for instance in create_instances_from_document(
          documents, document_index, max_seq_length, short_seq_prob,
          masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
  ]

  rng.shuffle(instances)
  return instances

def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
  """Creates `TrainingInstance`s for a single document.

  Consecutive sentences of the document are accumulated into a chunk until a
  target length is reached (or the document ends). Each chunk is split into a
  segment A and a segment B; B is either the real continuation or a span
  taken from a randomly drawn document, and the pair is then truncated,
  wrapped in [CLS]/[SEP] and masked.

  Args:
    all_documents: List of documents, each a list of sentences, each a list
      of tokens.
    document_index: Index into `all_documents` of the document to process.
    max_seq_length: Hard limit on the instance length, special tokens
      included.
    short_seq_prob: Probability of aiming for a random, shorter target length.
    masked_lm_prob: Forwarded to `create_masked_lm_predictions`.
    max_predictions_per_seq: Forwarded to `create_masked_lm_predictions`.
    vocab_words: Forwarded to `create_masked_lm_predictions`.
    rng: Random generator providing `random()` and `randint()`.

  Returns:
    A list of `TrainingInstance` objects.
  """
  document = all_documents[document_index]

  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

  # We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

  # We DON'T just concatenate all of the tokens from a document into a long
  # sequence and choose an arbitrary split point because this would make the
  # next sentence prediction task too easy. Instead, we split the input into
  # segments "A" and "B" based on the actual "sentences" provided by the user
  # input.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    # Emit an instance once the chunk is long enough or the document ends.
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is how many segments from `current_chunk` go into the `A`
        # (first) sentence.
        a_end = 1
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)

        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])

        tokens_b = []
        # Random next
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.
          # NOTE(review): if `all_documents` holds a single document, every
          # draw equals `document_index`, so the "random" segment comes from
          # the same document.
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            if random_document_index != document_index:
              break

          random_document = all_documents[random_document_index]
          random_start = rng.randint(0, len(random_document) - 1)
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Assemble "[CLS] A [SEP] B [SEP]"; segment id 0 covers [CLS], A and
        # the first [SEP], segment id 1 covers B and the final [SEP].
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        (tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      # Start a fresh chunk for the next instance.
      current_chunk = []
      current_length = 0
    i += 1

  return instances

MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng,
                                 do_whole_word_mask=False):
  """Creates the predictions for the masked LM objective.

  Args:
    tokens: Token list including the "[CLS]" / "[SEP]" markers, which are
      never selected for masking. The list itself is not modified.
    masked_lm_prob: Fraction of `len(tokens)` to select (at least one token).
    max_predictions_per_seq: Upper bound on the number of selected tokens.
    vocab_words: Words to draw random replacements from.
    rng: Random generator providing `shuffle()`, `random()` and `randint()`.
    do_whole_word_mask: When true, a token starting with "##" is grouped with
      the preceding candidate so that all pieces of a word are selected
      together. Defaults to False, which treats every token independently.

  Returns:
    A tuple `(output_tokens, masked_lm_positions, masked_lm_labels)`: the
    token list after replacement, the selected positions in ascending order,
    and the original tokens at those positions.
  """

  cand_indexes = []
  for (i, token) in enumerate(tokens):
    if token == "[CLS]" or token == "[SEP]":
      continue
    # Whole Word Masking means that if we mask all of the wordpieces
    # corresponding to an original word. When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequence
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    #
    # Note that Whole Word Masking does *not* change the training code
    # at all -- we still predict each WordPiece independently, softmaxed
    # over the entire vocabulary.
    if (do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])

  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # If adding a whole-word mask would exceed the maximum number of
    # predictions, then just skip this candidate.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    # Skip candidates that overlap a position which is already selected.
    if any(index in covered_indexes for index in index_set):
      continue
    for index in index_set:
      covered_indexes.add(index)

      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      # 10% of the time, keep original
      elif rng.random() < 0.5:
        masked_token = tokens[index]
      # 10% of the time, replace with random word
      else:
        masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  assert len(masked_lms) <= num_to_predict
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = [p.index for p in masked_lms]
  masked_lm_labels = [p.label for p in masked_lms]

  return (output_tokens, masked_lm_positions, masked_lm_labels)


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Trim two token lists in place until their combined length fits.

  One token at a time is removed from whichever list is currently longer
  (`tokens_b` on a tie) until `len(tokens_a) + len(tokens_b)` is at most
  `max_num_tokens`. Nothing is returned; both lists are modified directly.
  """
  while len(tokens_a) + len(tokens_b) > max_num_tokens:
    longer = tokens_b if len(tokens_a) <= len(tokens_b) else tokens_a
    assert len(longer) >= 1

    # Remove from the front about half of the time and from the back
    # otherwise, which adds randomness and avoids a positional bias.
    if rng.random() >= 0.5:
      longer.pop()
    else:
      del longer[0]

class TrainingInstance(object):
  """One pre-training example: a masked sentence pair and its labels."""

  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
               is_random_next):
    # Token sequence and the per-token segment ids.
    self.tokens = tokens
    self.segment_ids = segment_ids
    # Masked-LM targets: positions and the tokens stored as their labels.
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels
    # Flag recorded by the instance builder for the next-sentence label.
    self.is_random_next = is_random_next

  def __str__(self):
    printable = tokenization.printable_text
    parts = [
        "tokens: %s\n" % (" ".join(printable(tok) for tok in self.tokens)),
        "segment_ids: %s\n" % (" ".join(str(sid) for sid in self.segment_ids)),
        "is_random_next: %s\n" % self.is_random_next,
        "masked_lm_positions: %s\n" % (
            " ".join(str(pos) for pos in self.masked_lm_positions)),
        "masked_lm_labels: %s\n" % (
            " ".join(printable(lab) for lab in self.masked_lm_labels)),
        "\n",
    ]
    return "".join(parts)

  def __repr__(self):
    return self.__str__()

In [22]:
# Seeded generator so that instance creation is reproducible.
rng = random.Random(RANDOM_SEED)
instances = create_training_instances(
    input_files=input_files,
    tokenizer=tokenizer,
    max_seq_length=MAX_SEQ_LENGTH,
    dupe_factor=DUPE_FACTOR,
    short_seq_prob=SHROT_SEQ_PROB,
    masked_lm_prob=MASKED_LM_PROB,
    max_predictions_per_seq=MAX_PREDICTIONS,
    rng=rng)


In [24]:
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Create TF example files from `TrainingInstance`s.

  Args:
    instances: Iterable of `TrainingInstance` objects to serialize.
    tokenizer: Object used through `convert_tokens_to_ids` to turn tokens and
      masked-LM labels into ids.
    max_seq_length: Length the id, mask and segment lists are zero-padded to.
    max_predictions_per_seq: Length the masked-LM lists are zero-padded to.
    output_files: Paths of the TFRecord files; examples are spread over them
      in round-robin order.

  The first 20 examples are also logged for inspection. All writers are
  closed even when serialization fails part-way.
  """
  writers = []
  total_written = 0
  try:
    for output_file in output_files:
      writers.append(tf.io.TFRecordWriter(output_file))

    writer_index = 0

    for (inst_index, instance) in enumerate(instances):
      input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
      input_mask = [1] * len(input_ids)
      segment_ids = list(instance.segment_ids)
      assert len(input_ids) <= max_seq_length

      # Zero-pad up to max_seq_length; the mask marks the padding with 0.
      num_padding = max_seq_length - len(input_ids)
      input_ids.extend([0] * num_padding)
      input_mask.extend([0] * num_padding)
      segment_ids.extend([0] * num_padding)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      masked_lm_positions = list(instance.masked_lm_positions)
      masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
      masked_lm_weights = [1.0] * len(masked_lm_ids)

      # Pad the prediction slots; padded slots carry weight 0.0.
      num_pred_padding = max_predictions_per_seq - len(masked_lm_positions)
      masked_lm_positions.extend([0] * num_pred_padding)
      masked_lm_ids.extend([0] * num_pred_padding)
      masked_lm_weights.extend([0.0] * num_pred_padding)

      next_sentence_label = 1 if instance.is_random_next else 0

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(input_ids)
      features["input_mask"] = create_int_feature(input_mask)
      features["segment_ids"] = create_int_feature(segment_ids)
      features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
      features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
      features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
      features["next_sentence_labels"] = create_int_feature([next_sentence_label])

      tf_example = tf.train.Example(features=tf.train.Features(feature=features))

      # Round-robin over the output files.
      writers[writer_index].write(tf_example.SerializeToString())
      writer_index = (writer_index + 1) % len(writers)

      total_written += 1

      if inst_index < 20:
        logging.info("*** Example ***")
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in instance.tokens]))

        for feature_name in features.keys():
          feature = features[feature_name]
          values = []
          if feature.int64_list.value:
            values = feature.int64_list.value
          elif feature.float_list.value:
            values = feature.float_list.value
          logging.info(
              "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
  finally:
    # Release every writer that was opened, including when an assertion or a
    # write above raised; otherwise the open file handles would leak.
    for writer in writers:
      writer.close()

  logging.info("Wrote %d total instances", total_written)

def create_int_feature(values):
  """Wrap an iterable of integers in an int64-list `tf.train.Feature`."""
  int64_list = tf.train.Int64List(value=list(values))
  return tf.train.Feature(int64_list=int64_list)


def create_float_feature(values):
  """Wrap an iterable of floats in a float-list `tf.train.Feature`."""
  float_list = tf.train.FloatList(value=list(values))
  return tf.train.Feature(float_list=float_list)

In [26]:
TRAIN_FILE_NUMS = 48 # 6 * 8
VAL_FILE_NUMS = 6

# Training files first, then validation files; the writer spreads the
# instances over all of them in round-robin order.
output_files = (
    [f"{PRETRAINING_DIR}/train_{i:03d}.tfrecord" for i in range(TRAIN_FILE_NUMS)]
    + [f"{PRETRAINING_DIR}/validate_{i:03d}.tfrecord" for i in range(VAL_FILE_NUMS)]
)
logging.info("*** Writing to output files ***")
for output_file in output_files:
    logging.info("  %s", output_file)
write_instance_to_example_files(
    instances=instances,
    tokenizer=tokenizer,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    output_files=output_files)


INFO:root:*** Writing to output files ***
INFO:root:  pretraining_data/train_000.tfrecord
INFO:root:  pretraining_data/train_001.tfrecord
INFO:root:  pretraining_data/train_002.tfrecord
INFO:root:  pretraining_data/train_003.tfrecord
INFO:root:  pretraining_data/train_004.tfrecord
INFO:root:  pretraining_data/train_005.tfrecord
INFO:root:  pretraining_data/train_006.tfrecord
INFO:root:  pretraining_data/train_007.tfrecord
INFO:root:  pretraining_data/train_008.tfrecord
INFO:root:  pretraining_data/train_009.tfrecord
INFO:root:  pretraining_data/train_010.tfrecord
INFO:root:  pretraining_data/train_011.tfrecord
INFO:root:  pretraining_data/train_012.tfrecord
INFO:root:  pretraining_data/train_013.tfrecord
INFO:root:  pretraining_data/train_014.tfrecord
INFO:root:  pretraining_data/train_015.tfrecord
INFO:root:  pretraining_data/train_016.tfrecord
INFO:root:  pretraining_data/train_017.tfrecord
INFO:root:  pretraining_data/train_018.tfrecord
INFO:root:  pretraining_data/train_019.tfrecor

INFO:root:masked_lm_positions: 5 11 17 27 32 76 79 80 84 89 91 96 102 105 108 111 112 115 0 0
INFO:root:masked_lm_ids: 51 17 9 33 31 231 109 337 1180 20 22 5 249 13 1894 626 30 24386 0 0
INFO:root:masked_lm_weights: 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0
INFO:root:next_sentence_labels: 1
INFO:root:*** Example ***
INFO:root:tokens: [CLS] now we have a quo ##rum what happened there are factor pta of which [MASK] was not aware when [MASK] had you set them up wait you set them [MASK] ver [MASK] ##a wanted to go on [MASK] date [MASK] summer s dad [SEP] i don t know 2 000 unless of course you d prefer the fellatio ##n [MASK] ##fish sick twist ##ed low life [MASK] sucking pig at least [MASK] ll have someone to talk to what [MASK] s him the [MASK] one let s go i ll wait here but he ll see me you don t know what he s [MASK] chicken lndistinct god this is fun nobody touche ##s these [MASK] me bar towel hello captain tyler metal mammoth mini ##ng [SEP]
INF

INFO:root:input_ids: 2 79 8 3165 11 4 10 344 4 3 13 5 197 169 4 28 308 324 4 30 4 38 27273 2824 21 22 8 19 7 86 19 7 2135 1128 4 4 5 11 8 60 88 5 270 9 77 18 870 105 6 1057 5 11 8 75 9 77 5 5 91 163 11 8 75 9 77 5 5 120 437 5 219 28 31189 115 105 4223 247 67 37 4 20 31 147 75 2888 4 4 4 91 323 74 4 135 323 6 112 135 4 17 275 47 17 30 5 133 74 11 4 209 45 7 2770 4 7 7042 89 1306 28 3262 17 275 105 194 833 275 14 8 3
INFO:root:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:root:segment_ids: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:root:masked_lm_posi

INFO:root:masked_lm_weights: 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0
INFO:root:next_sentence_labels: 1
INFO:root:*** Example ***
INFO:root:tokens: [CLS] or have such [MASK] mistress ##es gai ##us i think i understand my [MASK] people then perhaps caesar will be so good as to teach us out of his own extensive experience i call it love i am [MASK] [MASK] the people [MASK] my [MASK] i shall hold them [MASK] my bosom and [MASK] them tight [MASK] [SEP] hey hey hey [SEP]
INFO:root:input_ids: 2 103 33 380 4 3484 216 5166 461 6 76 6 240 28 4 142 107 699 2232 82 34 44 75 99 9 1069 94 59 16 83 248 9201 1390 6 154 11 131 6 135 4 4 7 142 4 28 4 6 562 287 104 4 28 8801 15 4 104 1229 4 3 105 105 105 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:root:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

INFO:root:input_ids: 2 8071 4 99 7 236 4 11 8 11 114 34 53 10 22258 54 1796 4 11 707 97 9809 3043 11 2559 12 1636 4 17 737 93 11 707 7 57 492 74 6 8171 13 4 14 1626 45 20 237 3887 1821 59 1752 17 5501 155 4 7 186 121 4 4 18244 59 256 2359 12 110 34 10681 9 7 4 13675 12 15 107 10380 104 177 27 4143 2032 1680 4 14 799 71 4 6 76 44 4 75 75 17 60 4 4 468 82 11 907 7 9358 44 20 40 7174 10 4 89 1680 86 9 3 52 37 18278 54 5 37 4 1143 4 5 37 42 17569 115 3
INFO:root:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:root:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 

In [27]:
import json
import imp
import os
import numpy as np
import tensorflow as tf

from model import models, data

In [28]:
# Load the model/training options. The encoding is pinned to UTF-8 so the
# JSON file is decoded the same way on every platform.
with open("base_model.json", encoding="utf-8") as fin:
    options = json.load(fin)
    
# https://baekyeongmin.github.io/dev/tpu-recipe-2/

def select_strategy(config) -> "tf.distribute.Strategy":
    """
    Choose a distribution strategy based on the configuration.

    :param config: Training / inference config; a mapping whose "device"
        entry is either "GPU" or "TPU".
    :returns: tf.distribute.Strategy (mirrored for several GPUs, one-device
        for a single GPU, TPU strategy for "TPU")
    :raises RuntimeError: if "GPU" is requested but no GPU is visible.
    :raises ValueError: if the device is neither "GPU" nor "TPU".
    """
    device = config["device"]
    if device == "GPU":
        devices = tf.config.list_physical_devices("GPU")
        if len(devices) == 0:
            raise RuntimeError("GPU를 찾지 못했습니다. 혹시 CUDA_VISIBLE_DEVICES를 제대로 설정하셨나요?")
        if len(devices) > 1:
            strategy = tf.distribute.MirroredStrategy()
        else:
            strategy = tf.distribute.OneDeviceStrategy("/gpu:0")
    elif device == "TPU":
        # The TPU address is taken from the TPU_NAME environment variable.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=os.environ["TPU_NAME"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)
    else:
        # `config` is a dict, so the device has to be read by key; attribute
        # access here raised AttributeError instead of this ValueError.
        raise ValueError(f"{device}는 지원되지 않는 기기입니다.")
    return strategy

In [29]:
# Device selection handed to select_strategy, which accepts "GPU" or "TPU".
config = {
    "device": "GPU"
}
strategy = select_strategy(config)

In [30]:
# NOTE(review): 128 and 20 presumably mirror MAX_SEQ_LENGTH and
# MAX_PREDICTIONS; the meaning of 32 and 4 is defined by
# data.build_interleaved_tfrecord_dataset -- confirm against that helper.

# Training split: every file in the pretraining directory matching "train*".
data_pattern = f"./{options['PRETRAINING_DIR']}/train*"
input_files = list(tf.io.gfile.glob(data_pattern))
print(input_files)
train_dataset = data.build_interleaved_tfrecord_dataset(input_files, 128, 20, 32, 4)
train_dist_dataset  = strategy.experimental_distribute_dataset(train_dataset)

# Validation split: every file matching "val*".
data_pattern = f"./{options['PRETRAINING_DIR']}/val*"
input_files = list(tf.io.gfile.glob(data_pattern))
print(input_files)
val_dataset = data.build_interleaved_tfrecord_dataset(input_files, 128, 20, 32, 4)
val_dist_dataset  = strategy.experimental_distribute_dataset(val_dataset)

['.\\pretraining_data\\train_000.tfrecord', '.\\pretraining_data\\train_001.tfrecord', '.\\pretraining_data\\train_002.tfrecord', '.\\pretraining_data\\train_003.tfrecord', '.\\pretraining_data\\train_004.tfrecord', '.\\pretraining_data\\train_005.tfrecord', '.\\pretraining_data\\train_006.tfrecord', '.\\pretraining_data\\train_007.tfrecord', '.\\pretraining_data\\train_008.tfrecord', '.\\pretraining_data\\train_009.tfrecord', '.\\pretraining_data\\train_010.tfrecord', '.\\pretraining_data\\train_011.tfrecord', '.\\pretraining_data\\train_012.tfrecord', '.\\pretraining_data\\train_013.tfrecord', '.\\pretraining_data\\train_014.tfrecord', '.\\pretraining_data\\train_015.tfrecord', '.\\pretraining_data\\train_016.tfrecord', '.\\pretraining_data\\train_017.tfrecord', '.\\pretraining_data\\train_018.tfrecord', '.\\pretraining_data\\train_019.tfrecord', '.\\pretraining_data\\train_020.tfrecord', '.\\pretraining_data\\train_021.tfrecord', '.\\pretraining_data\\train_022.tfrecord', '.\\pretra

In [31]:
# Build and compile the model inside the strategy scope.
with strategy.scope():
    model = models.BertPretrainModel(options, tf.keras.activations.gelu)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizer)

In [32]:
# Checkpoint file name template handed to the ModelCheckpoint callback; the
# {epoch} and {val_loss} fields are placeholders for the callback to fill in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}_{val_loss:0.3f}.h5")

In [33]:
# TensorBoard logging plus weights-only checkpoints that monitor val_loss.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
        monitor="val_loss"
    ),
]

# NOTE(review): the recorded run of this cell failed with
# "'bert_pretrain_model/nsp/pooling layer/' is not a valid scope name", raised
# from model/models.py -- the layer name used there contains a space.
model.fit(
    train_dist_dataset, 
    epochs=1, 
    callbacks=callbacks, 
    steps_per_epoch=300, 
    validation_data=val_dist_dataset,
    validation_steps=100
)

ValueError: in user code:

    C:\Users\seokjong\anaconda3\envs\tf_env\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\seokjong\dev\MyBert\model\models.py:106 call  *
        loss_nsp = self.nsp(
    C:\Users\seokjong\dev\MyBert\model\models.py:68 call  *
        pooled_output = self.pooled(first_token_tensor)
    C:\Users\seokjong\anaconda3\envs\tf_env\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1006 __call__  **
        with ops.name_scope_v2(name_scope):
    C:\Users\seokjong\anaconda3\envs\tf_env\lib\site-packages\tensorflow\python\framework\ops.py:6650 __enter__
        scope_name = scope.__enter__()
    C:\Users\seokjong\anaconda3\envs\tf_env\lib\contextlib.py:113 __enter__
        return next(self.gen)
    C:\Users\seokjong\anaconda3\envs\tf_env\lib\site-packages\tensorflow\python\framework\ops.py:4241 name_scope
        raise ValueError("'%s' is not a valid scope name" % name)

    ValueError: 'bert_pretrain_model/nsp/pooling layer/' is not a valid scope name
