In [1]:
import tensorflow as tf
import json

In [2]:
options = {
    "MAX_SEQ_LENGTH": 128,
    "MASKED_LM_PROB": 0.15,
    "MAX_PREDICTIONS": 20,
    "DO_LOWER_CASE": True,
    "PROCESSES": 2,
    "INPUT_DIR": "shards",
    "PRETRAINING_DIR": "pretraining_data",
    "DUPE_FACTOR": 5,
    "SHORT_SEQ_PROB": 0.1,
    "BATCH_SIZE" : 32,
    "VOCAB_SIZE" : 32000,
    "VOCAB_TOKEN" : 2,
    "HIDDEN_SIZE" : 128,
    "MAX_POS_EMBED" : 128,
    "NORM_EPS" : 0.001,
    "DROPOUT_RATE" : 0.1,
    "INIT_RANGE" : 0.02,
    "ATT_HEAD" : 2,
    "ATT_HEAD_SIZE" : 0.02,
    "INIT_RANGE" : 0.02,
}

In [3]:
with open("base_model.json", "w") as fout:
    json.dump(options, fout, indent=4)

In [4]:
def input_fn_builder(input_files,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4):
  def input_fn(params):
    """The actual input function."""
    batch_size = params["BATCH_SIZE"]

    name_to_features = {
        "input_ids":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels":
            tf.io.FixedLenFeature([1], tf.int64),
    }

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
      d = d.repeat()
      d = d.shuffle(buffer_size=len(input_files))
      # `cycle_length` is the number of parallel files that get read.
      cycle_length = min(num_cpu_threads, len(input_files))
      d = d.interleave(
              tf.data.TFRecordDataset,
              cycle_length=cycle_length
      )
      d = d.shuffle(buffer_size=100)
    else:
      d = tf.data.TFRecordDataset(input_files)
      # Since we evaluate for a fixed number of steps we don't want to encounter
      # out-of-range exceptions.
      d = d.repeat()

    # We must `drop_remainder` on training because the TPU requires fixed
    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
    # and we *don't* want to drop the remainder, otherwise we wont cover
    # every sample.
    d = d.map(lambda record: _decode_record(record, name_to_features),
            num_parallel_calls=num_cpu_threads)
    d = d.batch(batch_size=batch_size, drop_remainder=True)
    return d

  return input_fn


def _decode_record(record, name_to_features):
  """Decodes a record to a TensorFlow example."""
  example = tf.io.parse_single_example(record, name_to_features)

  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
  # So cast all int64 to int32.
  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      t = tf.cast(t, tf.int32)
    example[name] = t

  return example


In [5]:
data_pattern = f"./{options['PRETRAINING_DIR']}/*"
input_files = []
input_files.extend(tf.io.gfile.glob(data_pattern))
builder = input_fn_builder(input_files, 128, 20, True, 4)
params = options
data = builder(params)

In [6]:
for i in data:
    d = i
    break

In [7]:
d

{'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
 array([[    2,    26,    11, ..., 19171,  3224,     3],
        [    2,  1032,  1670, ...,     0,     0,     0],
        [    2,    11,     4, ...,     4,  1838,     3],
        ...,
        [    2,    45,    20, ...,     0,     0,     0],
        [    2,    65,   546, ...,     0,     0,     0],
        [    2,    28,   693, ...,     0,     0,     0]])>,
 'input_mask': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>,
 'masked_lm_ids': <tf.Tensor: shape=(32, 20), dtype=int32, numpy=
 array([[   12,     5,  2459,   103,   250,   218, 17135,    12,  1553,
            12,   103,   517,  3492,  2620,   983, 12622,    53,    16,
             7,     0],
        [  100,   783,     0,     0,     0,     0,     0,     0,   

In [6]:
d.keys()

dict_keys(['input_ids', 'input_mask', 'masked_lm_ids', 'masked_lm_positions', 'masked_lm_weights', 'next_sentence_labels', 'segment_ids'])

In [16]:
for k, v in d.items():
    print(f"{k}: {v[3]}")

input_ids: [    2    11     9   561    11     9    37   561     4     9   124  7059
   184   561    11     9   346   146    37   561    63    42    18     4
    11     9   239    16   962     4     4    38    27     7  1470     4
     4   277   469    88     4   190   376  3837     3     6   890     4
   812     8  1873     4  2233   596   168   634    22  6660    12     7
  2051    58    14     4     4   173    14   105     6   890    48  1964
     5    33   493     8    34    18    10   114   238  3736    45     5
   104    18    27    11    41 11966  2306     5  1097    17   137     6
    92 27167    30     6    25    10  6157    37    10  1081    45     6
   849     4    95   191   278 25091   121   423  2481    10   203    80
   109     7  3364     8   923    10   924     3]
input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [18]:
for k, v in d.items():
    print(f"{k}: {v[31]}")

input_ids: [    2     6    48     4    26  2131    12     3    22     9    71   660
 21465    12     3     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [23]:
d["input_mask"]

<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [21]:
def create_padding_mask(x):
  mask = tf.cast(tf.math.equal(x, 0), tf.float32)
  # (batch_size, 1, 1, key의 문장 길이)
  return mask[:, tf.newaxis, tf.newaxis, :]

In [40]:
extended_attention_mask = tf.reshape(d["input_mask"], (input_shape[0], 1, 1, input_shape[1]))
extended_attention_mask = tf.cast(tf.equal(extended_attention_mask, 0), tf.float32)
extended_attention_mask = tf.multiply(extended_attention_mask, tf.constant(-10000.0, dtype=tf.float32))

<tf.Tensor: shape=(32, 1, 1, 128), dtype=float32, numpy=
array([[[[    -0.,     -0.,     -0., ...,     -0.,     -0.,     -0.]]],


       [[[    -0.,     -0.,     -0., ...,     -0.,     -0.,     -0.]]],


       [[[    -0.,     -0.,     -0., ...,     -0.,     -0.,     -0.]]],


       ...,


       [[[    -0.,     -0.,     -0., ...,     -0.,     -0.,     -0.]]],


       [[[    -0.,     -0.,     -0., ..., -10000., -10000., -10000.]]],


       [[[    -0.,     -0.,     -0., ..., -10000., -10000., -10000.]]]],
      dtype=float32)>

In [32]:
input_shape = d["input_mask"].shape
one_cst = tf.constant(1.0, dtype=tf.float32)
ten_thousand_cst = tf.constant(-10000.0, dtype=tf.float32)
extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)

InvalidArgumentError: cannot compute Sub as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:Sub]

In [29]:
extended_attention_mask

<tf.Tensor: shape=(32, 1, 1, 128), dtype=int32, numpy=
array([[[[1, 1, 1, ..., 1, 1, 1]]],


       [[[1, 1, 1, ..., 1, 1, 1]]],


       [[[1, 1, 1, ..., 1, 1, 1]]],


       ...,


       [[[1, 1, 1, ..., 1, 1, 1]]],


       [[[1, 1, 1, ..., 0, 0, 0]]],


       [[[1, 1, 1, ..., 0, 0, 0]]]])>

In [38]:
import imp

In [45]:
from model import embedding
imp.reload(embedding)

<module 'model.embedding' from 'C:\\Users\\seokjong\\dev\\MyBert\\model\\embedding.py'>

In [46]:
layer = embedding.EmbeddingLayer(options)

In [47]:
layer(d['input_ids'])

<tf.Tensor: shape=(32, 128, 128), dtype=float32, numpy=
array([[[-0.48712355, -0.02978358,  0.17945506, ...,  0.8133659 ,
         -1.3185583 , -0.34859776],
        [-0.38272315, -0.3418512 ,  0.18923788, ..., -0.49261922,
         -0.69630814, -0.0270364 ],
        [-0.35029596, -0.6971725 ,  0.20470506, ..., -0.08726028,
         -1.5033354 ,  0.22702336],
        ...,
        [-0.18563168, -0.60740757, -0.5342733 , ..., -0.9799013 ,
         -1.1144719 ,  0.2728641 ],
        [-0.10508169,  0.16639659, -0.87073743, ..., -0.78892237,
         -1.112452  ,  0.2574428 ],
        [-0.46620467,  0.02814792, -0.04137228, ..., -0.40720013,
         -1.4783251 , -0.05667265]],

       [[-0.48712355, -0.02978358,  0.17945506, ...,  0.8133659 ,
         -1.3185583 , -0.34859776],
        [-0.54493445, -0.49895576,  0.05773858, ...,  0.16728738,
         -0.65363604, -1.0445409 ],
        [-0.16164392, -0.5100623 , -0.21532449, ...,  0.41354927,
         -0.5289586 , -0.6852402 ],
        ...

<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[   2,    5,  684, ...,    0,    0,    0],
       [   2, 1191,    9, ...,    5,   86,    3],
       [   2,    6,   32, ...,   10,  498,    3],
       ...,
       [   2,    5,  159, ...,    0,    0,    0],
       [   2,    6,    4, ...,  183, 1221,    3],
       [   2,   53,  276, ...,    5,   40,    3]])>