In [1]:
import os
import shutil
import logging
import nltk

import sys
import json
import random
import sentencepiece as spm

import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
from tensorflow.keras.utils import Progbar
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model

import tensorflow_hub as hub
from transformers import BertTokenizer


# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]

In [2]:
def set_tokenizer(tokenizer, max_len = 768):
  def _set_tokenizer(sentence):
    return tokenizer(
        text = sentence,
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
        max_length = max_len,           # Pad & truncate all sentences.
        padding='max_length',
        return_attention_mask = True   # Construct attn. masks.
    )
  return _set_tokenizer
tokenizer = set_tokenizer(BertTokenizer.from_pretrained('bert-base-uncased'), 128)

text_test = ['this is such an amazing movie!']
hf_text_preprocessed = tokenizer(text_test)
print(f'Keys       : {list(hf_text_preprocessed.keys())}')
print(f'Word Ids   : {hf_text_preprocessed["input_ids"][0][ :12]}')
print(f'Input Mask : {hf_text_preprocessed["attention_mask"][0][ :12]}')
print(f'Type Ids   : {hf_text_preprocessed["token_type_ids"][0][ :12]}')

Keys       : ['input_ids', 'token_type_ids', 'attention_mask']
Word Ids   : [101, 2023, 2003, 2107, 2019, 6429, 3185, 999, 102, 0, 0, 0]
Input Mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
Type Ids   : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
class BertLoss:
    """
    Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
    NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
    computation.
    """

    def compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
        )
        next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100)
        next_sentence_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss
        )
        next_sentence_label = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss
        )
        masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
        next_sentence_loss = loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits)
        masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(next_sentence_loss)[0]))
        masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)

        return masked_lm_loss + next_sentence_loss


In [30]:
class EmbeddingLayer(Layer):
    
    def __init__(self, model_opt: dict, **kwargs):
        self.vocab_size = model_opt["vocab_size"]
        self.type_vocab_size = model_opt["type_vocab_size"]
        self.hidden_size = model_opt["hidden_size"]
        self.max_position_embeddings = model_opt["max_position_embeddings"]
        self.initializer_range = model_opt["initializer_range"]
        self.embeddings_sum = tf.keras.layers.Add()
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=model_opt["norm_eps"], name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(rate=model_opt["dropout_rate"])
        super(EmbeddingLayer, self).__init__(**kwargs)
        
    def build(self):
        with tf.name_scope("wordpiece"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.vocab_size, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )
        with tf.name_scope("segment"):
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.type_vocab_size, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("position"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

        super().build(input_shape)
    
    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.
        Returns:
            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
        final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings

In [None]:
class TransformerEncoderLayer(Layer):
    """
    TransformerEncoderLayer : MHSA -> residual sum, normalize -> FF
    https://github.com/huggingface/transformers/blob/21e86f99e6b91af2e4df3790ba6c781e85fa0eb5/src/transformers/models/bert/modeling_tf_bert.py#L339
    """

In [None]:
class AttentionLayer(Layer):
    """
    Multihead self attention layer
    """

In [None]:
class SelfAttention(Layer):
    """
    개별 attention layer
    """
    def __init__(self, model_opt: dict, **kwargs):
        if model_opt["hidden_size"] % model_opt["num_attention_heads"] != 0:
            raise ValueError(
                f"The hidden size ({model_opt["hidden_size"]}) is not a multiple of the number "
                f"of attention heads ({model_opt["num_attention_heads"})"
            )

        self.num_attention_heads = model_opt["num_attention_heads"]
        self.attention_head_size = int(model_opt["hidden_size"] / model_opt["num_attention_heads"])
        self.all_head_size = model_opt["num_attention_heads"] * model_opt["attention_head_size"]
        self.sqrt_att_head_size = math.sqrt(model_opt["attention_head_size"])

        self.query = tf.keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = tf.keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = tf.keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
        super(SelfAttention, self).__init__(**kwargs)

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(inputs=hidden_states)
        mixed_key_layer = self.key(inputs=hidden_states)
        mixed_value_layer = self.value(inputs=hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        # (batch size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)

        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
            attention_scores = tf.add(attention_scores, attention_mask)

        # Normalize the attention scores to probabilities.
        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(inputs=attention_probs, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        attention_output = tf.matmul(attention_probs, value_layer)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # (batch_size, seq_len_q, all_head_size)
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

        return outputs


In [16]:
model_opt = {
    "vocab_size" : 32000,
    "type_vocab_size" : 2,
    "hidden_size" : 128,
    "max_position_embeddings" : 128,
    "initializer_range" : 0.02,
    "norm_eps" : 0.001,
    "dropout_rate" : 0.1,
}

In [35]:
emb = EmbeddingLayer(model_opt)

In [None]:
hf_text_preprocessed["input_ids"]
hf_text_preprocessed["attention_mask"]
hf_text_preprocessed["token_type_ids"]

In [13]:
emb.summary()

AttributeError: 'EmbeddingLayer' object has no attribute 'summary'

In [31]:
class PretrainBertModel(Model):
    def __init__(self, model_opt, **kwargs):
        super(PretrainBertModel, self).__init__(name='Pretrain', **kwargs)
        self.embedding_layer = EmbeddingLayer(model_opt)
    
    def call(self, inputs):
        x = self.embedding_layer(inputs)
        
        return x        

In [32]:
model = PretrainBertModel(model_opt)

In [None]:
full_position_embeddings = tf.get_variable(
  name="test",
  shape=[128, 128],
  initializer=tf.keras.initializers.TruncatedNormal())

# Since the position embedding table is a learned variable, we create it
# using a (long) sequence length `max_position_embeddings`. The actual
# sequence length might be shorter than this, for faster training of
# tasks that do not have long sequences.
#
# So `full_position_embeddings` is effectively an embedding table
# for position [0, 1, 2, ..., max_position_embeddings-1], and the current
# sequence has positions [0, 1, 2, ... seq_length-1], so we can just
# perform a slice.
position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                             [seq_length, -1])
num_dims = len(output.shape.as_list())

# Only the last two dimensions are relevant (`seq_length` and `width`), so
# we broadcast among the first dimensions, which is typically just
# the batch size.
position_broadcast_shape = []
for _ in range(num_dims - 2):
position_broadcast_shape.append(1)
position_broadcast_shape.extend([seq_length, width])
position_embeddings = tf.reshape(position_embeddings,
                               position_broadcast_shape)
output += position_embeddings


TensorShape([128, 128])

TensorShape([128, 128])

In [2]:
initializer = tf.keras.initializers.TruncatedNormal()
full_position_embeddings = tf.Variable(
  initializer(shape = (128, 128)),
  name="test")
full_position_embeddings.shape
position_embeddings = tf.slice(full_position_embeddings, [0, 0], [128, -1])
position_embeddings.shape
num_dims = 3
position_broadcast_shape = []
for _ in range(num_dims - 2):
    position_broadcast_shape.append(1)
    position_broadcast_shape.extend([128, 128])
    position_embeddings = tf.reshape(position_embeddings,
                                   position_broadcast_shape)


In [3]:
position_embeddings

<tf.Tensor: shape=(1, 128, 128), dtype=float32, numpy=
array([[[ 0.01385949,  0.05264647,  0.06343699, ...,  0.09757596,
         -0.03972503,  0.05346877],
        [-0.03932196,  0.01364289,  0.03254468, ..., -0.02289535,
          0.02897022,  0.04119596],
        [ 0.0434962 ,  0.07995131,  0.01936734, ..., -0.02690749,
          0.09007739, -0.04668866],
        ...,
        [-0.01632217,  0.02246051, -0.0352938 , ...,  0.00064092,
         -0.00771515, -0.03805603],
        [-0.02785651, -0.03085334,  0.06839695, ...,  0.00056849,
         -0.0328668 , -0.01278302],
        [ 0.00249366,  0.00049937,  0.07355256, ..., -0.03969698,
         -0.09182624,  0.06338918]]], dtype=float32)>

In [4]:
class PositionEmbeddingLayer(Layer):
    max_position_embeddings  = 512
    hidden_size              = 128

    # noinspection PyUnusedLocal
    def _construct(self, **kwargs):
        super()._construct(**kwargs)
        self.embedding_table = None

    # noinspection PyAttributeOutsideInit
    def build(self, input_shape):
        # input_shape: () of seq_len
        if input_shape is not None:
            assert input_shape.ndims == 0
            self.input_spec = keras.layers.InputSpec(shape=input_shape, dtype='int32')
        else:
            self.input_spec = keras.layers.InputSpec(shape=(), dtype='int32')

        self.embedding_table = self.add_weight(name="embeddings",
                                               dtype=K.floatx(),
                                               shape=[self.params.max_position_embeddings, self.params.hidden_size],
                                               initializer=self.create_initializer())
        super(PositionEmbeddingLayer, self).build(input_shape)

    # noinspection PyUnusedLocal
    def call(self, inputs, **kwargs):
        # just return the embedding after verifying
        # that seq_len is less than max_position_embeddings
        seq_len = inputs

        assert_op = tf.compat.v2.debugging.assert_less_equal(seq_len, self.params.max_position_embeddings)

        with tf.control_dependencies([assert_op]):
            # slice to seq_len
            full_position_embeddings = tf.slice(self.embedding_table,
                                                [0, 0],
                                                [seq_len, -1])
        output = full_position_embeddings


In [7]:
a = PositionEmbeddingLayer()

In [8]:
a.trainable_weights

[]