In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

2023-04-09 21:20:37.344707: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 位置编码矩阵，输入length为序列长度，depth为单个token向量的长度
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)
  
  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

In [3]:
# 位置编码层
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    # 使用全链接层对输入向量（已经编码）映射到模型的维度
    self.embedding = tf.keras.layers.Dense(d_model, activation='relu') 
    self.pos_encoding = positional_encoding(length=256, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
#     assert length <= 256, "序列长度不能超过256"
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [4]:
positional_encoding(4, 6)

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         1.        ],
       [ 0.84147096,  0.04639922,  0.00215443,  0.5403023 ,  0.998923  ,
         0.9999977 ],
       [ 0.9092974 ,  0.0926985 ,  0.00430886, -0.41614684,  0.9956942 ,
         0.9999907 ],
       [ 0.14112   ,  0.1387981 ,  0.00646326, -0.9899925 ,  0.9903207 ,
         0.99997914]], dtype=float32)>

In [5]:
# 基础注意力层，一个多头注意力层、一个正则化层，一个残差add操作
# 输入一个头数和向量维数
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [6]:
# 交叉注意力层，输入x查询向量，会在上下文key、value中查询
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores
    # 残差连接
    x = self.add([x, attn_output])
    # 层norm化
    x = self.layernorm(x)

    return x

In [7]:
cross = CrossAttention(num_heads=1, key_dim=4)
cross(tf.ones((1,3,2)), tf.ones((1,4,2)))

<tf.Tensor: shape=(1, 3, 2), dtype=float32, numpy=
array([[[ 0.99381536, -0.99381536],
        [ 0.99381536, -0.99381536],
        [ 0.99381536, -0.99381536]]], dtype=float32)>

In [8]:
# 自注意力层
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [9]:
# 带因果掩码的自注意力层
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [10]:
# 前馈层
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x

In [11]:
# 编码器
# 输入模型维数（即输入单个向量的维数，因为transformer层不改变向量维数），多头头数，前馈中间层大小，drop率
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [12]:
sample_encoder_layer = EncoderLayer(d_model=2, num_heads=8, dff=2048, dropout_rate=0.1)
pt = tf.ones((1,3,2))
print(pt.shape)
print(sample_encoder_layer(pt).shape)

(1, 3, 2)
(1, 3, 2)


In [13]:
# 编码器
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [14]:
# 解码器层
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [15]:
sample_decoder_layer = DecoderLayer(d_model=2, num_heads=8, dff=2048)

x = tf.ones((1,3,2))
context = pt = tf.ones((1,20,2))
sample_decoder_layer_output = sample_decoder_layer(
    x=x, context=context)

print(x.shape)
print(context.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

(1, 3, 2)
(1, 20, 2)
(1, 3, 2)


In [16]:
# 解码器
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [17]:
sample_decoder = Decoder(num_layers=1, d_model=2, vocab_size=3, num_heads=8, dff=2048)

x = tf.ones((1,3,2))
context = pt = tf.ones((1,20,2))
sample_decoder_output = sample_decoder(
    x=x, context=context)

print(x.shape)
print(context.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

(1, 3, 2)
(1, 20, 2)
(1, 3, 2)


In [18]:
# transformer最终结构
# output_size为最终输出的向量的维数，在文本翻译过程中是词向量的长度，但在这里是flutter组件节点向量
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    print(x.shape)
    print(context.shape)
    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

In [20]:
# 损失函数
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [21]:
# 学习率
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def get_config(self):
    config = {
    'd_model': self.d_model,
    'warmup_steps': self.warmup_steps,

     }
    return config

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [22]:
# 超参数
num_layers = 4
d_model = 126
dff = 512
num_heads = 8
dropout_rate = 0.1


transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=10,
    target_vocab_size=20,
    dropout_rate=dropout_rate)

# 优化器
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

transformer.compile(
    loss='mean_squared_error',
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [24]:
transformer.load_weights("model")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x13e632160>

In [52]:
transformer.fit((tf.ones((1,3,10)), tf.ones((1,3,20))),tf.ones((1,3,20)),
                epochs=200,)

Epoch 1/200
(None, 3, 20)
(None, 3, 126)


InvalidArgumentError: Graph execution error:

Detected at node 'gradient_tape/mean_squared_error/BroadcastGradientArgs' defined at (most recent call last):
    File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2961, in run_cell
      result = self._run_cell(
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3016, in _run_cell
      result = runner(coro)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3221, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3400, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/qv/lkzvp5v136j4x83fvc1qbgs00000gn/T/ipykernel_3223/2181739854.py", line 1, in <module>
      transformer.fit((tf.ones((1,3,10)), tf.ones((1,3,20))),tf.ones((1,3,21)),
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/engine/training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/engine/training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/engine/training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/engine/training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/engine/training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 576, in minimize
      grads_and_vars = self._compute_gradients(
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 634, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "/Users/archer/Documents/jupyter/venv_workspace/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/mean_squared_error/BroadcastGradientArgs'
Incompatible shapes: [1,3,20] vs. [1,3,21]
	 [[{{node gradient_tape/mean_squared_error/BroadcastGradientArgs}}]] [Op:__inference_train_function_27504]

In [26]:
transformer([tf.ones((1,3,10)), tf.ones((1,1,20))], training=False) - tf.ones((1,1,20))

(1, 1, 20)
(1, 3, 126)


<tf.Tensor: shape=(1, 1, 20), dtype=float32, numpy=
array([[[-0.05242896, -0.02938414,  0.05024254,  0.04878366,
          0.02481437, -0.03367698, -0.01460284,  0.11665189,
         -0.05826628, -0.01462948, -0.0827505 ,  0.00564265,
          0.0510062 , -0.0692454 ,  0.03574026,  0.01222372,
          0.05528903, -0.02998143,  0.05311131, -0.00769812]]],
      dtype=float32)>

In [27]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  2566778   
                                                                 
 decoder_1 (Decoder)         multiple                  4613774   
                                                                 
 dense_25 (Dense)            multiple                  2540      
                                                                 
Total params: 7,183,092
Trainable params: 7,183,092
Non-trainable params: 0
_________________________________________________________________


In [53]:
transformer.save_weights("model")

In [None]:
## 在nlp中，最后一层通过dense层，将模型维数扩展为词汇数，通过softmax取得最大概率的坐标，这个坐标和实际值zuo cha