In [1]:
import tensorflow as tf 
from tensorflow import keras
import json
import os
import sys

In [2]:
from get_config import get_config

In [3]:
# Load the hyper-parameter dictionary from the project's config helper.
# (The previous `gConf = {}` was a dead store: it was overwritten on the very
# next line, so it has been dropped.)
gConf = get_config()

# Unpack the settings used by the rest of the notebook into module-level names.
vocab_inp_size = gConf["vocab_inp_size"]
vocab_tar_size = gConf["vocab_tar_size"]
embedding_dim = gConf["embedding_dim"]
vocab_inp_path = gConf["vocab_inp_path"]
vocab_tar_path = gConf["vocab_tar_path"]
units = gConf["layer_size"]
BATCH_SIZE = gConf["batch_size"]

# Both length limits are read from the same "max_length" config entry.
max_length_inp = gConf["max_length"]
max_length_tar = gConf["max_length"]
model_path = gConf["model_data"]

E:\GitHub\ML\nlp\聊天机器人\tf2.6_prj/seq2seq.ini


In [11]:
# Symbolic encoder input: int32 tensor with 20 entries per example and the
# batch dimension fixed to BATCH_SIZE.
# `shape` is documented as a tuple; `(20)` is just the int 20, so the 1-tuple
# is spelled `(20,)` here.
# NOTE(review): 20 is hard-coded; presumably it should track max_length_inp — confirm.
encode_inputs = keras.Input(shape=(20,), batch_size=BATCH_SIZE, dtype=tf.int32, name="encode_inps")

In [12]:
# Bare expression: lets the notebook display the symbolic encoder input tensor.
encode_inputs

<KerasTensor: shape=(64, 20) dtype=int32 (created by layer 'encode_inps')>

In [13]:
# Symbolic decoder input: int32 tensor with a single entry per example and
# the batch dimension fixed to BATCH_SIZE.
# `shape` is documented as a tuple; `(1)` is just the int 1, so the 1-tuple
# is spelled `(1,)` here.
decode_inputs = keras.Input(shape=(1,), batch_size=BATCH_SIZE, dtype=tf.int32, name="decode_inps")

In [14]:
# Bare expression: lets the notebook display the symbolic decoder input tensor.
decode_inputs

<KerasTensor: shape=(64, 1) dtype=int32 (created by layer 'decode_inps')>

In [17]:
# Embedding table for the encoder side: vocab_inp_size rows, embedding_dim columns.
encode_embed = keras.layers.Embedding(
    name="encode_embed",
    input_dim=vocab_inp_size,
    output_dim=embedding_dim,
)
# Trailing expression so the notebook shows the layer object.
encode_embed

<keras.layers.embeddings.Embedding at 0x28ee11196a0>

In [21]:
# Apply the encoder embedding layer to the symbolic encoder input.
encode_inputs_embed = encode_embed(encode_inputs)
# Trailing expression so the notebook shows the resulting symbolic tensor.
encode_inputs_embed

<KerasTensor: shape=(64, 20, 128) dtype=float32 (created by layer 'encode_embed')>

In [19]:
# Encoder GRU with `units` hidden units. It is configured to return both the
# per-timestep outputs (return_sequences) and the final state (return_state).
encode_gru = keras.layers.GRU(
    units,
    name="encode_gru",
    return_sequences=True,
    return_state=True,
    recurrent_initializer="glorot_uniform",
)
# Trailing expression so the notebook shows the layer object.
encode_gru

<keras.layers.recurrent_v2.GRU at 0x28ee09cf518>

In [23]:
# Run the encoder GRU on the embedded input; it yields the output sequence and
# the final state as two separate symbolic tensors.
encode_outs, encode_state = encode_gru(encode_inputs_embed)
# Print both tensors, one per line, to inspect their specs.
print(encode_outs, encode_state, sep="\n")

KerasTensor(type_spec=TensorSpec(shape=(64, 20, 256), dtype=tf.float32, name=None), name='encode_gru/PartitionedCall:1', description="created by layer 'encode_gru'")
KerasTensor(type_spec=TensorSpec(shape=(64, 256), dtype=tf.float32, name=None), name='encode_gru/PartitionedCall:2', description="created by layer 'encode_gru'")


In [24]:
# Embedding table for the decoder side: vocab_tar_size rows, embedding_dim columns.
decode_embed = keras.layers.Embedding(
    name="decode_embed",
    input_dim=vocab_tar_size,
    output_dim=embedding_dim,
)
# Trailing expression so the notebook shows the layer object.
decode_embed

<keras.layers.embeddings.Embedding at 0x28eef92a7f0>

In [25]:
# Apply the decoder embedding layer to the symbolic decoder input.
decode_inputs_embed = decode_embed(decode_inputs)
# Trailing expression so the notebook shows the resulting symbolic tensor.
decode_inputs_embed

<KerasTensor: shape=(64, 1, 128) dtype=float32 (created by layer 'decode_embed')>

In [26]:
# Decoder GRU with `units` hidden units. Like the encoder GRU, it returns both
# the per-timestep outputs (return_sequences) and the final state (return_state).
decode_gru = keras.layers.GRU(
    units,
    name="decode_gru",
    return_sequences=True,
    return_state=True,
    recurrent_initializer="glorot_uniform",
)
# Trailing expression so the notebook shows the layer object.
decode_gru

<keras.layers.recurrent_v2.GRU at 0x28eef8f2978>

In [28]:
# Run the decoder GRU on the embedded decoder input, seeding its hidden state
# with the encoder's final state.
decode_outs, decode_state = decode_gru(
    decode_inputs_embed,
    initial_state=encode_state,
)
# Print both tensors, one per line, to inspect their specs.
print(decode_outs, decode_state, sep="\n")

KerasTensor(type_spec=TensorSpec(shape=(64, 1, 256), dtype=tf.float32, name=None), name='decode_gru/PartitionedCall:1', description="created by layer 'decode_gru'")
KerasTensor(type_spec=TensorSpec(shape=(64, 256), dtype=tf.float32, name=None), name='decode_gru/PartitionedCall:2', description="created by layer 'decode_gru'")


In [30]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention that scores a sequence of features against one hidden state.

    Shape notation used in this class (as exercised by the notebook, where the
    encoder outputs and a GRU state are passed in):
        features: (batch_size, seq_len, hidden_size)
        hidden:   (batch_size, hidden_size)
    """

    def __init__(self, units):
        """Create the three dense projections used to compute the scores.

        :param units: output width of the W1 and W2 projections.
        """
        super().__init__()
        # W1 projects the features, W2 projects the hidden state, and V reduces
        # the combined projection to a single score per position.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Compute the attention context vector and the attention weights.

        Steps:
            1. Insert a length-1 axis at position 1 of ``hidden`` so it
               broadcasts against ``features`` along the sequence axis.
            2. score = tanh(W1(features) + W2(hidden_with_axis)),
               shape (batch_size, seq_len, units).
            3. weights = softmax(V(score), axis=1), shape (batch_size, seq_len, 1).
               The softmax is taken over axis 1 (the sequence axis) rather than
               the default last axis, so that each position gets a weight.
            4. context = sum over axis 1 of (weights * features),
               shape (batch_size, hidden_size).

        :param features: sequence of feature vectors (the encoder outputs in this notebook).
        :param hidden: hidden state to attend with (a GRU state in this notebook).
        :return: tuple ``(context_vector, attention_weights)``.
        """
        # (batch_size, hidden_size) -> (batch_size, 1, hidden_size)
        query = tf.expand_dims(hidden, 1)

        # Project both operands (features first, then the query) and combine them.
        projected_features = self.W1(features)
        projected_query = self.W2(query)
        energies = tf.nn.tanh(projected_features + projected_query)

        # One weight per sequence position, normalised along the sequence axis.
        attention_weights = tf.nn.softmax(self.V(energies), axis=1)

        # Weighted sum of the features over the sequence axis.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [32]:
# Instantiate the attention module with the same width as the GRU layers.
atten_layer = BahdanauAttention(units)
# Trailing expression so the notebook shows the module object.
atten_layer

<__main__.BahdanauAttention at 0x28f03865f98>

In [34]:
# Attend over the encoder output sequence using the decoder GRU's returned state.
context_vector, attention_weights = atten_layer(
    encode_outs,
    decode_state,
)
# Print both tensors, one per line, to inspect their specs.
print(context_vector, attention_weights, sep="\n")

KerasTensor(type_spec=TensorSpec(shape=(64, 256), dtype=tf.float32, name=None), name='bahdanau_attention/Sum:0', description="created by layer 'bahdanau_attention'")
KerasTensor(type_spec=TensorSpec(shape=(64, 20, 1), dtype=tf.float32, name=None), name='bahdanau_attention/transpose_1:0', description="created by layer 'bahdanau_attention'")


In [39]:
# Give the context vector a length-1 axis at position 1, then join it with the
# embedded decoder input along the last axis. The result is not stored; it is
# only displayed as the cell's trailing expression.
tf.keras.layers.Concatenate(axis=-1, name='concat_layer')(
    [
        tf.expand_dims(context_vector, 1),
        decode_inputs_embed,
    ]
)

<KerasTensor: shape=(64, 1, 384) dtype=float32 (created by layer 'concat_layer')>