# 1. 模型Layer、Model构建

##  1.1 一些tensor操作

### tf.concat ：矩阵拼接

In [1]:
import tensorflow as tf
print(tf.__version__)

2.0.0


In [7]:
t1 = [[1, 2, 3], [4, 5, 6]] # 2, 3
t2 = [[7, 8, 9], [10, 11, 12]] 
tf.concat([t1, t2], axis=1) #axis=1时，将t2从水平方向拼接到t1的右边。axis=0，将t2从垂直方向拼接到t1下面

<tf.Tensor: id=22, shape=(2, 6), dtype=int32, numpy=
array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]], dtype=int32)>

### tf.expand_dims ：增加维度

In [15]:
t3 = [[1, 2, 3],[4, 5, 6]] # shape [2, 3] 

In [16]:
tf.expand_dims(t3, axis=0) 

<tf.Tensor: id=39, shape=(1, 2, 3), dtype=int32, numpy=
array([[[1, 2, 3],
        [4, 5, 6]]], dtype=int32)>

In [17]:
tf.expand_dims(t3, 1) 

<tf.Tensor: id=42, shape=(2, 1, 3), dtype=int32, numpy=
array([[[1, 2, 3]],

       [[4, 5, 6]]], dtype=int32)>

In [18]:
tf.expand_dims(t3, 2) 

<tf.Tensor: id=45, shape=(2, 3, 1), dtype=int32, numpy=
array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]], dtype=int32)>

### tf.squeeze ：删除维度

In [19]:
t4 = tf.expand_dims(t3, 2) 

In [20]:
t4

<tf.Tensor: id=48, shape=(2, 3, 1), dtype=int32, numpy=
array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]], dtype=int32)>

In [21]:
tf.squeeze(t4, 2)

<tf.Tensor: id=49, shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>

### tf.reshape

### tf.cast ：转换类型

In [22]:
x = tf.constant([1.8, 2.2], dtype=tf.float32)
tf.dtypes.cast(x, tf.int32) 
# mask = [True , False] loss.astype

<tf.Tensor: id=51, shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>

### tf.stack ：堆积

In [23]:
x = tf.constant([1, 4]) 
y = tf.constant([2, 5]) 
z = tf.constant([3, 6]) 

In [24]:
tf.stack([x, y, z], axis=0) 

<tf.Tensor: id=55, shape=(3, 2), dtype=int32, numpy=
array([[1, 4],
       [2, 5],
       [3, 6]], dtype=int32)>

In [25]:
tf.stack([x, y, z], axis=1) 

<tf.Tensor: id=56, shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>

## 1.2 Layer

### Encoder层

In [26]:
class Encoder(tf.keras.layers.Layer):
    # enc_units：RNN，LSTM等单元数量
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        #将一个特征转换为一个向量。比如最容易理解的one-hot编码。但在实际应用当中，将特征转换为one-hot编码后维度会十分高。
        #所以我们会将one-hot这种稀疏特征转化为稠密特征，通常做法也就是转化为我们常用的embedding
        # weights：词向量矩阵，trainable：是否进行训练
        self.embedding = tf.keras.layers.Embedding(vocab_size,
                                                   embedding_dim,
                                                   weights=[embedding_matrix],
                                                   trainable=False)
        # 创建gru单元
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    #进行encoder计算
    #x：输入词的词向量，hidden：前一个词的状态变量
    def call(self, x, hidden):
        x = self.embedding(x)  #词向量的embedding，即转换为稠密特征
        output, state = self.gru(x, initial_state=hidden) #output, state对应gru的两个输出
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
    
    

### Attention层

In [27]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    # dec_hidden：decoder hidden state，每一步（一个词）都不一样，因此每步传入的都不一样
    # enc_output：encoder output：每一个batch_size训练中，训练完最后一个词后统一输出endoer output，因此在每一步训练中
    # enc_output是不变的。enc_output中包含了sequence_len的值
    def call(self, dec_hidden, enc_output):
        """
        :param dec_hidden: shape=(16, 256)
        :param enc_output: shape=(16, 4, 256) #4：sequence_len=4，即每个样本中有4个词
        """
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size) #每个词对应的的hidden state的维度
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)  # shape=(16, 1, 256)，维度抓换，为了后面的矩阵运算score
        # att_features = self.W1(enc_output) + self.W2(hidden_with_time_axis)

        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)  #b_attn：偏置量，可加可不加
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))  # shape=(16, 200, 1)
        attn_dist = tf.nn.softmax(score, axis=1) #attention weights
        
        # context_vector shape after sum == (batch_size, hidden_size, 1)
        attn_dist = tf.expand_dims(attn_dist, axis=2)
        # 每个词的attention weights和enc_output相乘，再相加即得到context vector
        # 矩阵运算，已经包含了所有词的attn_dist * enc_output
        context_vector = attn_dist * enc_output  # shape=(16, 200, 256)
        context_vector = tf.reduce_sum(context_vector, axis=1)  # shape=(16, 256)
        return context_vector, tf.squeeze(attn_dist, -1)

### Decoder层

In [28]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, embedding_matrix):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                                   weights=[embedding_matrix],
                                                   trainable=False)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        # fc层.
        self.fc = tf.keras.layers.Dense(vocab_size, activation=tf.keras.activations.softmax)

    # 开始计算
    # x：decoder输入词的词向量
    def call(self, x, context_vector):
        # enc_output shape == (batch_size, max_length, hidden_size)
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1) # 与context vecotr进行拼接
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        # output shape == (batch_size, vocab)
        out = self.fc(output)
        return x, out, state

## 1.3 Model  
搭建seq2seq模型

In [31]:
#Model具有反向传播等特性
class SEQ2SEQ(tf.keras.Model):
    # 没写完整
    def __init__(self):
        
    
    def call(self, enc_output, dec_hidden, enc_inp, dec_inp):
        predictions = []
        attentions = []

        context_vector, attn_dist = self.attention(dec_hidden,  # shape=(16, 256)
                                                    enc_output,  # shape=(16, 200, 256)
                                                    )

        for t in range(dec_inp.shape[1]):
            #传入的参数与Attention机制图的参数对应
            #dec_inp:decoder 输入, dec_hidden:前一个词的状态变量，enc_output：encoder输出
#             dec_x, pred, dec_hidden = self.decoder(tf.expand_dims(dec_inp[:, t], 1),
#                                                    dec_hidden,
#                                                    enc_output,
#                                                    context_vector)

            #由于前面定义decoder层时（call(self, x, context_vector):），只用到了decoder的输入词和context_vector，
            #因此此处只用两个参数
            dec_x, pred, dec_hidden = self.decoder(tf.expand_dims(dec_inp[:, t], 1),
                                                   context_vector)
            #每训练一步，dec_hidden都要重新输入
            context_vector, attn_dist, coverage_next = self.attention(dec_hidden, enc_output)
            predictions.append(pred)    #预测值
            attentions.append(attn_dist)
 
        return tf.stack(predictions, 1), dec_hidden, attentions

IndentationError: expected an indented block (<ipython-input-31-07f725cf71c5>, line 6)

# 2 loss function

In [32]:
#分类交叉熵
cce = tf.keras.losses.CategoricalCrossentropy()
loss = cce(
  [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], # real，真实值（标签）
  [[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]) # prediction，预测值
print('Loss: ', loss.numpy()) 

Loss:  0.09458993


In [33]:
#稀疏分类交叉熵
# 上课老师代码就报错
sce = tf.keras.losses.SparseCategoricalCrossentropy()
loss = sce(
  [1, 2], # shape=(3,) real index，真实值可以不传其one-hot编码，传其在字典中的index
  [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
print('Loss: ', loss.numpy()) 

AttributeError: 'list' object has no attribute 'op'

## 优化器optimizer

In [34]:
# 优化器调用api
optimizer = tf.keras.optimizers.Adagrad(params['learning_rate'],
                                        initial_accumulator_value=params['adagrad_init_acc'],
                                        clipnorm=params['max_grad_norm'])

NameError: name 'params' is not defined

## loss函数

In [None]:
# loss调用api
# from_logits与decoder层中fc层的activation对应，activation设置了激活函数，则此处也就为True，如果没有激活函数，则此处为False
# reduction参数不用管
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
# Whether y_pred is expected to be a logits tensor. 

## 使用了交叉熵api的LOSS函数  
自定义loss函数

In [None]:
# 定义loss函数
[2, 4, 5, 6, 7, 1, 1, 1, 1, 1]
def loss_function(real, pred):
    # 判断logit为1和0的数量
    mask = tf.math.logical_not(tf.math.equal(real, 1))
    # 计算decoder的长度
    dec_lens = tf.reduce_sum(tf.cast(mask, dtype=tf.float32), axis=-1)
    # 计算loss值
    loss_ = loss_object(real, pred)
    # 转换mask的格式
    mask = tf.cast(mask, dtype=loss_.dtype)
    # 调整loss
    loss_ *= mask
    # 确认下是否有空的摘要别加入计算
    loss_ = tf.reduce_sum(loss_, axis=-1) / dec_lens
    return tf.reduce_mean(loss_)

## 自己diy的LOSS函数

In [None]:
def pgn_log_loss_function(real, final_dists, padding_mask):
    # This is fiddly; we use tf.gather_nd to pick out the probabilities of the gold target words
    loss_per_step = []  # will be list length max_dec_steps containing shape (batch_size)
    batch_nums = tf.range(0, limit=real.shape[0])  # shape (batch_size)
    for dec_step, dist in enumerate(final_dists):
        # The indices of the target words. shape (batch_size)
        targets = real[:, dec_step]
        indices = tf.stack((batch_nums, targets), axis=1)  # shape (batch_size, 2)
        gold_probs = tf.gather_nd(dist, indices)  # shape (batch_size). prob of correct words on this step
        losses = -tf.math.log(gold_probs)
        loss_per_step.append(losses)
    # Apply dec_padding_mask and get loss
    _loss = _mask_and_avg(loss_per_step, padding_mask)
    return _loss

## 建立train-step

In [None]:
# @tf.function
def train_step(inp, targ, enc_hidden):
    # 初始化loss
    loss = 0
    with tf.GradientTape() as tape:
        # encoder实例化，同时也是初始化的过程
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # 确定decoder的输入
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        # 开始train的过程
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            # 计算第一步输出后的loss值
            loss += loss_function(targ[:, t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # 计算一个训练样本的loss
    batch_loss = (loss / int(targ.shape[1]))
    # 反向梯度求导
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss