# 循环神经网络

In [18]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [19]:
from tensorflow.keras import layers, Model, Input, Sequential, datasets
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        print(gpu)
except RuntimeError as e:
    print(e)

## 序列
具有先后顺序的数据一般叫作序列(Sequence). 我们把文字编码为数值的过程叫作**Word Embedding**.

one-hot编码的优缺点:
- 简单直观，编码过程不需要学习和训练;
- 但高维度而且极其稀疏的，大量的位置为0，计算效率较低, 忽略了单词先天具有的语义相关性;

余弦相关度(Cosine similarity), 衡量词向量(word vector)之间相关度:
$$similarity(a, b) \triangleq \frac {a \cdot b}{|a|\cdot|b|}$$

### Embedding层
单词的表示层叫作Embedding层, 负责把单词编码为某个词向量𝒗

$$v = f_{\theta}(i|N_{vocab}, n)$$
单词数量记为$N_{vocab}$, $v的长度为n$, $i$表示单词编号, 如2 表示“I”，3 表示“me”等.

In [None]:
x = tf.range(10)  # 代表10个不同单词的编码

x = tf.random.shuffle(x)
# 10个单词, 每个单词用长度4 的向量表示
net = layers.Embedding(10, 4)
out = net(x)
out

In [None]:
net.get_weights()

### 预训练的词向量

应用的比较广泛的预训练模型:Word2Vec 和GloVe模型.利用已预训练好的模型参数初始化Embedding层.

In [76]:
def load_embed(path):
    # 建立映射关系: 单词: 词向量(长度50))
    embedding_map = {}
    with open(path, encoding='utf8') as f:
        for line in f.readlines():
            l = line.split()
            word = l[0]
            coefs = np.asarray(l[1:], dtype='float32')
            embedding_map[word] = coefs
    return embedding_map

In [77]:
embedding_map = load_embed('glove.6B.50d.txt')
print('Found %s word vectors.' % len(embedding_map))

Found 400000 word vectors.


In [None]:
embedding_map['the']

### 20newsgroups 测试

In [None]:
from sklearn import datasets
# 加载20newsgroups数据集
news20 = datasets.fetch_20newsgroups()

In [None]:
news20.keys()

In [None]:
category = news20.target_names  # 一共20类不同的新闻
category

In [None]:
labels = news20['target']  # 每条新闻分属的类别

In [None]:
len(news20['data'])

In [None]:
news20['data'][0], category[news20['target'][0]]

In [None]:
MAX_NUM_WORDS = 20000  # 最多保留 20000-1 个不同的单词
MAX_SEQUENCE_LENGTH = 1000  # 每个序列长度
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM = 50  # 用50维向量表示一个单词

In [None]:
Tokenizer?  # 令牌化

In [None]:
# vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)  #  Only the most common `num_words-1` words will be kept.

In [None]:
# Updates internal vocabulary based on a list of texts
tokenizer.fit_on_texts(news20['data'])
sequences = tokenizer.texts_to_sequences(news20['data'])  # 语句 -> 单词序列号组成的sequences

In [None]:
# matrix = tokenizer.texts_to_matrix(news20['data'])
# matrix.shape  # (11314, 20000)  稀疏矩阵

In [None]:
sequences[0]

In [None]:
# 将sequences 转成文本list
# tokenizer.sequences_to_texts(sequences)

In [None]:
# 将单词映射为 index
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
word_index_list = list(word_index.items())

In [None]:
# 从1开始编码 用0代表填充
word_index_list[:10]  # news20group 出现频率最高的10个单词

In [None]:
word_index_list[19998]

In [None]:
# Pads sequences to the same length.
pad_sequences?

In [None]:
# 每条新闻都被编码成 等长的 用数字表示的 序列
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
data.shape

In [None]:
np.max(data), np.min(data) # 

In [None]:
from sklearn.model_selection import train_test_split

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=VALIDATION_SPLIT, random_state=0) 

In [None]:
X_train.shape, y_test.shape

In [None]:
# 将 单词序号-> 单词向量(长度50)
num_words = min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

applied_vec_count = 0
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    # 根据glove.6B.50d 将单词转为词向量
    embedding_vector = embedding_map.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        applied_vec_count += 1
print(applied_vec_count, embedding_matrix.shape)

In [None]:
# new20group中最常用的19999 词向量 + 填充 + unknow
embedding_matrix.shape

In [None]:
embedding_matrix[-1]

In [None]:
layers.Embedding?

In [None]:
embedding_layer = layers.Embedding(
    num_words, EMBEDDING_DIM,
    weights = [embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

In [None]:
sequence_input = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
embedded_sequences = embedding_layer(sequence_input)
# 使用卷积
x = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation='relu')(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation='relu')(x)
preds = layers.Dense(len(category), activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)

In [None]:
model.summary()

In [None]:
plot_model(model, show_shapes=True)

In [None]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
hist = model.fit(X_train, y_train, batch_size=128, epochs=15, validation_data=(X_test, y_test))

In [None]:
plt.plot(np.linspace(1, 15, 15), hist.history['loss'], label='loss')
plt.plot(np.linspace(1, 15, 15), hist.history['val_loss'], label='val_loss')
plt.legend()

In [None]:
plt.plot(np.linspace(1, 15, 15), hist.history['accuracy'], label='accuracy')
plt.plot(np.linspace(1, 15, 15), hist.history['val_accuracy'], label='val_accuracy')
plt.legend()

## 循环神经网络


$$h^{(t)} = \sigma(Ux^{(t)} + Wh^{(t-1)} + b)$$
在每个时间戳$t$, 网络层接受当前时间戳的输入$x^{(t)}$和上一个时间戳的网络状态向量$h^{(t-1)}$,经过
$$h^{(t)} = f_{\theta}(h^{(t-1)}, x^{(t)})$$
变换后得到当前时间戳的新状态向量$h^{(t)}$. 在每个时间戳上, 网络层均有输出$o^{(t)} = g_{\phi}(h^{(t)})$

对于这种网络结构，我们把它叫做循环网络结构(Recurrent Neural Network，简称RNN)。

在循环神经网络中，激活函数更多地采用tanh 函数.并且可以选择不使用偏执𝒃来进一步减少参数量。

状态向量$h^{(t)}$可以直接用作输出，即$o^{(t)} = h^{(t)}$，也可以对$t$做一个简单的线性变换.
![](./images/rnnbp.png)

## 梯度传播

参数$W_{hh}$的梯度计算, RNN的损失也是会随着时间累加的，所以不能只求t时刻的偏导。
$$
\frac{\partial L^{(t)}}{\partial V}=\frac{\partial L^{(t)}}{\partial o^{(t)}}\cdot \frac{\partial o^{(t)}}{\partial V}
$$
$$
L=\sum_{t=1}^{n}L^{(t)}
$$
$$
\frac{\partial L}{\partial V}=\sum_{t=1}^{n}\frac{\partial L^{(t)}}{\partial o^{(t)}}\cdot \frac{\partial o^{(t)}}{\partial V}
$$
对于$W和U$, 求解需要涉及到历史数据
$$
\frac{\partial L^{(t)}}{\partial W}=\sum_{k=0}^{t}\frac{\partial L^{(t)}}{\partial o^{(t)}}\frac{\partial o^{(t)}}{\partial h^{(t)}}(\prod_{j=k+1}^{t}\frac{\partial h^{(j)}}{\partial h^{(j-1)}})\frac{\partial h^{(k)}}{\partial W}
$$
$$
\frac{\partial L^{(t)}}{\partial U}=\sum_{k=0}^{t}\frac{\partial L^{(t)}}{\partial o^{(t)}}\frac{\partial o^{(t)}}{\partial h^{(t)}}(\prod_{j=k+1}^{t}\frac{\partial h^{(j)}}{\partial h^{(j-1)}})\frac{\partial h^{(k)}}{\partial U}
$$
其中
$$
\frac {\partial h^{(k)}}{\partial W} = \frac {\partial \sigma(Ux^{(k)} + Wh^{(k-1)} +b)}{\partial W}
$$
只考虑一个时间戳的梯度传播, 即"直接"偏导数.

$$
\frac {\partial h^{(k+1)}}{\partial h^{(k)}}
= W^T diag( \sigma'(Ux^{(k+1)} + Wh^{(k)} + b))
$$

整体的偏导公式就是将其按时刻再一一加起来。

在某个时刻的对$W$或是$U$的偏导数，需要追溯这个时刻之前所有时刻的信息, 整体的偏导公式就是将其按时刻再一一加起来。

公式中包含雅克比矩阵和$W$的连乘运算, 容易出现梯度消失(激活函数使用sigmoid或tanh时)或梯度爆炸(使用ReLU)


<!--
$$
\frac {\partial h_t}{\partial h_i} = 
\frac {\partial h_t}{\partial h_{t-1}}
\frac {\partial h_{t-1}}{\partial h_{t-2}}
\cdots
\frac {\partial h_{i+1}}{\partial h_i}
= \prod_{k=i}^{t-1}\frac {\partial h_{k+1}}{\partial h_{k}} 
$$
$$
\frac {\partial h_t}{\partial h_i} = \prod_{j=i}^{t-1}diag(\sigma'(W_{xh}x_{j+1} + W_{hh}h_j + b))W_{hh}
$$
-->


[循环神经网络(RNN)模型与前向反向传播算法](https://www.cnblogs.com/pinard/p/6509630.html)

[LSTM模型与前向反向传播算法](https://www.cnblogs.com/pinard/p/6519110.html)

In [75]:
class MyRNNCell:
    """
    一个时间戳的前向运算
    """
    def __init__(self, units, activation='tanh', labels=2, random_state=0):
        self.units = units
        self.activation = activation
        self.labels = labels
        self.random_state = random_state
        self.U = None  # [n_h, n_x]
        self.V = None  # [n_o, n_h]
        self.W = None  # [n_h, n_h]
        self.b = None  # [n_h, 1]
        self.c = None  # [n_o, 1]
        self._built = False
        self.y_hat = None
    
    @property
    def built(self):
        return self._built
    
    @built.setter
    def built(self, value):
        self._built = value
        
    def build(self, input_shape):
        _, n_x = input_shape
        n_h, n_o = self.units, self.labels
        np.random.seed(self.random_state)
        self.W = np.random.randn(n_h, n_h)
        self.U = np.random.randn(n_h, n_x)
        self.V = np.random.randn(n_o, n_h)
        self.b = np.zeros([n_h, 1])
        self.c = np.zeros([n_o, 1])
        
    def softmax(self, o):
        ex = np.exp(o - np.max(o))
        y_hat = ex / np.sum(ex, axis=0)
        return y_hat
    
    def loss(self, y_true, y_hat):
        """
        某一时刻的损失值
        使用softmax函数作为o的激活函数
        y_hat: [n_o, b]
        """
        loss = np.sum(-y_true * np.log(y_hat), axis=0)  # [1, b] 各样本的损失
        return np.mean(loss)

    
    def activate(self, z):
        if self.activation == 'tanh':
            return np.tanh(z)
        if self.activation == 'sigmoid':
            return 1/ (1 + np.exp(-z))
    
    def __call__(self, *args, **kwargs):
        output = self.call(*args, **kwargs)
        return output
        
    def call(self, xt, ht_1):
        """
        xt: [b, n_x(word_vec_len)]
        ht_1:［n_h(units), b］
        """
        if not self.built:
            self.build(xt.shape)
            self.build = True
        xt = xt.T
        z = self.U @ xt + self.W @ ht_1 + self.b
        ht = self.activate(z)      
        o = self.V @ ht + self.c
        y_hat = self.softmax(o)
        # loss = self.loss(y)
        self.y_hat = y_hat
        return o, ht

In [72]:
xt = np.random.randn(10, 100)
h0 =np.random.randn(10, 64)

my_cell = MyRNNCell(64, labels=2)

o, h1 = my_cell(xt, h0.T)

In [None]:
class MyRNN:
    def __init__(self, units, activation='tanh', labels=2, random_state=0):
        self.units = units
        self.activation = activation
        self.labels = labels
        self.random_state = random_state
        self.state0 = None
        self.rnn_cell = MyRNNCell(units, activation, labels, random_state)
        self.losses = []
        self.y_hats = []
    
    def build(self, input_shape):
        _, times, n_x = input_shape
        self.rnn_cell.build((None, n_x))
    
    def __call__(self, *args, **kwargs):
        output = self.call(*args, **kwargs)
        return output  
    
    def call(self, inputs):
        if isinstance(inputs, tuple):
            X, y = inputs
        else:
            X = inputs
        b, length, _ = X.shape
        h = np.zero([self.units, b])
        
        for t in range(length):
            out, h = self.rnn_cell(X[:, t, :], h)
    
    def softmax(self, o):
        ex = np.exp(o - np.max(o))
        y_hat = ex / np.sum(ex, axis=0)
        return y_hat
    
    def loss(self, y_true, y_hat):
        """
        某一时刻的损失值
        使用softmax函数作为o的激活函数
        o: [n_o, b]
        y_true: [n_o, b]
        """
        loss = np.sum(-y_true * np.log(y_hat), axis=0)  # [1, b] 各样本的损失
        return np.mean(loss)
    
    def feed_forward(self, X, y):
        b, length, _ = X.shape
        h = np.zero([self.units, b])
        for t in range(length):
            out, h = self.rnn_cell(X[:, t, :], h)
            y_hat = self.rnn_cell.y_hat
            self.y_hats.append(y_hat)
            loss = self.loss(y.T, y_hat)
            self.losses.append(loss)
        return out, h
    
    def backward(self, X, y):
        dl_dv 
        time = len(self.losses)
        for i in reversed(time):
            loss = self.losses[i]
            y_hat = self.y_hats[i]
            dl_do_t =  y - y_hat  # dl^t/do^t = y^t - y_hat^t
            dl_dv += dl_do_t * h_t.T  # dL/dV = \sum_t dL/do^t  do^t/dV
            dh_dz_t = 1 - h_t ** 2  # z= Ux^t+ Wh^t-1 + b,  h^t = tanh(z)
            dh_dW_t = dh_dz_t * h_t_1.T
    
    def fit(self, X, y, epoch=20):
        pass

In [37]:
ht1.shape, o.shape

((64, 10), (10, 2))

## RNN层的使用

- SimpleRNNCell: 完成了一个时间戳的前向运算($\sigma(W_{xh}x_t + W_{hh}h_{t-1} +b)$)
- SimpleRNN: 基于Cell 层实现的，它在内部已经完成了多个时间戳的循环运算，

### SimpleRNNCell

In [21]:
layers.SimpleRNNCell?

In [38]:
cell = layers.SimpleRNNCell(3)  # 内存向量h长度 3
# cell.build(input_shape=(None, 4))  # 输入x特征长度4
# cell.trainable_variables  # W_xh ,  W_hh, b

前向运算
$$o_t, [h_t] = Cell(x_t, [h_{t-1})$$

In [41]:
# 初始化状态向量，用列表包裹，统一格式
h0 = [tf.zeros([4, 64])]

# (b, word_num, word_vec_length)
x = tf.random.normal([4, 80, 100])
xt = x[:, 0, :]  # 所有句子的第一个单词

cell = layers.SimpleRNNCell(64)
out, h1 = cell(xt, h0)  # h1用list包裹, out1没有经过变换 = h1

In [42]:
out.shape, h1[0].shape

(TensorShape([4, 64]), TensorShape([4, 64]))

In [None]:
print(id(out), id(h1[0]))  # 状态向量直接作为输出向量

In [None]:
h = h0
for x_t in tf.unstack(x, axis=1):  # 时间维度解开, 按时间输入单词
    out, h = cell(x_t, h)
out = out  # 只取最后时间戳的输出  N->1

In [None]:
# 2层循环神经网络
x = tf.random.normal([4, 80, 100])
xt = x[:, 0, :]
cell0 = layers.SimpleRNNCell(64)
cell1 = layers.SimpleRNNCell(64)
# 2个cell的初始状态
h0 = [tf.zeros((4, 64))]
h1 = [tf.zeros((4, 64))]

# 一个时间戳上完成2层传播在到下一个时间戳
for xt in tf.unstack(x, axis=1):
    out0, h0 = cell0(xt, h0)
    
    out1, h1 = cell1(out0, h1)

In [None]:
# 先完成第一层所有时间的传播再完成第二层所有时间的传播
middle_seqences = []

for xt in tf.unstack(x, axis=1):
    out0, h0 = cell0(xt, h0)
    middle_seqences.append(out0)

for xt in middle_seqences:
    out1, h1 = cell1(xt, h1)

### SimpleRNN

In [None]:
# SimpleRNN  完成多个时间戳的计算
layer = layers.SimpleRNN(64)
x = tf.random.normal([4, 80, 100])
out = layer(x)
out.shape

In [None]:
# 返回所有时间戳上的输出
layer = layers.SimpleRNN(64, return_sequences=True)
out = layer(x)
out.shape

In [None]:
# 多层RNN网络
net = Sequential([
    # 除最末层外，都需要返回所有时间戳的输出，用作下一层的输入
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(64)
])

In [None]:
out = net(x)
out.shape

## RNN情感分类
imdb评分>7 为1 positive; IMDB 评级<5 的用户评价标注为0 

利用第2 层RNN 层的最后时间戳的状态向量h, 作为句子的全局语义特征表示, 送入全连接分类网络

In [78]:
BATCH_SIZE = 128
TOTAL_WORDS = 10000  # 词汇表大小
MAX_REVIEW_LEN = 80  # 句子长度
EMBEDDING_LEN = 100  # 词向量长度

In [None]:
datasets.imdb.load_data?

In [79]:
# imdb数据集

(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(
    num_words=TOTAL_WORDS)

In [80]:
print(X_train.shape, len(X_train[0]), y_train.shape)  # X 不等长的list 组成的array

(25000,) 218 (25000,)


In [81]:
print(X_test.shape, len(X_test[0]), y_test.shape)

(25000,) 68 (25000,)


In [82]:
# 编码表
word_index = datasets.imdb.get_word_index()

pre_10 = list(word_index.items())[:10]
for item in pre_10:  
    print(item)  # 单词-数字

('fawn', 34701)
('tsukino', 52006)
('nunnery', 52007)
('sonja', 16816)
('vani', 63951)
('woods', 1408)
('spiders', 16115)
('hanging', 2345)
('woody', 2289)
('trawling', 52008)


In [83]:
print(f'total {len(word_index)} unique words')

total 88584 unique words


In [84]:
# 添加标志位
word_index = {k:(v+3) for k, v in word_index.items()}
word_index["<PAD>"] = 0  # 表示填充
word_index["<START>"] = 1  # 表示起始
word_index["<UNK>"] = 2  # 表示未知单词
word_index["<UNUSED>"] = 3

# 翻转
index_word = dict([(v, k) for k, v in word_index.items()]) 

In [85]:
def decode_review(text):
    # 数字序列 -> 文本
    return ' '.join([index_word.get(i, '?') for i in text])


In [86]:
# 截断 填充 成等长的序列
X_train = pad_sequences(X_train, maxlen=MAX_REVIEW_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_REVIEW_LEN)

In [87]:
decode_review(X_train[0])

"that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all"

In [88]:
decode_review(X_test[0])

"<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss"

In [89]:
train_db = tf.data.Dataset.from_tensor_slices(  # 舍弃最后一组 
    (X_train, y_train)).shuffle(1000).batch(BATCH_SIZE, drop_remainder=True)
test_db = tf.data.Dataset.from_tensor_slices(
    (X_test, y_test)).shuffle(1000).batch(BATCH_SIZE, drop_remainder=True)

In [90]:
sample = next(iter(train_db))
sample[0], sample[1]

(<tf.Tensor: id=208, shape=(128, 80), dtype=int32, numpy=
 array([[   4,   65,  410, ...,   12,  199,  211],
        [2730,   10,   10, ...,   46,    7,  158],
        [6018,    5,    4, ..., 2639, 2361, 2916],
        ...,
        [6857,  949,    4, ...,    5,  901,  128],
        [   0,    0,    0, ...,  292,   17,   73],
        [6233,  699,  102, ...,   46,    7,  158]])>,
 <tf.Tensor: id=209, shape=(128,), dtype=int64, numpy=
 array([1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0], dtype=int64)>)

In [91]:
embedding_map = load_embed('glove.6B.100d.txt')
print('Found %s word vectors.' % len(embedding_map))

Found 400000 word vectors.


In [92]:
# 将 单词序号-> 单词向量(长度50)
num_words = min(TOTAL_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_LEN))

applied_vec_count = 0
for word, i in word_index.items():
    if i >= TOTAL_WORDS:
        continue
    # 根据glove.6B.50d 将单词转为词向量
    embedding_vector = embedding_map.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        applied_vec_count += 1
print(applied_vec_count, embedding_matrix.shape)

9793 (10000, 100)


In [93]:
class MyRNN(Model):
    def __init__(self, units):
        super().__init__()
        # 初始状态向量
        self.state0 = [tf.zeros([BATCH_SIZE, units])]
        self.state1 = [tf.zeros([BATCH_SIZE, units])]
        # 词嵌入层
        self.embedding = layers.Embedding(TOTAL_WORDS, EMBEDDING_LEN,
                                          input_length=MAX_REVIEW_LEN,
#                                           weights=[embedding_matrix],
#                                          trainable=False
                                         )
        # RNNCell
#         self.runcell0 = layers.SimpleRNNCell(units, dropout=0.5)
#         self.runcell1 = layers.SimpleRNNCell(units, dropout=0.5)
        # RNN layer
        self.rnn = Sequential([
            layers.SimpleRNN(units, dropout=0.5, return_sequences=True),
            layers.SimpleRNN(units, dropout=0.5)
        ])
        # 分类层
        self.out_layer = Sequential([
            layers.Dense(32, activation='relu'),
            layers.Dropout(rate=0.5),
            layers.Dense(1, activation='sigmoid')
        ])
        
    
    def call(self, inputs, training=None):
        x = self.embedding(inputs)
        state0, state1 = self.state0, self.state1
#         for word in tf.unstack(x, axis=1):
#             out0, state0 = self.runcell0(word, state0, training)
#             out1, state1 = self.runcell1(out0, state1, training)
        out1 = self.rnn(x)
        # 最末层 最后一个时间戳的输出
        out = self.out_layer(out1, training)
        return out

In [None]:
model = MyRNN(64)
model.compile(optimizer=tf.keras.optimizers.Adam(10e-3),
             loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=['accuracy'],
#              experimental_run_tf_function=False  # 以cell方式运行需要设置
             )  

In [None]:
model.build((None, MAX_REVIEW_LEN))

In [None]:
model.summary()

In [None]:
model.fit(train_db, epochs=10, validation_data=test_db)

## 梯度弥散和梯度爆炸
梯度下降
$$\theta := \theta - \eta\nabla_{\theta} L$$

- 梯度弥散(Gradient Vanishing): $\nabla_{\theta} L \approx 0$, 每次梯度更新后参数基本保持不变, ℒ几乎保持不变，其它评测指标，如准确度，也保持不变
- 梯度爆炸(Gradient Exploding): $\nabla_{\theta} L \gg 1$, 梯度更新的步长很大, 更新后的$\theta$变化很大, L出现突变现象，甚至可能出现来回震荡、不收敛的现象

In [None]:
W = tf.ones([2, 2])
eigenvalues = tf.linalg.eigh(W)[0]  # 获取特征值
eigenvalues

In [None]:
# 多次连乘
val = [W]
for _ in range(10):
    val.append(val[-1]@W)

# L2范数
norm = list(map(lambda x:tf.norm(x).numpy(), val))

In [None]:
plt.plot(norm)
plt.xlabel('n times')
plt.ylabel('L2-norm')
# Gradient Exploding

In [None]:
W = tf.ones([2, 2]) * 0.4
eigenvalues = tf.linalg.eigh(W)[0]  # 获取特征值
# 多次连乘
val = [W]
for _ in range(10):
    val.append(val[-1]@W)

# L2范数
norm = list(map(lambda x:tf.norm(x).numpy(), val))
plt.plot(norm)
plt.xlabel('n times')
plt.ylabel('L2-norm')
# Gradient Vanishing

### 梯度裁剪(Gradient Clipping)

梯度爆炸可以通过梯度裁剪(Gradient Clipping)的方式在一定程度上的解决

1. 简单裁剪, 直接对张量的数值进行限幅

In [None]:


a = tf.random.uniform([2, 2])
a

In [None]:
tf.clip_by_value(a, 0.4, 0.6)

2. 限制梯度张量W的范数
$$W' = \frac {W}{||W||_2} \cdot max$$

In [None]:
a = tf.random.uniform([2, 2]) * 5
a

In [None]:
b = tf.clip_by_norm(a, 5)
b

In [None]:
tf.norm(a), tf.norm(b)

3. 全局范数裁剪, 考虑所有参数的梯度的范数, 等比例缩放

$$global\_norm = \sqrt{\sum_i ||W^{i}||^2_2}$$

$$W^{(i)} = \frac {W^{(i)} \cdot max\_norm}{max(global\_norm, max\_norm)}$$

In [None]:
w1 = tf.random.normal([3, 3])
w2 = tf.random.normal([3, 3])
global_norm = tf.sqrt(tf.norm(w1) ** 2 + tf.norm(w2) ** 2)
global_norm

In [None]:
tf.clip_by_global_norm?

In [None]:
(ww1, ww2), global_norm = tf.clip_by_global_norm([w1, w2], 2)  # 总范数限制为2

In [None]:
global_norm

In [None]:
ww1, ww2

In [None]:
global_norm2 = tf.sqrt(tf.norm(ww1) ** 2 + tf.norm(ww2) ** 2)
global_norm2

在网络训练时，梯度裁剪一般在计算出梯度后，梯度更新之前进行

### 处理梯度弥散
对于梯度弥散现象，可以通过增大学习率、减少网络深度、添加 Skip Connection 等一系列的措施抑.

减少网络深度可以减轻梯度弥散现象, 但会影响表达能力.

使用深度残差网络