In [59]:
import tensorflow as tf 
from tensorflow.keras import layers,models,Sequential
import tensorflow.keras as keras
import numpy as np

In [60]:
# Seed NumPy's and TensorFlow's global random generators with the same
# value so that repeated runs of the notebook start from the same state.
np.random.seed(22)
tf.random.set_seed(22)

In [61]:
# Vocabulary is capped at the 10000 most frequent words; rarer words are
# replaced by a single "unknown word" token by the loader.
max_word_num = 10000
# Every review is forced to exactly 80 tokens: shorter ones are padded,
# longer ones are truncated.
max_review_length = 80

(data_train, label_train), (data_test, label_test) = keras.datasets.imdb.load_data(
    num_words=max_word_num
)

# Pad/truncate both splits to the same fixed length. The generator's loop
# variable stays local to the expression, so no extra names leak into the
# notebook namespace.
data_train, data_test = (
    keras.preprocessing.sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (data_train, data_test)
)


In [62]:
# Display the shapes of the padded training reviews and their labels.
(data_train.shape, label_train.shape)

((25000, 80), (25000,))

In [30]:
# Display the shapes of the padded test reviews and their labels.
(data_test.shape, label_test.shape)

((25000, 80), (25000,))

In [54]:
# Batch size shared by the training and test pipelines.
batchsz = 512

# Note (from the original author): from_tensor_slices must be given a tuple
# here, not a list, otherwise it raises an error.
# drop_remainder=True discards the final batch when fewer than `batchsz`
# samples are left, so every batch has exactly `batchsz` rows.
db_train = (
    tf.data.Dataset.from_tensor_slices((data_train, label_train))
    .shuffle(1000)
    .batch(batchsz, drop_remainder=True)
)
db_test = (
    tf.data.Dataset.from_tensor_slices((data_test, label_test))
    .batch(batchsz, drop_remainder=True)
)

In [55]:
# Sanity check of the label range: largest training label and smallest test
# label. (Fixed the misspelled name `label__test`, which raised NameError.)
print(tf.reduce_max(label_train),tf.reduce_min(label_test))

tf.Tensor(1, shape=(), dtype=int64) tf.Tensor(0, shape=(), dtype=int64)


In [75]:
# Custom RNN model class: two stacked SimpleRNN cells unrolled by hand.
class MyRNN(models.Model):
    """Binary classifier: Embedding -> two stacked SimpleRNNCells -> Dense(1) -> sigmoid.

    Args:
        units: Hidden-state size of each SimpleRNNCell.
        batchsz: Accepted for backward compatibility with existing callers and
            stored on the instance. The zero initial states are sized from the
            actual input batch inside ``call``, so the model is no longer tied
            to one fixed batch size.
    """

    def __init__(self, units, batchsz):
        # The original call passed `self` a second time as a positional
        # argument to keras.Model.__init__; the zero-argument form is correct.
        super().__init__()
        embedding_length = 100  # feature length of each word after embedding
        max_word_num = 10000    # maximum number of distinct encoded words

        self.units = units
        self.batchsz = batchsz

        # Maps integer word ids to dense vectors of size `embedding_length`.
        # `input_length` is intentionally not passed: this subclassed model
        # does not need it, and omitting it removes a hidden dependency on a
        # notebook-level global defined in another cell.
        self.embedding = layers.Embedding(max_word_num, embedding_length)

        # Two recurrent cells stacked on top of each other, each with dropout.
        self.rnn_cell0 = layers.SimpleRNNCell(units, dropout=0.5)
        self.rnn_cell1 = layers.SimpleRNNCell(units, dropout=0.5)

        # Single output node for the binary classification problem.
        self.fc = layers.Dense(1)

    def call(self, inputs, training=None):
        """Run the forward pass and return one probability per sample.

        Args:
            inputs: Batch of integer word-id sequences, shape [b, seq_len]
                (the notebook feeds [b, 80]).
            training: Forwarded to both RNN cells so that their dropout is
                governed by the caller's mode. When left as None the decision
                is deferred to Keras.

        Returns:
            Tensor of shape [b, 1] holding sigmoid probabilities.
        """
        # [b, seq_len] -> [b, seq_len, embedding_length]
        x = self.embedding(inputs)

        # Fresh zero initial states sized from the actual batch, so partial
        # batches and single-sample prediction work as well.
        batch = tf.shape(x)[0]
        state0 = [tf.zeros([batch, self.units], dtype=x.dtype)]
        state1 = [tf.zeros([batch, self.units], dtype=x.dtype)]

        # Unroll over the time axis: each `words` tensor holds the word at the
        # same position of every sentence in the batch.
        for words in tf.unstack(x, axis=1):
            out0, state0 = self.rnn_cell0(words, state0, training=training)
            out1, state1 = self.rnn_cell1(out0, state1, training=training)

        # Only the last time step's output feeds the fully connected layer.
        x = self.fc(out1)

        # Squash the logit into a probability.
        prob = tf.sigmoid(x)

        return prob

In [76]:
# Train the two-layer RNN model.
def main():
    """Build a MyRNN model, compile it and fit it on db_train.

    Trains for 4 epochs with Adam (learning rate 1e-3) and binary
    cross-entropy, reporting accuracy and validating on db_test.
    """
    hidden_units, n_epochs, batch_size = 64, 4, 512

    net = MyRNN(hidden_units, batch_size)

    adam = keras.optimizers.Adam(learning_rate=1e-3)
    bce = tf.losses.BinaryCrossentropy()
    # Original author's note: without experimental_run_tf_function=False an
    # incompatibility error is raised.
    net.compile(
        optimizer=adam,
        loss=bce,
        metrics=['accuracy'],
        experimental_run_tf_function=False,
    )

    net.fit(db_train, epochs=n_epochs, validation_data=db_test)
    

In [77]:
# Run the training routine defined above; Keras prints per-epoch progress.
main()

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
