# 使用 RNN 对 IMDB 数据集进行情感分析

In [16]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [17]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: object whose ``history`` attribute maps metric names to
            sequences of values (e.g. the object returned by ``model.fit``).
        metric: key of the training series; the validation series is looked
            up under ``'val_' + metric``.
    """
    curves = history.history
    plt.plot(curves[metric])

    val_metric = 'val_' + metric
    plt.plot(curves[val_metric], '')

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()

IMDB 数据集是一个二分类数据集：每条评论都被标注为正面或负面。

In [18]:
# Load the 8k-subword configuration of the IMDB reviews dataset.
# with_info=True additionally returns the dataset metadata object;
# as_supervised=True makes each example a 2-tuple instead of a dict.
dataset, info = tfds.load(
    name='imdb_reviews/subwords8k',
    as_supervised=True,
    with_info=True,
)



In [19]:
# Pull the two splits out of the loaded dataset mapping
train_dataset = dataset['train']
test_dataset = dataset['test']

In [20]:
# Inspect the concrete class of the training split
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [21]:
# Text encoder attached to the dataset's 'text' feature
encoder = info.features['text'].encoder

In [22]:
# Report the encoder's vocabulary size (used later as the embedding input dimension)
print('Vocabulary size: {}'.format(encoder.vocab_size))

Vocabulary size: 8185


In [23]:
# Example: round-trip a sample sentence through the encoder (text -> ids -> text)
sample_string = 'Hello TensorFlow.'
encoded_string = encoder.encode(sample_string)
original_string = encoder.decode(encoded_string)

print('Encoded string is {}'.format(encoded_string))
print('The original string: "{}"'.format(original_string))

Encoded string is [4025, 222, 6307, 2327, 4043, 2120, 7975]
The original string: "Hello TensorFlow."


In [24]:
# Decoding the encoded ids must give back exactly the input text
assert original_string == sample_string

# Show the text piece that each individual id decodes to
for token_id in encoded_string:
    piece = encoder.decode([token_id])
    print('{} ----> {}'.format(token_id, piece))

4025 ----> Hell
222 ----> o 
6307 ----> Ten
2327 ----> sor
4043 ----> Fl
2120 ----> ow
7975 ----> .


# 准备训练数据

In [25]:
BUFFER_SIZE = 10000  # size of the shuffle buffer for the training split
BATCH_SIZE = 64      # number of examples per padded batch

# Build both pipelines from the untouched splits in `dataset` instead of
# rebinding `train_dataset` / `test_dataset` from themselves. This keeps the
# cell idempotent: re-running it no longer shuffles and batches a dataset that
# an earlier run already batched.
#
# padded_batch pads the examples of each batch up to the given shapes; the
# shapes are read from the unbatched split so they describe single examples.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE,
                  tf.compat.v1.data.get_output_shapes(dataset['train']))
)
test_dataset = (
    dataset['test']
    .padded_batch(BATCH_SIZE,
                  tf.compat.v1.data.get_output_shapes(dataset['test']))
)

In [26]:
# Bidirectional-LSTM classifier producing a single output value per review.
model = tf.keras.Sequential([
    # mask_zero=True marks id 0 as padding so the recurrent layer skips the
    # padded timesteps; without it, predictions depend on how much padding a
    # batch happened to add. padded_batch pads with 0 by default.
    # NOTE(review): relies on the subword encoder never emitting id 0 for
    # real text (TFDS text encoders reserve 0 for padding) - confirm.
    tf.keras.layers.Embedding(encoder.vocab_size, 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # No activation: the output is a raw, unbounded score (logit).
    tf.keras.layers.Dense(1)
])
model.summary()

















Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 598,209
Trainable params: 598,209
Non-trainable params: 0
_________________________________________________________________


In [27]:
def accuracy(y_true, y_pred):
    """Binary accuracy for a model that outputs logits.

    The string shortcut 'accuracy' thresholds predictions at 0.5, which is
    only correct for probabilities. The logits are therefore squashed with a
    sigmoid first, so the 0.5 cut-off corresponds to logit 0.

    The function is deliberately named ``accuracy`` so the training history
    keeps the keys 'accuracy' / 'val_accuracy'.
    """
    return tf.keras.metrics.binary_accuracy(y_true, tf.math.sigmoid(y_pred))


# from_logits=True: the loss applies the sigmoid itself, matching a final
# Dense layer without activation.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[accuracy])

In [None]:
# Train for 10 epochs. No batch_size is passed because train_dataset is
# already batched; each validation pass is limited to 30 batches of the test set.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10
     29/Unknown - 26s 909ms/step - loss: 0.6933 - accuracy: 0.5038