# 情感分析
- 利用循环神经网络搭建情感分析模型。循环神经网络RNN使用了文本的序列信息，准确率比前向网络要高。本次搭建的RNN模型如下图：
<img src="images/network_diagram.png" width=400px>
- 将单词传入 embedding层，之所以使用嵌入层，是因为单词数量太多，使用嵌入式方式词向量来表示单词更有效率。在这里我们使用word2vec方式来实现.
- 通过embedding 层, 新的单词表示传入 LSTM cells。这将是一个递归链接网络，所以单词的序列信息会在网络之间传递。
- LSTM cells连接一个sigmoid output layer 。 使用sigmoid可以预测该文本是"positive" 还是"negative"情感。 

## IMDB数据集介绍
IMDB数据集有50000条数据集，训练集有25000条，测试集有25000条。数据label分为正负类，每类各占50%。这里采用keras的数据集中的imdb，每条数据已经做好了编码。label为0表示负类，label为1表示正类。

- 加载tensorflow

In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import time

- 加载数据集

In [2]:
word_nums = 8000
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=word_nums)
print("train data shape: ", x_train.shape)
print("test data shape: ", x_test.shape)

train data shape:  (25000,)
test data shape:  (25000,)


In [3]:
print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [None]:
# # 重新构建训练集与测试集
x_train_new = []
for i in range(len(x_train)):
  x_train_new.append(x_train[i])
for i in range(15000):
  x_train_new.append(x_test[i])
print(len(x_train_new))
x_test_new = []
for i in range(15000, 25000):
  x_test_new.append(x_test[i])
print(len(x_test_new))
y_train_new = []
for i in range(len(y_train)):
  y_train_new.append(y_train[i])
for i in range(15000):
  y_train_new.append(y_test[i])
print(len(y_train_new))
y_test_new = []
for i in range(15000, 25000):
  y_test_new.append(y_test[i])
print(y_test_new[:10])

In [None]:
# #数据集打乱顺序
index = [i for i in range(len(x_train_new))]
random.shuffle(index)
x_train_new = np.array(x_train_new)[index]
y_train_new = np.array(y_train_new)[index]

index = [i for i in range(len(x_test_new))]
random.shuffle(index)
x_test_new = np.array(x_test_new)[index]
y_test_new = np.array(y_test_new)[index]

- 序列长度预处理

In [4]:
max_length = len(max((x_train_new), key=len))
min_length = len(min((x_train_new), key=len))
print("max length: ", max_length)
print("min length: ", min_length)

max length:  2494
min length:  11


- 高阶API介绍<br />
tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre',
    value=0.0)
    - sequences: list of list
    - maxlen: 最大长度，整形，返回最大长度的list
    - padding：填充方式，默认是pre，也可以选择post
    - truncating：截断方式，默认pre
    - value: 默认填充的值为0
    - 返回numpy

In [5]:
max_len = 200
x_train_new = tf.keras.preprocessing.sequence.pad_sequences(x_train_new, maxlen=max_len, padding="pre")
x_test_new = tf.keras.preprocessing.sequence.pad_sequences(x_test_new, maxlen=max_len, padding="pre")
print("x_train shape", x_train.shape)
print("x_test shape: ", x_test.shape)

x_train shape (25000, 300)
x_test shape:  (25000, 300)


- 搭建RNN模型
   - lstm 网络
   <img src="images/lstmcell.jpg" width=600px>
   - lstm的参数量计算
       - lstm输出的大小为m，输入的大小为n，则参数量为4x((m+n)x m + m)

In [7]:
class RNNModel(object):
    def __init__(self, word_nums, embedding_size, max_len, hidden_units):
        self.word_nums = word_nums
        self.embedding_size = embedding_size
        self.max_len = max_len
        self.hidden_units = hidden_units
    
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.LSTM(self.hidden_units, return_sequences=True))
        model.add(tf.keras.layers.Dropout(0.3))
        model.add(tf.keras.layers.LSTM(self.hidden_units, return_sequences=False))
        model.add(tf.keras.layers.Dropout(0.5))
        # model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(2, activation="sigmoid", kernel_initializer='random_uniform')))
        # model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(2, activation="sigmoid"))
        return model

- 训练模型

In [None]:
import math
epochs = 50
batch_size = 128
embedding_size = 50
hidden_units = 16
if not os.path.exists("./models/rnn/"):
    os.mkdir("./models/rnn/")
if not os.path.exists("./logs/"):
    os.mkdir("./logs/")
log_dir = "./logs/imdbrnn_event-{}".format(int(time.time()))
## 加载rnn模型
model = RNNModel(word_nums, embedding_size, max_len, hidden_units).build_model()
## 加学习率衰减
def lr_decay(epoch):
    initial_lr = 0.01
    drop = 0.5
    epochs_drop = 20
    lr = initial_lr * math.pow(drop, math.floor((1 + epoch)/epochs_drop))
    return lr
#model = model.build_model()
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(),
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=["accuracy"])

my_callbacks = [tf.keras.callbacks.ModelCheckpoint("./model/imdb_rnn_lstm_new.h5"),
         tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)]
         #tf.keras.callbacks.LearningRateScheduler(lr_decay)]

history = model.fit(x_train_new, y_train_new, batch_size=batch_size, epochs=epochs, 
                    callbacks=my_callbacks, validation_split=0.2)

- 可视化

In [None]:
fig1 = plt.figure()
plt.plot(history.history["accuracy"], "b", linewidth=3.0)
plt.plot(history.history["val_accuracy"], "r", linewidth=3.0)
plt.legend(["acc", "val_acc"], fontsize=18)
plt.xlabel("epochs", fontsize=16)
plt.ylabel("accuracy", fontsize=16)
plt.title("accuaacy curve")
plt.savefig("./image/rnn_lstm_accuracy.png")
plt.show()

In [None]:
fig2 = plt.figure()
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.legend(["loss", "val_loss"], fontsize=18)
plt.xlabel("epoch", fontsize=16)
plt.ylabel("loss", fontsize=16)
plt.title("loss curve")
plt.savefig("./image/rnn_lstm_loss.png")
plt.show()

- 评价模型

In [None]:
scores = model.evaluate(x_test_new, y_test_new, batch_size, verbose=1)
print("score: ", scores)

- 测试模型

In [None]:
result = model.predict(x_test_new[：3, :])
result = result.tolist()
final_result = []
for i in result:
    if i >= 0.5:
        final_result.append("positive")
    else:
        final_result.append("negative")
print("predict result: ", final_result)

### 搭建RNN模型-- GRU

- gru模型
<img src="images/grucell.png" width=400px>
- gru参数量计算
- tf.keras.layers.GRU(
    units, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
    kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
    bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None,
    bias_regularizer=None, activity_regularizer=None, kernel_constraint=None,
    recurrent_constraint=None, bias_constraint=None, dropout=0.0,
    recurrent_dropout=0.0, implementation=2, return_sequences=False,
    return_state=False, go_backwards=False, stateful=False, unroll=False,
    time_major=False, reset_after=True, **kwargs
)

In [None]:
class RNNModel(object):
    def __init__(self, word_nums, embedding_size, max_len, hidden_units):
        self.word_nums = word_nums
        self.embedding_size = embedding_size
        self.max_len = max_len
        self.hidden_units = hidden_units
    
    def build_model(self):
      model = tf.keras.Sequential()
      model.add(tf.keras.layers.Embedding(input_dim=self.word_nums, output_dim=self.embedding_size, input_length=self.max_len))
      model.add(tf.keras.layers.GRU(self.hidden_units, return_sequences=True))
      model.add(tf.keras.layers.Dropout(0.3))
      model.add(tf.keras.layers.GRU(self.hidden_units, return_sequences=False))
      model.add(tf.keras.layers.Dropout(0.5))
      # model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(2, activation="sigmoid", kernel_initializer='random_uniform')))
      # model.add(tf.keras.layers.Flatten())
      model.add(tf.keras.layers.Dense(2, activation="sigmoid"))
      return model

- 训练模型

In [None]:
epochs = 50
batch_size = 256
embedding_size = 30
hidden_units = 16

if not os.path.exists("./models/rnn/"):
    os.mkdir("./models/rnn/")
if not os.path.exists("./logs_gru/"):
    os.mkdir("./logs_gru/")
log_dir = "./logs_gru/imdbrnn_event-{}".format(int(time.time()))
## 加载rnn模型
model = RNNModel(word_nums, embedding_size, max_len, hidden_units).build_model()
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=["accuracy"])

my_callbacks = [tf.keras.callbacks.ModelCheckpoint("./models/rnn/imdb_rnn_gru.h5"),
               tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)]

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=my_callbacks,
                    validation_split=0.2)

In [None]:
fig1 = plt.figure()
plt.plot(history.history["accuracy"], "b", linewidth=3.0)
plt.plot(history.history["val_accuracy"], "r", linewidth=3.0)
plt.legend(["acc", "val_acc"], fontsize=18)
plt.xlabel("epochs", fontsize=16)
plt.ylabel("accuracy", fontsize=16)
plt.title("accuaacy curve")
plt.savefig("./image/rnn_gru_accuracy.png")
plt.show()

In [None]:
fig2 = plt.figure()
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.legend(["loss", "val_loss"], fontsize=18)
plt.xlabel("epoch", fontsize=16)
plt.ylabel("loss", fontsize=16)
plt.title("loss curve")
plt.savefig("./image/rnn_gru_loss.png")
plt.show()

In [None]:
scores = model.evaluate(x_test_new, y_test_new, batch_size, verbose=1)
print("score: ", scores)

### 搭建RNN模型-- CuDNNLSTM

In [None]:
class RNNModel(object):
    def __init__(self, word_nums, embedding_size, max_len, hidden_units):
        self.word_nums = word_nums
        self.embedding_size = embedding_size
        self.max_len = max_len
        self.hidden_units = hidden_units
    
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Embedding(input_dim=self.word_nums, output_dim=self.embedding_size,
                                           input_length=self.max_len))
        model.add(tf.compat.v1.keras.layers.CuDNNLSTM(self.hidden_units, return_sequences=True))
        model.add(tf.keras.layers.Dropout(0.4))
        model.add(tf.compat.v1.keras.layers.CuDNNLSTM(self.hidden_units, return_sequences=False))
        model.add(tf.keras.layers.Dropout(0.4))
        model.add(tf.keras.layers.Dense(2, activation="sigmoid"))
        return model

In [None]:
epochs = 10
batch_size = 64
embedding_size = 200
hidden_units = 256

if not os.path.exists("./models/rnn/"):
    os.mkdir("./models/rnn/")
if not os.path.exists("./logs_culstm/"):
    os.mkdir("./logs_culstm/")
log_dir = "./logs_culstm/imdbrnn_event-{}".format(int(time.time()))
## 加载rnn模型
model = RNNModel(word_nums, embedding_size, max_len, hidden_units).build_model()
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
             metrics=["accuracy"])

my_callbacks = [tf.keras.callbacks.ModelCheckpoint("./models/rnn/imdb_rnn_culstm.h5"),
               tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)]

history = model.fit(x_train_new, y_train_new, batch_size=batch_size, epochs=epochs, callbacks=my_callbacks,
                    validation_data=(x_test_new, y_test_new))

In [None]:
fig1 = plt.figure()
plt.plot(history.history["accuracy"], "b", linewidth=3.0)
plt.plot(history.history["val_accuracy"], "r", linewidth=3.0)
plt.legend(["acc", "val_acc"], fontsize=18)
plt.xlabel("epochs", fontsize=16)
plt.ylabel("accuracy", fontsize=16)
plt.title("accuaacy curve")
plt.savefig("./image/rnn_culstm_accuracy.png")
plt.show()

In [None]:
fig2 = plt.figure()
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.legend(["loss", "val_loss"], fontsize=18)
plt.xlabel("epoch", fontsize=16)
plt.ylabel("loss", fontsize=16)
plt.title("loss curve")
plt.savefig("./image/rnn_culstm_loss.png")
plt.show()

In [None]:
scores = model.evaluate(x_test, y_test, batch_size, verbose=1)
print("score: ", scores)