# 使用长短时记忆模型（LSTM）做情感分析

In [23]:
# 引入Python包，__future__ 包是为了扩展Python当前版本对代码的兼容性
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [24]:
# 引入所需要的包
import numpy as np
import tensorflow as tf
import pickle
import jieba
from random import shuffle

## 第一步 数据预处理

In [25]:
# 预处理停用词
def StopWords():
    with open('train_data/stopwords.txt','r',encoding = 'utf-8') as f:
        lines = f.readlines()
    stopWords = []
    for line in lines:
        stopWords.append(line.strip())
    return stopWords

In [26]:
# 读取数据，并去除停用词
def read_data_file(filepath):
    wordList = []
    sentence = []
    with open(filepath,'r',encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        words = list(jieba.cut(line.strip(), cut_all=False))
        for word in words:
            # 如果词汇不属于停用词，则把词放到wordList中
            if word not in stopWords:
                wordList.append(word)
        sentence.append(wordList)
        wordList = []
    return sentence

In [27]:
words = read_data_file('./train_data/pos.txt')
print(words)

[['学习', '使', '工作', '轻松', '特多', '原来', '看', '数据', '头疼', '现在', '轻松', '搞定', '全是', '朱', '老师', '功劳'], ['很', '棒棒'], ['老师', '好', '厉害', '不', '懂', '第一', '时间', '帮', '解决', '课程内容', '很', '详细', '不会', '函数', '童鞋', '赶紧', '学', '起来'], ['朱', '老师', '讲课', '太', '满意'], ['很', '好', '很', '实用', '课程'], ['课程', '非常', '棒', '老师', '讲', '非常', '仔细', '逻辑', '非常', '强', '建议', '大家', '听听', '老师', '课', '!', '!', '!'], ['细微', '精致', '讲课', '风格', '如浴春风', '过程', '点子', '上', '案例', '真级', '棒'], ['讲得', '很', '好', '偶尔', '笑场', '觉得', '语法', '很', '轻松'], ['好', '很', '好', '非常', '好', 'yes'], ['讲', '很棒'], ['很', '感谢', '丁', '老师', '讲解', '讲得', '非常', '棒'], ['丁', '老师', '课讲', '清晰', '易懂', '谢谢您', '录播', '课', '方法', '好', '不', '懂', '知识点', '重新', '学', '一遍', '谢谢'], ['老师', '讲得', '挺', '清楚'], ['很棒', '讲', '很', '好'], ['老师', '很', '逗', '课堂', '很', '有趣', '好多', '知识点', '~'], ['老师', '逗', '鉴定', '完毕', '√'], ['很赞'], ['求', '讲义'], ['陈老师', '讲', '很', '好', '听', '完', '之后', '收货', '很大', '后悔', '之前', '没有', '听', '课', '老师', '挺', '幽默'], ['陈正康', '老师', '讲', '非常', '好'], ['老师', '讲', '好', '最', '喜欢',

In [9]:
# 将数据转换成词向量，返回词向量矩阵
def words2Array(lineList):
    linesArray=[]
    wordsArray=[]
    steps = []
    for line in lineList:
        t = 0
        p = 0
        for i in range(MAX_SIZE):
            # 如果小于MAX_SIZE 则用0.0的矩阵补齐
            if i<len(line):
                try:
                    wordsArray.append(word2vec_model.wv.word_vec(line[i]))
                    p = p + 1
                except KeyError:
                    # 如果在词向量模型中没有找到对应词的向量
                    t = t + 1
                    continue
            else:
               wordsArray.append(np.array([0.0]*vector_size))

        for i in range(t):
            wordsArray.append(np.array([0.0]*vector_size))
        steps.append(p)
        linesArray.append(wordsArray)
        wordsArray = []
    linesArray = np.array(linesArray)
    steps = np.array(steps)
    return linesArray, steps

In [10]:
# 打乱数据集
def convert2Data(posArray, negArray, posStep, negStep):
    randIt = []
    data = []
    steps = []
    labels = []
    for i in range(len(posArray)):
        # 制作数据集 积极的标签为one-hot表示方法 [1,0]
        randIt.append([posArray[i], posStep[i], [1,0]])
    for i in range(len(negArray)):
        # 制作数据集 消极的标签为one-hot表示方法 [0,1]
        randIt.append([negArray[i], negStep[i], [0,1]])
    shuffle(randIt)
    for i in range(len(randIt)):
        data.append(randIt[i][0])
        steps.append(randIt[i][1])
        labels.append(randIt[i][2])
    data = np.array(data)
    steps = np.array(steps)
    return data, steps, labels

## 第二步，构建训练数据

In [11]:
# 构建训练数据集
def construct_train_data(posPath,negPath):
    #获取词汇，返回类型为[[word1,word2...],[word1,word2...],...]
    pos = read_data_file(posPath)
    print("The positive data's length is :",len(pos))
    neg = read_data_file(negPath)
    print("The negative data's length is :",len(neg))
    #将评价数据转换为矩阵，返回类型为array
    posArray, posSteps = words2Array(pos)
    negArray, negSteps = words2Array(neg)
    #将积极数据和消极数据混合在一起打乱，制作数据集
    Data, Steps, Labels = convert2Data(posArray, negArray, posSteps, negSteps)
    return Data, Steps, Labels

In [12]:
# 获取停用词
stopWords = StopWords()
word2vec_path = 'word2vec/word2vec.pkl' # 词向量地址
file = open(word2vec_path, 'rb')
word2vec_model = pickle.load(file )
vector_size=word2vec_model.vector_size # 词向量的维度
MAX_SIZE=25 # 句子的统一长度

In [13]:
# 打印数据,看一下数据的条数
print("In train data:")
trainData, trainSteps, trainLabels = construct_train_data('train_data/pos.txt',
                                              'train_data/neg.txt')
# print(trainData)
print("In test data:")
testData, testSteps, testLabels = construct_train_data('train_data/test_pos.txt',
                                           'train_data/test_neg.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/mj/5d5swnz140s3tt6nmq0x0pnh0000gn/T/jieba.cache


In train data:


Loading model cost 0.912 seconds.
Prefix dict has been built succesfully.


The positive data's length is : 638
The negative data's length is : 294
In test data:
The positive data's length is : 102
The negative data's length is : 17


In [14]:
# 将训练标签变成numpy数组
trainLabels = np.array(trainLabels)

In [15]:
del word2vec_model #删除词向量模型

### 查看各个数据的形状

In [16]:
print("-"*10+'打印数据结果'+"-"*10)
print("训练数据的形状:",trainData.shape)
print("测试数据的形状:",testData.shape)
print("训练数据步数的形状:",trainSteps.shape)
print("测试数据步数的形状:",testSteps.shape)
print("训练标签的形状:",trainLabels.shape)
print("测试标签的形状:",np.array(testLabels).shape)

----------打印数据结果----------
训练数据的形状: (932, 25, 100)
测试数据的形状: (119, 25, 100)
训练数据步数的形状: (932,)
测试数据步数的形状: (119,)
训练标签的形状: (932, 2)
测试标签的形状: (119, 2)


## 第三步 构造神经网络计算图

In [17]:
# 训练参数
num_nodes = 128  # 隐藏神经元的数量
batch_size = 16 # 一次喂给训练器的数据条数
output_size = 2 # 输出的维度

### 构造计算图

In [18]:
# 构造计算图
graph = tf.Graph()
with graph.as_default():
    tf_train_dataset = tf.placeholder(tf.float32, shape=(None, MAX_SIZE, vector_size),name='x')
    tf_train_steps = tf.placeholder(tf.int32, shape=(None),name='x_step')
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, output_size))

    # tf_test_dataset = tf.constant(testData, tf.float32,name='input_x')
    # tf_test_steps = tf.constant(testSteps, tf.int32,name='steps')
    tf_test_dataset = tf.placeholder(tf.float32, shape=(None, MAX_SIZE, vector_size), name='input_x')
    tf_test_steps = tf.placeholder(tf.int32, shape=(None), name='steps')

    lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_nodes,
                                             state_is_tuple=True)

    w1 = tf.Variable(tf.truncated_normal([num_nodes, num_nodes // 2], stddev=0.1))
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))

    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, 2], stddev=0.1))
    b2 = tf.Variable(tf.truncated_normal([2], stddev=0.1))


    def model(dataset, steps):
        outputs, last_states = tf.nn.dynamic_rnn(cell=lstm_cell,
                                                 dtype=tf.float32,
                                                 sequence_length=steps,
                                                 inputs=dataset)
        hidden = last_states[-1]

        hidden = tf.matmul(hidden, w1) + b1
        logits = tf.matmul(hidden, w2) + b2
        return logits


    train_logits = model(tf_train_dataset, tf_train_steps)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                                logits=train_logits))
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))

    tf.add_to_collection('pred_network', test_prediction) #用于加载模型获取要预测的网络结构

    #保存模型
    saver = tf.train.Saver()
    

    


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [22]:
num_steps = 1000 # 迭代次数
summary_frequency = 200

with tf.Session(graph = graph) as session:
    init_op = tf.global_variables_initializer()
    print('Initialized')
    session.run(init_op)

    mean_loss = 0
    for step in range(num_steps):
        offset = (step * batch_size) % (len(trainLabels)-batch_size)
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        if step >0 and step % summary_frequency == 0:
            mean_loss = mean_loss / summary_frequency
            print("The step is: %d"%(step))
            print("In train data,the loss is:%.4f"%(mean_loss))
            mean_loss = 0
            acrc = 0
            prediction = session.run(test_prediction,feed_dict={tf_test_dataset:testData,tf_test_steps:testSteps})
            for i in range(len(prediction)):
                if prediction[i][testLabels[i].index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data,the accuracy is:%.2f%%"%((acrc/len(testLabels))*100))
    tf.summary.FileWriter('./logs/New_Summary',session.graph)
    saver.save(session, "model/model-senti")
    print('保存模型成功')

Initialized
The step is: 200
In train data,the loss is:0.5213
In test data,the accuracy is:92.44%
The step is: 400
In train data,the loss is:0.4075
In test data,the accuracy is:92.44%
The step is: 600
In train data,the loss is:0.3670
In test data,the accuracy is:92.44%
The step is: 800
In train data,the loss is:0.3417
In test data,the accuracy is:93.28%
保存模型成功
