In [1]:
from utils import tokenize, load_corpus
import numpy as np
import os

#### 加载全部不同主题的语料

In [2]:
import pandas as pd
import glob

# Build a mapping from topic name to the corpus loaded from that topic's file.
# The topic name is the file's base name up to (not including) its first ".".
data = {}
for corpus_path in glob.glob("weibo2018/topics/*.txt"):
    file_name = os.path.basename(corpus_path)
    topic_name = file_name.split(".")[0]
    data[topic_name] = load_corpus(corpus_path)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/dy/xjy0y7v97js5x1bghby2fnkm0000gn/T/jieba.cache
Loading model cost 0.667 seconds.
Prefix dict has been built succesfully.


加载停用词

In [3]:
# Read the stop-word list: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped.
with open("stopwords.txt", "r", encoding="utf8") as f:
    stopwords = [line.strip() for line in f]

加载之前训练好的FastText模型


In [4]:
from gensim.models import FastText
# Load the previously trained FastText model from disk.
# NOTE(review): despite the ".txt" suffix the file is read with FastText.load,
# i.e. it must have been written by the matching gensim save() call, not be a
# plain-text vector file -- confirm against the training notebook.
# NOTE(review): "100" in the file name presumably is the vector size; the graph
# below hard-codes 100 as the feature dimension of its input -- confirm.
model = FastText.load("model/model_100.txt")

#### 为保证输入神经网络的向量长度一致, 要对长度不足max_length的句子用零向量补齐, 对长度超过max_length的句子进行截断

In [5]:
# Fixed number of time steps per sample: token sequences are cut to this many
# tokens and shorter ones are zero-padded up to it; it is also the time
# dimension of the network's input placeholder.
max_length = 128

In [6]:
# Per topic: a list of (1, max_length, dim) zero-padded embedding arrays and
# the matching list of true (unpadded) sequence lengths.
data_X, data_length = {}, {}

# Look words up through the KeyedVectors object. Membership tests and indexing
# on the model object itself (`w in model`, `model[w]`) are the deprecated
# access path in gensim; `model.wv` is the supported equivalent.
word_vectors = model.wv

for topic, corpus in data.items():
    _data_X, _data_length = [], []
    # Each corpus item is a (content, label) pair; the label is not needed here.
    # presumably `content` is a list of tokens -- confirm against utils.load_corpus
    for content, _label in corpus:
        # Truncate first, then keep only the words the embedding model knows,
        # so `length` can be smaller than the number of tokens inspected.
        vectors = [word_vectors[w] for w in content[:max_length] if w in word_vectors]
        if not vectors:
            # No usable word at all: the sample is dropped (it would otherwise
            # have sequence length 0).
            continue
        length = len(vectors)
        # Pre-allocate the padded buffer and copy the real vectors in front;
        # the remaining rows stay zero.
        X = np.zeros((1, max_length, vectors[0].shape[-1]), dtype=vectors[0].dtype)
        X[0, :length] = vectors
        _data_X.append(X)
        _data_length.append(length)
    data_X[topic] = _data_X
    data_length[topic] = _data_length

  
  if __name__ == '__main__':


### Attention+LSTM

In [7]:
import tensorflow as tf
from tensorflow.contrib import rnn, seq2seq
batch_size = 100
lr = 1e-3
hidden_size = 100

# Graph inputs. The batch dimension is fixed to batch_size, so every feed must
# contain exactly batch_size rows.
# X: padded word-vector sequences; L: true length of each sequence;
# y: binary labels; dropout: used as the LSTM *output keep probability*
# (1.0 disables dropout).
X = tf.placeholder(shape=(batch_size, max_length, 100), dtype=tf.float32, name="X")
L = tf.placeholder(shape=(batch_size,), dtype=np.int32, name="L")
y = tf.placeholder(shape=(batch_size, 1), dtype=np.float32, name="y")
dropout = tf.placeholder(shape=(), dtype=np.float32, name="dropout")
with tf.variable_scope("lstm", reuse=tf.AUTO_REUSE):
    def lstm_cell(hidden_size, cell_id=0):
        """Return an LSTM cell named 'cell<cell_id>' wrapped with output dropout.

        The keep probability is read from the `dropout` placeholder.
        """
        cell = rnn.LSTMCell(hidden_size, reuse=tf.AUTO_REUSE, name='cell%d' % cell_id)
        cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout)
        return cell
    
    # Trainable vector, repeated for every row of the batch; it is the single
    # input fed to the attention cell below.
    context = tf.get_variable("context", shape=(1, hidden_size))
    context = tf.tile(context, [batch_size, 1])

    # Bidirectional encoder over the input sequences.
    fw_cell = lstm_cell(hidden_size, 0)
    bw_cell = lstm_cell(hidden_size, 1)
    fw_zero = fw_cell.zero_state(batch_size, tf.float32)
    # Take the backward initial state from the backward cell (it was previously
    # derived from fw_cell, which only worked because both cells share a size).
    bw_zero = bw_cell.zero_state(batch_size, tf.float32)
    encoder_output, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                         cell_bw=bw_cell,
                                                         inputs=X,
                                                         sequence_length=L,
                                                         initial_state_fw=fw_zero,
                                                         initial_state_bw=bw_zero,
                                                         dtype=tf.float32)
    # Join forward and backward outputs along the feature axis; this is the
    # memory the attention mechanism attends over.
    attention_context = tf.concat(encoder_output, axis=2)
    attention_mech = seq2seq.BahdanauAttention(hidden_size * 2,
                                                 memory=attention_context,
                                                 memory_sequence_length=L,
                                                 name="AttentionMechanism")
    attention_cell = seq2seq.AttentionWrapper(cell=lstm_cell(hidden_size, 2),
                                                attention_mechanism=attention_mech,
                                                attention_layer_size=hidden_size,
                                                alignment_history=True,
                                                output_attention=True,
                                                name="AttentionCell")
    # Run the attention cell for exactly one step, starting from its zero state.
    attention_zero = attention_cell.zero_state(batch_size, tf.float32)
    attention_output, attention_state = attention_cell.call(context, attention_zero)
    # Attention weights of that step, read by field name instead of tuple
    # position 3. The variable keeps its original spelling because later cells
    # may refer to it.
    aligments = attention_state.alignments
    
    # Classification head: hidden_size -> 50 -> 1 logit.
    # NOTE(review): there is no activation between the two layers, so together
    # they are a single linear map; left as is so that saved checkpoints of this
    # graph keep matching.
    W1 = tf.get_variable("W1", shape=(hidden_size, 50))
    b1 = tf.get_variable("b1", shape=(50,))
    W2 = tf.get_variable("W2", shape=(50, 1))
    b2 = tf.get_variable("b2", shape=(1,))
    fcn1 = tf.nn.xw_plus_b(attention_output, W1, b1)
    # `logists` (sic) is the name later cells use for the output logits.
    logists = tf.nn.xw_plus_b(fcn1, W2, b2)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logists, labels=y))
    op = tf.train.AdamOptimizer(lr).minimize(loss)

  from ._conv import register_converters as _register_converters


Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead


In [8]:
# Create the session with a cap on GPU memory: per_process_gpu_memory_fraction
# is set to 0.6 instead of letting TensorFlow use its default allocation.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
config = tf.ConfigProto(gpu_options=gpu_options)
# This session is the one used below to restore the weights and run inference.
sess = tf.Session(config=config)

#### 加载模型

In [9]:
# Initialise every variable first, then overwrite them with the values stored
# in the most recent checkpoint found in model/attention.
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
checkPoint = tf.train.get_checkpoint_state("model/attention")
# get_checkpoint_state returns None when the directory holds no checkpoint
# state; fail with an explicit message instead of an AttributeError on None.
if checkPoint is None or not checkPoint.model_checkpoint_path:
    raise FileNotFoundError("No checkpoint found in 'model/attention'; train the model first.")
saver.restore(sess, checkPoint.model_checkpoint_path)

INFO:tensorflow:Restoring parameters from model/attention/model-1000


#### 对不同主题的文本进行情感分类

In [17]:
# Build the probability op once. Calling tf.nn.sigmoid inside the loop would
# add a new node to the graph on every iteration.
probability = tf.nn.sigmoid(logists)

# topic -> list of predicted labels (1 = positive, 0 = negative), one per sample.
sentiment = {}
for topic in data_X.keys():
    samples, lengths = data_X[topic], data_length[topic]
    prediction = []
    # The placeholders have a fixed batch dimension, so feed the samples in
    # chunks of batch_size. This also handles topics with more than batch_size
    # samples and topics with none (the loop body is then simply skipped).
    for start in range(0, len(samples), batch_size):
        chunk_X = samples[start:start + batch_size]
        chunk_L = lengths[start:start + batch_size]
        # Fill an incomplete chunk with all-zero samples of length 1; their
        # outputs are discarded below.
        n_pad = batch_size - len(chunk_X)
        _X = np.concatenate(chunk_X + [np.zeros_like(chunk_X[0])] * n_pad)
        _L = np.array(chunk_L + [1] * n_pad)
        # dropout is a keep probability: 1.0 switches dropout off for inference.
        result = sess.run(probability, feed_dict={X: _X, L: _L, dropout: 1.})
        # result has one column; keep only the rows of real samples and
        # threshold the probability at 0.5.
        prediction.extend(1 if p > 0.5 else 0 for p in result[:len(chunk_X), 0])
    sentiment[topic] = prediction

In [18]:
# Print, for every topic, how many posts were predicted positive (label 1)
# and how many negative (label 0).
for topic, res in sentiment.items():
    positive, negative = res.count(1), res.count(0)
    print("主题为【%s】的微博中, 正面:%d, 负面:%d" % (topic, positive, negative))

主题为【同济大学】的微博中, 正面:88, 负面:12
主题为【周杰伦】的微博中, 正面:88, 负面:12
主题为【好莱坞】的微博中, 正面:79, 负面:21
主题为【人工智能】的微博中, 正面:79, 负面:21
主题为【特朗普】的微博中, 正面:55, 负面:44
主题为【毕业】的微博中, 正面:78, 负面:22
