In [1]:
from utils import tokenize, load_corpus
import numpy as np

#### 加载数据

In [2]:
import pandas as pd

# Load the train and test splits with the project helper `load_corpus`.
# Each split is later iterated as (content, sentiment) pairs, where `content`
# is a list of tokens (see the DataFrame preview in the next cell).
train_data, test_data = (
    load_corpus(path) for path in ("weibo2018/train.txt", "weibo2018/test.txt")
)

# Tabular views of the same records, used for inspection only.
train_df, test_df = (
    pd.DataFrame(rows, columns=["content", "sentiment"])
    for rows in (train_data, test_data)
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.512 seconds.
Prefix dict has been built succesfully.


In [3]:
# Preview the first rows of the training frame (token lists and sentiment labels).
train_df.head()

Unnamed: 0,content,sentiment
0,"[书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...",1
1,"[这是, 英超, 被, 黑, 的, 最惨, 的, 一次, [二哈], [二哈], 十几年来,...",0
2,"[中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...",1
3,"[看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...",1
4,"[汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...",1


加载停用词

In [4]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open("stopwords.txt", "r", encoding="utf8") as f:
    stopwords = [line.strip() for line in f]

TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The corpus is already tokenized, so each document is re-joined with spaces
# and the vectorizer splits it again using `token_pattern`.
# NOTE(review): the vectorizer is fitted on the training AND test documents
# together, so the IDF statistics include the test set.
data_str = [" ".join(content) for content, _ in train_data] + \
           [" ".join(content) for content, _ in test_data]

# The pattern keeps an optional surrounding "[" / "]" so bracketed emoticon
# tokens such as "[二哈]" stay whole. It is written as a raw string: the
# original non-raw literal relied on invalid escape sequences (`\[`, `\w`),
# which trigger a DeprecationWarning/SyntaxWarning on modern Python. The regex
# value itself is unchanged.
tfidf = TfidfVectorizer(token_pattern=r'\[?\w+\]?', stop_words=stopwords)
tfidf_fit = tfidf.fit_transform(data_str)

加载之前训练好的FastText模型

In [6]:
from gensim.models import FastText
# Load the previously trained FastText model from disk.
# NOTE(review): presumably 100-dimensional vectors (the network input below is
# declared with 100 features) — confirm against the training script.
model = FastText.load("model/model_100.txt")

每个句子最多只保留 TF-IDF 权重最高的前 `key_words` 个词

In [7]:
# Maximum number of highest-TF-IDF vocabulary entries per sentence that may
# contribute to that sentence's feature vector.
key_words = 20

#### 用每个词的Tfidf作为权重, 对FastText词向量进行加权, 得到表征每个句子的向量

In [8]:
# Build one feature row per training sentence: the mean of the FastText vectors
# of its top-`key_words` TF-IDF terms, each vector scaled by the term's TF-IDF
# weight. Sentences in which no token qualifies are skipped, so X_train/y_train
# may be shorter than train_data.
X_train, y_train = [], []
for content, sentiment in train_data:
    # Dense TF-IDF row for this sentence, shape (1, vocabulary size).
    doc_tfidf = tfidf.transform([" ".join(content)]).toarray()
    # Vocabulary columns holding the `key_words` largest weights.
    top_columns = set(np.argsort(-doc_tfidf)[0, :key_words].tolist())

    # Local names deliberately avoid `X` / `y`, which the network cell binds to
    # TensorFlow placeholders; reusing them here would clobber the placeholders
    # if this cell is re-run after the graph has been built.
    weighted_vectors = []
    for word in content:
        column = tfidf.vocabulary_.get(word)
        # Look words up through `model.wv`: `word in model` / `model[word]` are
        # deprecated delegates to the same KeyedVectors and were removed in
        # gensim 4.
        if column is not None and column in top_columns and word in model.wv:
            weighted_vectors.append(model.wv[word] * doc_tfidf[0, column])

    if weighted_vectors:
        # keepdims keeps the (1, dim) row shape that downstream cells concatenate.
        X_train.append(np.mean(weighted_vectors, axis=0, keepdims=True))
        y_train.append(sentiment)

  import sys
  


In [9]:
# Build one feature row per test sentence, exactly as for the training split:
# the mean of the FastText vectors of its top-`key_words` TF-IDF terms, each
# scaled by the term's TF-IDF weight. Sentences in which no token qualifies are
# skipped, so the evaluation below covers only the sentences kept here.
X_test, y_test = [], []
for content, sentiment in test_data:
    # Dense TF-IDF row for this sentence, shape (1, vocabulary size).
    doc_tfidf = tfidf.transform([" ".join(content)]).toarray()
    # Vocabulary columns holding the `key_words` largest weights.
    top_columns = set(np.argsort(-doc_tfidf)[0, :key_words].tolist())

    # Local names deliberately avoid `X` / `y`, which the network cell binds to
    # TensorFlow placeholders; reusing them here would clobber the placeholders
    # if this cell is re-run after the graph has been built.
    weighted_vectors = []
    for word in content:
        column = tfidf.vocabulary_.get(word)
        # Look words up through `model.wv`: `word in model` / `model[word]` are
        # deprecated delegates to the same KeyedVectors and were removed in
        # gensim 4.
        if column is not None and column in top_columns and word in model.wv:
            weighted_vectors.append(model.wv[word] * doc_tfidf[0, column])

    if weighted_vectors:
        # keepdims keeps the (1, dim) row shape that downstream cells concatenate.
        X_test.append(np.mean(weighted_vectors, axis=0, keepdims=True))
        y_test.append(sentiment)

  import sys
  


### 神经网络

In [10]:
import tensorflow as tf

batch_size = 1000  # rows per training step
lr = 1e-3          # Adam learning rate

# Inputs: sentence vectors with 100 features and one 0/1 label per row.
# The batch dimension is left as None so the same graph accepts any number of
# rows (full training batches, or a test set of arbitrary size); feeds of
# exactly `batch_size` rows keep working unchanged.
X = tf.placeholder(shape=(None, 100), dtype=tf.float32, name="X")
y = tf.placeholder(shape=(None, 1), dtype=tf.float32, name="y")

with tf.variable_scope("fcn", reuse=tf.AUTO_REUSE):
    W1 = tf.get_variable("W1", shape=(100, 50))
    b1 = tf.get_variable("b1", shape=(50,))
    W2 = tf.get_variable("W2", shape=(50, 1))
    b2 = tf.get_variable("b2", shape=(1,))
    # NOTE(review): there is no activation between the two affine layers, so
    # the network is equivalent to a single linear layer (logistic regression).
    # Left as-is to keep the reported results reproducible.
    fcn1 = tf.nn.xw_plus_b(X, W1, b1)
    # The name `logists` (sic) is kept because later cells reference it.
    logists = tf.nn.xw_plus_b(fcn1, W2, b2)
    # Binary cross-entropy on the raw logits, averaged over the batch.
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logists, labels=y))
    op = tf.train.AdamOptimizer(lr).minimize(loss)

  from ._conv import register_converters as _register_converters


In [11]:
# Create the TensorFlow session used for training and inference, limiting this
# process to a 0.6 fraction of GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=config)

In [12]:
import os

total_step = 1001
checkpoint_prefix = 'model/nn/model'
# Saver.save does not create missing parent directories.
os.makedirs(os.path.dirname(checkpoint_prefix), exist_ok=True)

if not X_train:
    raise ValueError("X_train is empty: no training sentence produced a feature vector")

# Stack the per-sentence (1, dim) rows and the labels once, instead of
# re-concatenating Python lists on every step.
train_features = np.concatenate(X_train)
train_labels = np.asarray(y_train, dtype=np.float32).reshape(-1, 1)
n_train = len(train_features)

cursor = 0
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=1)
for step in range(total_step):
    # Take `batch_size` consecutive samples, wrapping around the end of the
    # training set. Modular indexing yields the same sample order as slicing
    # plus wrap-around, and still produces a full batch when the training set
    # is smaller than `batch_size`.
    batch_rows = np.arange(cursor, cursor + batch_size) % n_train
    cursor = (cursor + batch_size) % n_train
    _X, _y = train_features[batch_rows], train_labels[batch_rows]

    _, batch_loss = sess.run([op, loss], feed_dict={X: _X, y: _y})
    if step % 100 == 0:
        # Report the loss and checkpoint every 100 steps (only the latest is kept).
        print("step:", step, " loss:", batch_loss)
        saver.save(sess, checkpoint_prefix, global_step=step)

step: 0  loss: 0.9794157
step: 100  loss: 0.50017077
step: 200  loss: 0.48402408
step: 300  loss: 0.49401006
step: 400  loss: 0.48483914
step: 500  loss: 0.47343573
step: 600  loss: 0.47297615
step: 700  loss: 0.46698982
step: 800  loss: 0.46584013
step: 900  loss: 0.4559707
step: 1000  loss: 0.4664424


In [13]:
# Pad the test rows with all-zero rows up to `batch_size` rows so they can be
# fed to the network in one call; the padding rows are dropped again when the
# predictions are read.
padding_count = batch_size - len(X_test)
zero_row = np.zeros_like(X_test[0])
_X = np.concatenate(X_test + [zero_row] * padding_count)

In [14]:
# Run the network on the test batch and turn the logits into probabilities.
result = sess.run(tf.nn.sigmoid(logists), feed_dict={X: _X})

# Keep only the rows that belong to real test sentences, then threshold:
# probability strictly above 0.5 -> label 1, otherwise label 0.
test_probabilities = result[:len(X_test)].ravel()
prediction = [1 if probability > 0.5 else 0 for probability in test_probabilities]

In [15]:
from sklearn import metrics

# Per-class precision/recall/F1 followed by overall accuracy on the test rows
# that received a feature vector.
report = metrics.classification_report(y_test, prediction)
accuracy = metrics.accuracy_score(y_test, prediction)
print(report)
print("准确率:", accuracy)

             precision    recall  f1-score   support

          0       0.68      0.74      0.71       155
          1       0.88      0.84      0.86       344

avg / total       0.82      0.81      0.81       499

准确率: 0.8096192384769539
