In [1]:
import numpy as np
import pandas as pd
import keras
import jieba
import re
import csv
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [26]:
# 我们使用tensorflow的keras接口来建模
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.utils import to_categorical

In [3]:
# 使用gensim加载预训练中文分词embedding
embedding = KeyedVectors.load_word2vec_format('sgns.zhihu.bigram-char', binary=False, unicode_errors="ignore")

In [4]:
#embedding维度300
embedding_dim = embedding['中国'].shape[0]

In [5]:
embedding.similarity('中国','美国')

0.5562877

In [6]:
embedding.vocab['中国'].index

51

In [8]:
embedding.most_similar(positive=['中国'], topn=10)

[('中国人', 0.5616261959075928),
 ('美国', 0.5562876462936401),
 ('我国', 0.5315867066383362),
 ('全中国', 0.5306392908096313),
 ('中国茶', 0.5249154567718506),
 ('中国海', 0.5224688053131104),
 ('中国武协', 0.5200954079627991),
 ('外国', 0.5197731256484985),
 ('中国篮球', 0.511111319065094),
 ('日本', 0.5098267793655396)]

In [24]:
#原始数据准备
train_text_orig = []
train_target = []

csv_orig = csv.reader(open('simplifyweibo_4_moods.csv'))
#'喜悦', 1: '愤怒', 2: '厌恶', 3: '低落
#1 表示正向评论，0 表示负csv_orig向评论
next(csv_orig, None)
for line in csv_orig:
    train_text_orig.append(line[1])
    train_target.append(line[0])

train_target = np.array(train_target).astype('int')
print('%d text examples in trainset' %len(train_text_orig))

361744 text examples in trainset


In [28]:
#convert to onehot
train_target = to_categorical(train_target)

In [10]:
#清晰数据
def clean_text(text):
    text = re.sub("<[^>]+>", "", text)
    text = text.replace("&nbsp;", "")
    text = text.replace("\n", "")
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）-]+", "", text)
    text = re.sub("[^0-9A-Za-z\u4e00-\u9fa5]", "", text)
    text = re.sub( "\\(.*?\\)|\\{.*?}|\\[.*?]", "", text)
    return text

def tokenize_text(text):
    words = [w for w in jieba.cut(text)]
    embedding_vectors = []
    for idx, word in enumerate(words):
        try:
            embedding_vectors.append(embedding.vocab[word].index)
        except KeyError:
            embedding_vectors.append(0)
    return embedding_vectors

In [11]:
#tokenize
train_tokens = []
for text in  train_text_orig:
    pure_text = clean_text(text)
    tokens = tokenize_text(pure_text)
    train_tokens.append(tokens)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.944 seconds.
DEBUG:jieba:Loading model cost 0.944 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [12]:
#平均token数
num_tokens = [len(tokens) for tokens in train_tokens]
np.mean(num_tokens)

33.77766320934141

In [13]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

86

In [14]:
# 取tokens的长度为80时，大约 93%的样本被涵盖
# 我们对长度不足的进行padding，超长的进行修剪
max_tokens = 80
np.sum( np.array(num_tokens) < max_tokens ) / len(num_tokens)

0.9301025034278385

In [15]:
# 用来将tokens转换为文本
def reverse_tokens(tokens):
    text = ''
    for i in tokens:
        if i != 0:
            text = text + embedding.index2word[i]
        else:
            text = text + ' '
    return text

reverse_tokens(train_tokens[10])

'回复 了买房子送瓷砖呗昨晚上经过 看到的一个立柱价格应该 可是看了半天也没看明白诉求点是什么'

In [16]:
len(embedding.index2word)

259753

In [17]:
#取100000/259753个词
num_words = 100000

In [18]:
# 进行padding和truncating， 输入的train_tokens是一个list
# 返回的train_pad是一个numpy array
train_pad = pad_sequences(train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')
# 超出五万个词向量的词用0代替
train_pad[ train_pad>=num_words ] = 0
# 可见padding之后前面的tokens全变成0，文本在最后面
train_pad[20]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,  3236,     1, 29302,
         102,   170, 71942,   372,   223,   939,   660,  3185, 36693,
           1,  6409, 48197, 88738,   477, 14927,  3625,  1122,  1342,
         178,    65,     1,  1765,   648,   223,     1,  3454,  3185,
        2387,   110,    14,     0,    32,    41,     1, 23948,     0,
         102,  2387,  1069,  1866,  1315, 10110,     1,  4403],
      dtype=int32)

In [19]:
# 使用259753个词
num_words = 259753
embedding_dim = 300
# 初始化embedding_matrix，之后在keras上进行应用
embedding_matrix = np.zeros((num_words, embedding_dim))
# embedding_matrix为一个 [num_words，embedding_dim] 的矩阵
for i in range(num_words):
    embedding_matrix[i,:] = embedding[embedding.index2word[i]]
embedding_matrix = embedding_matrix.astype('float32')
np.array(embedding_matrix).shape

(259753, 300)

In [29]:
# 90%的样本用来训练，剩余10%用来测试
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)

In [21]:
model = Sequential()
model.add(Embedding(num_words,
                   embedding_dim,
                   weights=[embedding_matrix],
                   input_length = max_tokens,
                   trainable = False))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))
model.add(Dense(4, activation='softmax'))
# 我们使用adam以0.001的learning rate进行优化
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 300)           77925900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 128)           186880    
_________________________________________________________________
lstm_2 (LSTM)                (None, 16)                9280      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 68        
Total params: 78,122,128
Trainable params: 196,228
Non-trainable params: 77,925,900
_________________________________________________________________


In [30]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [31]:
model.fit(X_train, y_train,
          validation_split=0.1, 
          epochs=20,
          batch_size=256)

Train on 293012 samples, validate on 32557 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f28fc053cd0>

In [32]:
model.evaluate(X_test, y_test)
#95%



[0.476165550299653, 0.8339530229568481]

In [33]:
model.save('senti4.h5')

In [42]:
def predict_sentiment(text):
    pure_text = clean_text(text)
    tokens = tokenize_text(pure_text)
    tokens_pad = pad_sequences([tokens], maxlen=max_tokens,
                           padding='pre', truncating='pre')
    # 预测
    result = model.predict(tokens_pad)
    result_text = ['喜悦','愤怒', '厌恶','低落']
    print(result)
    print(result_text[np.argmax(result)])
    return np.argmax(result)

In [43]:
predict_sentiment("品控不好，还没到一个月就坏了")
predict_sentiment("品控不错，挺好的")
predict_sentiment("太开心了")
predict_sentiment("难受啊")
predict_sentiment("谢天牛逼啊")

[[0.21768999 0.31354317 0.23442587 0.23434097]]
愤怒
[[0.87876415 0.10020134 0.01076278 0.01027178]]
喜悦
[[9.8751849e-01 1.1235228e-02 6.2838796e-04 6.1791926e-04]]
喜悦
[[0.0090515  0.19992171 0.3952723  0.39575452]]
低落
[[0.7592637  0.14819947 0.04622012 0.04631674]]
喜悦


0

In [54]:
y_pred = model.predict(X_test)
y_pred = [np.argmax(arr) for arr in y_pred]

In [70]:
ss = 0
for i in range(len(y_pred)):
    if(y_pred[i]==np.argmax(y_test[i])):
        ss+=1
print(ss)
print(len(y_pred))
print(ss/len(y_pred))

22114
36175
0.6113061506565307


In [62]:
misclassified = np.where( y_pred != y_test )
misclassified

  misclassified = np.where( y_pred != y_test )


(array([0]),)

In [None]:
for idx in misclassified:
    print(reverse_tokens(X_test[idx]))
    print('预测的分类', y_pred[idx])
    print('实际的分类', y_actual[idx])
    print('')

In [None]:
np.where(y_pred==1)

In [None]:
reverse_tokens(X_test[7])

In [None]:
predict_sentiment('感谢大力支持赞家中常备红星二锅头拥有时刻好心情的说哈哈')

In [None]:
predict_sentiment('小米业界良心')

In [1]:
#from keras.models import load_model
from keras.models import load_model
model_loaded = load_model('senti3.h5')
model_loaded.summary()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 300)           77925900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 128)           186880    
_________________________________________________________________
lstm_2 (LSTM)                (None, 16)                9280      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 78,122,077
Trainable params: 196,177
Non-trainable params: 77,925,900
_________________________________________________________________


In [11]:
text = '小米实在是太厉害了'
pure_text = clean_text(text)
tokens = tokenize_text(pure_text)
tokens_pad = pad_sequences([tokens], maxlen=max_tokens,
                       padding='pre', truncating='pre')
# 预测

print(tokens_pad)
result = model_loaded.predict(tokens_pad)
print(result)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.616 seconds.
DEBUG:jieba:Loading model cost 0.616 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


NameError: name 'max_tokens' is not defined