# 一、导入数据

In [3]:
# -- coding: utf-8 --
import warnings
from functools import lru_cache

import jieba
import joblib
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# Silence all Python warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Load the two labelled corpora. The spreadsheets carry no header row, so the
# single text column ends up named 0.
pos = pd.read_excel('data/pos.xls', header=None)
neg = pd.read_excel('data/neg.xls', header=None)

# Preview the first rows of the positive corpus.
pos.head()



Unnamed: 0,0
0,做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...
1,作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2,作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3,作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4,作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...


## 分词处理

In [4]:
def word_cut(x):
    """Tokenize one review with jieba; the input is coerced to ``str`` first."""
    return jieba.lcut(str(x))


# Tokenize column 0 (the raw review text) of each corpus into a new 'words'
# column, positive corpus first.
for corpus in (pos, neg):
    corpus['words'] = corpus[0].apply(word_cut)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_w/hfqvgqcd4rj79tq5flh6k91h0000gn/T/jieba.cache
Loading model cost 0.257 seconds.
Prefix dict has been built successfully.


In [5]:
# Feature side: the tokenized reviews, positive corpus first, then negative.
x = np.concatenate((pos['words'], neg['words']))

# Label side, in the same order: 1 marks positive sentiment, 0 marks negative.
pos_labels = np.ones(len(pos))
neg_labels = np.zeros(len(neg))
y = np.concatenate((pos_labels, neg_labels))

# 二、Word2vec处理

In [6]:
# Train a Word2Vec (shallow neural network) embedding model on the tokenized reviews.
w2v_config = {
    'vector_size': 300,  # dimensionality of each word vector (library default is 100)
    'min_count': 10,     # vocabulary cut-off: words seen fewer times are dropped (default is 5)
}
w2v = Word2Vec(**w2v_config)

# Build the vocabulary first, then train on the same corpus for 20 epochs.
w2v.build_vocab(x)
w2v.train(x, total_examples=w2v.corpus_count, epochs=20)

def average_vec(text):
    """Combine the Word2Vec vectors of the tokens in ``text`` into one row vector.

    NOTE: despite the name, this returns the element-wise *sum* of the word
    vectors, not their mean. The SVM trained later in this notebook is fitted
    on these sums, so the behaviour is kept as-is; switching to a true mean
    requires retraining the classifier and updating the prediction-time copy
    of this function as well.

    Parameters
    ----------
    text : iterable
        Tokens of one review. Tokens missing from the model vocabulary
        (``KeyError`` on lookup) are skipped.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)`` where ``dim`` is the model's vector size;
        all zeros when no token is in the vocabulary.
    """
    # Take the dimensionality from the trained model instead of hard-coding
    # 300, so changing ``vector_size`` at training time cannot break the
    # reshape below.
    dim = w2v.wv.vector_size
    vec = np.zeros((1, dim))
    for word in text:
        try:
            vec += w2v.wv[word].reshape((1, dim))
        except KeyError:
            # Out-of-vocabulary token (e.g. dropped by min_count): ignore it.
            continue
    return vec

# Turn every tokenized review into its row vector, then stack the rows into a
# single 2-D ndarray (one row per review).
sentence_vectors = [average_vec(tokens) for tokens in x]
x_vec = np.concatenate(sentence_vectors)

# Persist the trained Word2Vec model so it can be reloaded at prediction time.
# Only the model is written here; x_vec itself is not saved.
w2v.save('w2v_model.pkl')

In [None]:
# 三、训练支持向量机情绪分类模型

In [7]:
# Build an RBF-kernel support vector classifier and fit it on the sentence
# vectors; verbose=True makes LibSVM print its optimisation log.
model = SVC(kernel='rbf', verbose=True).fit(x_vec, y)

# Persist the fitted classifier for later prediction.
joblib.dump(model, 'svm_model.pkl')

[LibSVM]......*.*
optimization finished, #iter = 7970
obj = -6461.225329, rho = -0.146742
nSV = 8349, nBSV = 7015
Total nSV = 8349


['svm_model.pkl']

In [8]:
# Print the per-fold cross-validation scores of the classifier
# (default splitter and default estimator scoring).
cv_scores = cross_val_score(model, x_vec, y)
print(cv_scores)

[LibSVM].....*..*
optimization finished, #iter = 7154
obj = -5629.819171, rho = -0.275268
nSV = 7215, nBSV = 6073
Total nSV = 7215
[LibSVM].....*.*
optimization finished, #iter = 6588
obj = -5560.436864, rho = -0.127919
nSV = 6990, nBSV = 6022
Total nSV = 6990
[LibSVM].....*.*
optimization finished, #iter = 6351
obj = -4797.548481, rho = 0.207110
nSV = 6404, nBSV = 5252
Total nSV = 6404
[LibSVM].....*.*.*
optimization finished, #iter = 7108
obj = -5185.404960, rho = -0.119214
nSV = 6820, nBSV = 5675
Total nSV = 6820
[LibSVM].....*.*
optimization finished, #iter = 6316
obj = -4900.886316, rho = -0.341092
nSV = 6425, nBSV = 5331
Total nSV = 6425
[0.91307437 0.89483657 0.80549633 0.84150675 0.79341388]


# 四、情感预测

In [9]:
@lru_cache(maxsize=None)
def _load_w2v(path='w2v_model.pkl'):
    """Load the saved Word2Vec model once per path and reuse it on later calls.

    The result is cached for the lifetime of the kernel; call
    ``_load_w2v.cache_clear()`` after re-saving the model file to force a reload.
    """
    return Word2Vec.load(path)


def average_vec(words):
    """Combine the Word2Vec vectors of ``words`` into one row vector for prediction.

    Uses the model saved in ``w2v_model.pkl``. The model is loaded from disk
    only on the first call instead of on every call.

    NOTE: despite the name, this returns the element-wise *sum* of the word
    vectors, not their mean; this matches how the training features in this
    notebook were built, so it must stay a sum unless the classifier is
    retrained.

    Parameters
    ----------
    words : iterable
        Tokens of the text to embed. Tokens missing from the model vocabulary
        (``KeyError`` on lookup) are skipped.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)`` where ``dim`` is the loaded model's vector
        size; all zeros when no token is in the vocabulary.
    """
    wv = _load_w2v().wv
    # Read the dimensionality from the loaded model rather than hard-coding 300.
    dim = wv.vector_size
    vec = np.zeros((1, dim))
    for word in words:
        try:
            vec += wv[word].reshape((1, dim))
        except KeyError:
            # Out-of-vocabulary token: ignore it.
            continue
    return vec

@lru_cache(maxsize=None)
def _load_svm(path='svm_model.pkl'):
    """Load the saved SVM classifier once per path and reuse it on later calls.

    NOTE(security): joblib.load unpickles the file, which can execute arbitrary
    code. Only load model files produced by this notebook or another trusted source.
    """
    return joblib.load(path)


def svm_predict(string):
    """Classify the sentiment of one piece of review text.

    The text is tokenized with jieba, turned into a sentence vector with
    ``average_vec`` and classified by the SVM saved in ``svm_model.pkl``.
    The input followed by ``[积极]`` (positive) or ``[消极]`` (negative) is
    printed as a side effect.

    Parameters
    ----------
    string : object
        Text to classify; it is coerced to ``str`` before tokenization.

    Returns
    -------
    The first element of the classifier's prediction: the label 1 for
    positive, anything else is reported as negative.
    """
    # Tokenize the input and embed it as a single row vector.
    words = jieba.lcut(str(string))
    words_vec = average_vec(words)

    # The classifier is read from disk on the first call only.
    model = _load_svm()
    result = model.predict(words_vec)

    # Report the verdict immediately, then hand the raw label back to the caller.
    label = '[积极]' if int(result[0]) == 1 else '[消极]'
    print(string, label)
    return result[0]



# Demo: classify one sample review, then print the raw label returned by
# svm_predict (the training labels are 1 for positive and 0 for negative).
string = '还不错，符合需求'
pred_result = svm_predict(string)
print(pred_result)

还不错，符合需求 [积极]
1.0
