In [1]:
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np
import jieba
import joblib
import warnings

warnings.filterwarnings("ignore")  # silence every warning for the rest of the run

# --- 1. Load the corpora ---
# header=None: the sheets have no header row, so the text is addressed as column 0.
neg = pd.read_excel('data/neg.xls', header=None)
pos = pd.read_excel('data/pos.xls', header=None)


def word_cut(x):
    """Coerce one cell to ``str`` and tokenize it with ``jieba.lcut``."""
    return jieba.lcut(str(x))


# Tokenize every row; the token lists are stored in a new 'words' column.
pos['words'] = pos[0].apply(word_cut)
neg['words'] = neg[0].apply(word_cut)

# Positive samples come first, then negative ones; the label array follows the
# same order (1 = positive sentiment, 0 = negative sentiment).
x = np.concatenate((pos['words'], neg['words']))
y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

# --- 2. Word2Vec ---
# Build the vocabulary from the tokenized corpus, then train for 20 epochs.
# Tokens seen fewer than 10 times are dropped (min_count); vectors are 300-d.
w2v = Word2Vec(
    vector_size=300,
    min_count=10,
)
w2v.build_vocab(x)
w2v.train(
    x,
    total_examples=w2v.corpus_count,
    epochs=20,
)

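# Sum the word vectors of each sentence (note: the helper is named average_vec, but it does not divide by the token count)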
def average_vec(text):
    """Accumulate the Word2Vec vectors of the tokens in ``text``.

    Each token is looked up in the module-level ``w2v`` model. Tokens whose
    lookup raises ``KeyError`` are skipped. Despite the function's name, the
    result is the plain sum of the vectors; it is not divided by the number
    of tokens.

    Args:
        text: Iterable of tokens.

    Returns:
        A ``(1, 300)`` ndarray; all zeros when no token could be looked up.
    """
    total = np.zeros((1, 300))
    for token in text:
        try:
            token_vec = w2v.wv[token]
        except KeyError:
            # Token is not in the model's vocabulary: it contributes nothing.
            continue
        total += token_vec.reshape((1, 300))
    return total

# Stack the per-sentence (1, 300) vectors row-wise into a single 2-D ndarray.
x_vec = np.concatenate(list(map(average_vec, x)))

# Persist the trained Word2Vec model so it can be reloaded at prediction time.
w2v.save('w2v_model.pkl')

# --- 3. Train the SVM sentiment classifier ---
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_val, y_train, y_val = train_test_split(
    x_vec,
    y,
    test_size=0.2,
    random_state=42,
)

# RBF-kernel SVC; verbose=True makes the solver report its progress.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = SVC(kernel='rbf', verbose=True).fit(x_train, y_train)

# Serialize the fitted classifier to disk.
joblib.dump(model, 'svm_model.pkl')

# --- 4. Model evaluation ---
# Predict labels for the held-out validation split.
y_pred = model.predict(x_val)

# Headline metrics. average='binary' scores only the positive class
# (label 1, the scikit-learn default pos_label).
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1 = f1_score(y_val, y_pred, average='binary')

# Print the evaluation summary.
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class precision/recall/F1 and support. The notebook's saved output shows
# this report, and classification_report is imported for it, but the call was
# missing from the cell.
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# 五、情感预测
# 读取 Word2Vec 并对新输入进行词向量计算
# Loaded Word2Vec models keyed by file path, so repeated predictions do not
# re-read the model from disk on every call.
_W2V_CACHE = {}


def _load_w2v(path):
    """Return the Word2Vec model stored at ``path``, loading it at most once."""
    if path not in _W2V_CACHE:
        _W2V_CACHE[path] = Word2Vec.load(path)
    return _W2V_CACHE[path]


def average_vec(words, model_path='w2v_model.pkl'):
    """Accumulate the Word2Vec vectors of ``words`` using the saved model.

    The model is read from ``model_path`` on first use and cached afterwards.
    Tokens whose lookup raises ``KeyError`` are skipped. Despite the
    function's name, the result is the plain sum of the vectors; it is not
    divided by the number of tokens.

    Args:
        words: Iterable of tokens.
        model_path: Path of the saved Word2Vec model. Defaults to the file
            written by the training step.

    Returns:
        A ``(1, dim)`` ndarray, where ``dim`` is the model's vector size;
        all zeros when no token could be looked up.
    """
    w2v = _load_w2v(model_path)
    # Take the width from the model instead of hard-coding 300.
    dim = w2v.wv.vector_size
    vec = np.zeros((1, dim))
    for word in words:
        try:
            vec += w2v.wv[word].reshape((1, dim))
        except KeyError:
            # Token is not in the model's vocabulary: it contributes nothing.
            continue
    return vec

# 对电影评论进行情感判断
# Loaded SVM classifiers keyed by file path, so repeated predictions do not
# re-read the model from disk on every call.
_SVM_CACHE = {}


def svm_predict(string, model_path='svm_model.pkl'):
    """Classify the sentiment of ``string`` and print the verdict.

    The input is coerced to ``str``, tokenized with ``jieba.lcut``, turned
    into a vector by ``average_vec`` and passed to the saved SVM classifier.
    The classifier is read from ``model_path`` on first use and cached
    afterwards.

    Args:
        string: Text to classify.
        model_path: Path of the saved classifier. Defaults to the file
            written by the training step.

    Returns:
        The first element of the classifier's prediction: 1 is reported as
        positive, anything else as negative.
    """
    words = jieba.lcut(str(string))
    words_vec = average_vec(words)
    if model_path not in _SVM_CACHE:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point model_path at files you trust.
        _SVM_CACHE[model_path] = joblib.load(model_path)
    result = _SVM_CACHE[model_path].predict(words_vec)
    label = result[0]
    # One print/return path instead of two near-identical branches.
    tag = "积极" if int(label) == 1 else "消极"
    print(f"{string} [{tag}]")
    return label

# Smoke-test the prediction helper on one sample sentence and print the returned label.
string = '还不错，符合需求'
pred_result = svm_predict(string)
print(f"预测结果: {pred_result}")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_w/hfqvgqcd4rj79tq5flh6k91h0000gn/T/jieba.cache
Loading model cost 0.258 seconds.
Prefix dict has been built successfully.


[LibSVM].....*.*
optimization finished, #iter = 6572
obj = -5299.889303, rho = -0.181020
nSV = 6857, nBSV = 5782
Total nSV = 6857
Model Evaluation Metrics:
Accuracy: 0.8882
Precision: 0.9084
Recall: 0.8678
F1 Score: 0.8877

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      2073
         1.0       0.91      0.87      0.89      2149

    accuracy                           0.89      4222
   macro avg       0.89      0.89      0.89      4222
weighted avg       0.89      0.89      0.89      4222

还不错，符合需求 [积极]
预测结果: 1.0
