In [1]:
from utils import tokenize, load_corpus
import numpy as np

#### 加载数据

In [2]:
import pandas as pd
# Weibo 2018 corpus. Each record is unpacked as a (content, sentiment) pair,
# matching the DataFrame column names used here.

# Training split: load the records, then wrap them in a DataFrame for inspection.
train_data = load_corpus("weibo2018/train.txt")
train_df = pd.DataFrame(data=train_data, columns=["content", "sentiment"])

# Test split: same treatment.
test_data = load_corpus("weibo2018/test.txt")
test_df = pd.DataFrame(data=test_data, columns=["content", "sentiment"])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.480 seconds.
Prefix dict has been built succesfully.


In [3]:
# Preview the first rows of the training frame (columns: content, sentiment).
train_df.head()

Unnamed: 0,content,sentiment
0,"[书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...",1
1,"[这是, 英超, 被, 黑, 的, 最惨, 的, 一次, [二哈], [二哈], 十几年来,...",0
2,"[中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...",1
3,"[看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...",1
4,"[汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...",1


查看训练集正负情感的比例

In [4]:
# Count training samples per sentiment label to check the class balance.
train_df["sentiment"].value_counts()

1    5496
0    4504
Name: sentiment, dtype: int64

加载停用词

In [5]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open("stopwords.txt", "r", encoding="utf8") as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build one space-joined string per document (train followed by test) so the
# vectorizer can re-tokenize the already segmented text with token_pattern.
data_str = [" ".join(content) for content, _label in train_data]
data_str += [" ".join(content) for content, _label in test_data]

# Raw string: "\[" and "\w" are not valid Python string escapes, so the plain
# literal triggers an invalid-escape warning. The regex itself is unchanged:
# a run of word characters optionally wrapped in [ ] (e.g. "[二哈]").
# NOTE(review): the vectorizer is fitted on train + test text, so its IDF
# statistics include the evaluation documents — confirm this is intended.
tfidf = TfidfVectorizer(token_pattern=r'\[?\w+\]?', stop_words=stopwords)
tfidf_fit = tfidf.fit_transform(data_str)

加载之前训练好的FastText模型

In [7]:
from gensim.models import FastText
# Load the previously trained FastText model from disk.
# NOTE(review): the file name suggests 100-dimensional vectors — confirm
# against the training step that produced it.
model = FastText.load("model/model_100.txt")

每个句子最多只保留 TF-IDF 值最高的前 `key_words` 个词

In [8]:
# Upper bound on how many of a document's highest-TF-IDF vocabulary terms
# are allowed to contribute to its sentence vector.
key_words = 30

#### 用每个词的 TF-IDF 值作为权重, 对 FastText 词向量加权求平均, 得到表征每个句子的向量

In [9]:
# Build one feature vector per training document: the mean of the FastText
# vectors of its words, each scaled by that word's TF-IDF weight in the
# document. Only words whose vocabulary index is among the document's
# `key_words` highest-weighted indices contribute.
# Documents in which no word qualifies are skipped, so X_train / y_train can
# be shorter than train_data.
X_train, y_train = [], []
vocab = tfidf.vocabulary_
# Use the model's KeyedVectors (`model.wv`) for membership tests and lookups;
# doing these directly on the FastText model object is deprecated in gensim.
word_vectors = model.wv
for content, sentiment in train_data:
    # Dense (1, n_vocabulary) row of TF-IDF weights for this document.
    X_tfidf = tfidf.transform([" ".join(content)]).toarray()
    # Top-weighted vocabulary indices, held in a set for O(1) membership checks.
    keywords_index = set(np.argsort(-X_tfidf)[0, :key_words].tolist())
    weighted = []
    for w in content:
        idx = vocab.get(w)
        if idx is not None and idx in keywords_index and w in word_vectors:
            # Every occurrence of a word contributes, as in a plain token average.
            weighted.append(word_vectors[w] * X_tfidf[0, idx])
    if weighted:
        X_train.append(np.mean(np.stack(weighted), axis=0))
        y_train.append(sentiment)

  import sys
  


In [10]:
# Build one feature vector per test document: the mean of the FastText
# vectors of its words, each scaled by that word's TF-IDF weight in the
# document. Only words whose vocabulary index is among the document's
# `key_words` highest-weighted indices contribute.
# Documents in which no word qualifies are skipped, so X_test / y_test (and
# therefore the evaluation set) can be smaller than test_data.
X_test, y_test = [], []
vocab = tfidf.vocabulary_
# Use the model's KeyedVectors (`model.wv`) for membership tests and lookups;
# doing these directly on the FastText model object is deprecated in gensim.
word_vectors = model.wv
for content, sentiment in test_data:
    # Dense (1, n_vocabulary) row of TF-IDF weights for this document.
    X_tfidf = tfidf.transform([" ".join(content)]).toarray()
    # Top-weighted vocabulary indices, held in a set for O(1) membership checks.
    keywords_index = set(np.argsort(-X_tfidf)[0, :key_words].tolist())
    weighted = []
    for w in content:
        idx = vocab.get(w)
        if idx is not None and idx in keywords_index and w in word_vectors:
            # Every occurrence of a word contributes, as in a plain token average.
            weighted.append(word_vectors[w] * X_tfidf[0, idx])
    if weighted:
        X_test.append(np.mean(np.stack(weighted), axis=0))
        y_test.append(sentiment)

  import sys
  


### SVM

In [11]:
from sklearn import svm
# RBF-kernel SVM; class 1 is weighted slightly lower (0.95) than class 0 (1.0).
# gamma is pinned to 'auto' — the value this notebook was run with (see the
# recorded estimator repr) — because newer scikit-learn releases default to
# 'scale', which would silently change the fitted model.
clf = svm.SVC(C=1, gamma='auto', class_weight={1: .95, 0: 1.})
clf.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight={1: 0.95, 0: 1.0}, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict sentiment labels for the test feature vectors with the fitted SVM.
result = clf.predict(X_test)

In [13]:
from sklearn import metrics
# Per-class precision / recall / F1 on the test set, followed by overall accuracy.
report_text = metrics.classification_report(y_test, result)
print(report_text)

accuracy = metrics.accuracy_score(y_test, result)
print("准确率:", accuracy)

             precision    recall  f1-score   support

          0       0.68      0.79      0.73       155
          1       0.90      0.83      0.86       344

avg / total       0.83      0.82      0.82       499

准确率: 0.8196392785571143
