# CCF2018汽车行业用户观点主题及情感识别

## 初始化

### 模块导入

In [52]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
import codecs
import pandas as pd
import pyltp as ltp
import os
import sys

### 全局变量声明

In [21]:
LTP_DATA_DIR = r"D:\NLP\ltp_data"
SEGMENTOR = ltp.Segmentor()
SEG_DIR = os.path.join(LTP_DATA_DIR, "cws.model")
SEGMENTOR.load(SEG_DIR)

PARSER = ltp.Parser()
PARSER_DIR = os.path.join(LTP_DATA_DIR, "parser.model")
PARSER.load(PARSER_DIR)

POSTAGGER = ltp.Postagger()
POSTAGGER_DIR = os.path.join(LTP_DATA_DIR, "pos.model")
POSTAGGER.load(POSTAGGER_DIR)

### 函数定义

In [22]:
def get_stop_word_list(filename="../data/hlt_stop_words.txt"):
    """
    返回停词表
    :param filename: 停词表位置
    :return: <List> 停词表
    """
    stop_words = []
    with codecs.open(filename, "r", "utf=8") as stop_word_file:
        for line in stop_word_file:
            stop_words.append(line.strip())
    return stop_words

In [23]:
def get_data(filename="../data/train/train.csv"):
    """
    获取训练或测试数据集
    :param filename: 数据位置
    :return: <pandas.DataFrame> 返回一个pandas表
    """
    data = pd.read_csv(filename)
    return data

In [24]:
def cut(string, stop_words=None):
    """
    分词
    :param string: 待分词的句子
    :return: 分词所得的列表
    """
    words = list(SEGMENTOR.segment(string))
    words_return = []
    if stop_words:
        for word in words:
            if word not in stop_words:
                words_return.append(word)
    return words_return

In [25]:
def postag(words):
    postags = postagger.postag(words)
    return postags

In [26]:
def _parse(words, postags):
    arcs = PARSER.parse(words, postags)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    return arcs

def parse(words):
    arcs = PARSER.parse(words, postag(words))
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    return arcs

In [27]:
def resource_release():
    """
    由于使用到了pyltp的切词功能，为了节约资源，将SEGMENTOR全局化，故需在最后释放资源
    :return: None
    """
    SEGMENTOR.release()
    PARSER.release()
    POSTAGGER.release()

### 数据导入

In [28]:
train_data = get_data("../data/train/train.csv")
test_data = get_data("../data/test_public/test_public.csv")

In [29]:
train_data.head()

Unnamed: 0,content_id,content,subject,sentiment_value,sentiment_word
0,vUXizsqexyZVRdFH,因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。,价格,0,影响
1,4QroPd9hNfnCHVt7,四驱价格貌似挺高的，高的可以看齐XC60了，看实车前脸有点违和感。不过大众的车应该不会差。,价格,-1,高
2,QmqJ2AvM5GplaRyz,斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...,价格,1,低
3,KMT1gFJiU4NWrVDn,这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了,价格,-1,有钱任性
4,nVIlGd5yMmc37t1o,17价格忒高，估计也就是14-15左右。,价格,-1,高


In [30]:
test_data.head()

Unnamed: 0,content_id,content
0,XuPwKCnA2fqNh5vm,欧蓝德，价格便宜，森林人太贵啦！
1,2jNbDn85goX3IuPE,楼主什么时候提的车，南昌优惠多少啊
2,hLgEADQ8sUnvGFK9,吉林，2.5优惠20000，送三年九次保养，贴膜
3,nZmM7LQsfr03wUaz,便宜2万的豪华特装，实用配制提升，优惠还给力，确实划算。
4,pwd8MnrthDqLZafe,如果实在想买就等车展期间，优惠2万，我24.98万入的2.5豪


## 数据清洗

从获取到的数据集中我们可以观察出如下的一些特征：
- 一个content_id有可能对应同一条语句，但是主题不同

|content_id|content|subject|sentiment_value|sentiment_word|
|:---------|:------|:------|:-------------:|:------------:|
|03SpF6jYbtHuQZKA|对了只要你能找到合适的修理厂，我这有全套的底盘件，都是森友，友情价，不开网店的哦，实体七千多款摆臂和拉杆类优质产品。|操控|0||
|03SpF6jYbtHuQZKA|对了只要你能找到合适的修理厂，我这有全套的底盘件，都是森友，友情价，不开网店的哦，实体七千多款摆臂和拉杆类优质产品。|价格|0||

- 一个content_id有可能对应同一条语句，但是情感不同

|content_id|content|subject|sentiment_value|sentiment_word|
|:---------|:------|:------|:-------------:|:------------:|
|0AE8JgciVzHkI7Do|CrV的确是比不上森，但发动机稳定性确实高，底盘感受太差了，正常跑十来万公里后轮就吃胎还得换改进型拉杆，底盘太软了|操控|-1|太差|
|0AE8JgciVzHkI7Do|CrV的确是比不上森，但发动机稳定性确实高，底盘感受太差了，正常跑十来万公里后轮就吃胎还得换改进型拉杆，底盘太软了|动力|1|高|

In [31]:
stop_words = get_stop_word_list()
cut_words = []
for ind in train_data.index:
    sentence = train_data.loc[ind, "content"]
    words = cut(sentence, stop_words=stop_words)
    cut_words.append(words)
    train_data.loc[ind, "content"] = " ".join(words)
    print("\rProcess: {:5d}/{:5d}".format(ind, train_data.shape[0]-1), end="")

Process:  9946/ 9946

In [32]:
train_data.head()

Unnamed: 0,content_id,content,subject,sentiment_value,sentiment_word
0,vUXizsqexyZVRdFH,森林人 即将 换代 套 系统 没 必要 装 款 即将 换代 车型 上 肯定 会 影响 价格,价格,0,影响
1,4QroPd9hNfnCHVt7,四驱 价格 貌似 挺 高 高 看齐 XC60 看 实 车 前 脸 点 违 感 大众 车 应该...,价格,-1,高
2,QmqJ2AvM5GplaRyz,斯柯达 说 质量 似乎 大众 好 一点 价格 低 用料 完全 听说 野帝 没 听说 说 车,价格,1,低
3,KMT1gFJiU4NWrVDn,玩意 都 有钱 任性 不 懂 车 土豪 价格 换 次 妹夫 EP020 换 三 锅,价格,-1,有钱任性
4,nVIlGd5yMmc37t1o,17 价格 忒 高 估计 14-15 左右,价格,-1,高


In [33]:
for ind in test_data.index:
    sentence = test_data.loc[ind, "content"]
    test_data.loc[ind, "content"] = " ".join(cut(sentence, stop_words=stop_words))
    print("\rProcess: {:5d}/{:5d}".format(ind, test_data.shape[0]-1), end="")

Process:  2363/ 2363

In [34]:
test_data.head()

Unnamed: 0,content_id,content
0,XuPwKCnA2fqNh5vm,欧蓝德 价格 便宜 森林 人 太 贵
1,2jNbDn85goX3IuPE,楼主 提 车 南昌 优惠
2,hLgEADQ8sUnvGFK9,吉林 2.5 优惠 20000 送 三 年 九 次 保养 贴膜
3,nZmM7LQsfr03wUaz,便宜 2万 豪华 特装 实用 配制 提升 优惠 还 给力 确实 划算
4,pwd8MnrthDqLZafe,实在 想 买 车展 期间 优惠 2万 24.98 万入 2.5豪


## 分类器

本次分类器采用sklearn库中的SVC模型(SVM算法)

In [53]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

label_encoder = LabelEncoder()
label_encoder.fit(train_data["subject"])
train_labels = label_encoder.transform(train_data["subject"])

count_vect_subject = CountVectorizer()
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(count_vect_subject.fit_transform(train_data["content"].values))

X_train, X_test, y_train, y_test = train_test_split(tfidf.toarray(), train_labels, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(train_data["content"].values, train_labels, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(cut_words, train_labels, test_size=0.2)

test_data_content_tfidf = tfidf_transformer.transform(count_vect_subject.transform(test_data["content"].values))

# vect_subject = TfidfVectorizer()
# train_data_features_subject = vect_subject.fit_transform(X_train)
# test_data_features_subject = vect_subject.transform(X_test)
# test_data_subject = vect_subject.transform(test_data["content"].values)

### 主题分类（subject）

In [54]:
clf_subject = MultinomialNB()
# clf_subject.fit(train_data_features_subject, y_train)
clf_subject.fit(X_train, y_train)

# precision_recall_fscore_support(clf_subject.predict(test_data_features_subject), y_test)
# print(precision_recall_fscore_support(clf_subject.predict(X_test), y_test))
y_NB_subject_pred = clf_subject.predict(X_test)
print(f1_score(y_NB_subject_pred, y_test, average='macro'))
print(np.mean(y_NB_subject_pred == y_test))

0.25161020083
0.405025125628


In [55]:
# test_subject_result = label_encoder.inverse_transform(clf_subject.predict(test_data_subject))
test_subject_result = label_encoder.inverse_transform(y_NB_subject_pred)

In [56]:
set(test_subject_result)

{'价格', '内饰', '动力', '外观', '安全性', '操控', '油耗', '空间', '舒适性', '配置'}

In [57]:
label_encoder.transform(['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])

array([0, 9, 5, 8, 6, 2, 1, 4, 7, 3], dtype=int64)

In [58]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [62]:
from sklearn.externals import joblib
clf_subject_svm = joblib.load('../output/model/svc_model.joblib')
# clf_subject_svm = SVC(
#     kernel='rbf',
#     C=3,
#     gamma=0.1,
#     class_weight={0: 0.12797828491002311, 9: 0.085754498843872526, 5: 0.10415200562983815, 8: 0.093596059113300489, 6: 0.1087765155323213, 2: 0.27465567507791294, 1: 0.053885593646325523, 4: 0.057605308133105458, 7: 0.044435508193425156, 3: 0.04916055091987534}
# )
# clf_subject_svm.fit(X_train, y_train)
# clf_subject_svm = joblib.dump(clf_subject_svm, '../output/model/svc_model.joblib')

y_SVM_subject_pred = clf_subject_svm.predict(X_test)
print(f1_score(y_SVM_subject_pred, y_test, average='macro'))
print(np.mean(y_SVM_subject_pred == y_test))

0.205525168936
0.404020100503


  'recall', 'true', average, warn_for)


In [63]:
# test_subject_result = label_encoder.inverse_transform(clf_subject_svm.predict(test_data_subject))
test_subject_result = label_encoder.inverse_transform(y_SVM_subject_pred)

### Keras Text-CNN TEST

### 情感分类（sentiment_value）

In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_data["content"].values, 
                                                    train_data["sentiment_value"].values, 
                                                    test_size=0.2)

vect_sentiment = TfidfVectorizer()
train_data_features_sentiment = vect_sentiment.fit_transform(X_train)
test_data_features_sentiment = vect_sentiment.transform(X_test)
test_data_sentiment = vect_sentiment.transform(test_data["content"].values)

clf_sentiment = MultinomialNB()
clf_sentiment.fit(train_data_features_sentiment, y_train)

precision_recall_fscore_support(clf_sentiment.predict(test_data_features_sentiment), y_test)

import numpy as np
np.mean(clf_sentiment.predict(test_data_features_sentiment) == y_test)

0.68291457286432156

In [65]:
test_sentiment_result = clf_sentiment.predict(test_data_sentiment)

In [66]:
set(test_sentiment_result)

{-1, 0, 1}

## 输出结果

In [67]:
with codecs.open("../data/output.csv", "w", "utf-8") as outfile:
    outfile.write("content_id,subject,sentiment_value,sentiment_word\n")
    cnt = 1
    for content_id, subject, sentiment_value in zip(test_data["content_id"], test_subject_result, test_sentiment_result):
        outfile.write("{},{},{},\n".format(content_id, subject, sentiment_value))
        print("\rProcess: {}/{}".format(cnt, test_data.index[-1]+1), end="")
        cnt += 1

Process: 1/2364Process: 2/2364Process: 3/2364Process: 4/2364Process: 5/2364Process: 6/2364Process: 7/2364Process: 8/2364Process: 9/2364Process: 10/2364Process: 11/2364Process: 12/2364Process: 13/2364Process: 14/2364Process: 15/2364Process: 16/2364Process: 17/2364Process: 18/2364Process: 19/2364Process: 20/2364Process: 21/2364Process: 22/2364Process: 23/2364Process: 24/2364Process: 25/2364Process: 26/2364Process: 27/2364Process: 28/2364Process: 29/2364Process: 30/2364Process: 31/2364Process: 32/2364Process: 33/2364Process: 34/2364Process: 35/2364Process: 36/2364Process: 37/2364Process: 38/2364Process: 39/2364Process: 40/2364Process: 41/2364Process: 42/2364Process: 43/2364Process: 44/2364Process: 45/2364Process: 46/2364Process: 47/2364Process: 48/2364Process: 49/2364Process: 50/2364Process: 51/2364Process: 52/2364Process: 53/2364Process: 54/2364Process: 55/2364Process: 56/2364Process: 57/2364Process: 58/2364Process: 59/2364Proce

Process: 1440/2364Process: 1441/2364Process: 1442/2364Process: 1443/2364Process: 1444/2364Process: 1445/2364Process: 1446/2364Process: 1447/2364Process: 1448/2364Process: 1449/2364Process: 1450/2364Process: 1451/2364Process: 1452/2364Process: 1453/2364Process: 1454/2364Process: 1455/2364Process: 1456/2364Process: 1457/2364Process: 1458/2364Process: 1459/2364Process: 1460/2364Process: 1461/2364Process: 1462/2364Process: 1463/2364Process: 1464/2364Process: 1465/2364Process: 1466/2364Process: 1467/2364Process: 1468/2364Process: 1469/2364Process: 1470/2364Process: 1471/2364Process: 1472/2364Process: 1473/2364Process: 1474/2364Process: 1475/2364Process: 1476/2364Process: 1477/2364Process: 1478/2364Process: 1479/2364Process: 1480/2364Process: 1481/2364Process: 1482/2364Process: 1483/2364Process: 1484/2364Process: 1485/2364Process: 1486/2364Process: 1487/2364Process: 1488/2364Process: 1489/2364Process: 1490/2364Process: 1491/2364Process: 14

In [None]:
cut_words[:50]

In [None]:
train_data.head()