In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import jieba
import codecs

In [2]:
def cut(string, stop_words=None):
    """
    Segment a sentence into words with jieba, optionally dropping stop words.

    :param string: sentence to segment; surrounding whitespace is stripped first
    :param stop_words: optional collection of words to drop; when it is None or
        empty, every token is kept
    :return: list of tokens in their original order
    """
    words = list(jieba.cut(string.strip()))
    if not stop_words:
        # Nothing to filter out: return all tokens rather than an empty list.
        return words
    return [word for word in words if word not in stop_words]

In [3]:
def get_stop_word_list(filename="../data/hlt_stop_words.txt"):
    """
    Read the stop-word list from a UTF-8 text file.

    :param filename: path of the stop-word file
    :return: <List> one whitespace-stripped string per line, in file order
    """
    # Use the canonical codec name "utf-8"; the earlier spelling "utf=8" is not a
    # documented alias and presumably only resolved through lenient name lookup.
    with codecs.open(filename, "r", "utf-8") as stop_word_file:
        return [line.strip() for line in stop_word_file]

In [4]:
def get_data(filename="../data/train/train.csv"):
    """
    Load a training or test dataset from a CSV file.

    :param filename: path of the CSV file to read
    :return: <pandas.DataFrame> the table parsed by pandas.read_csv
    """
    return pd.read_csv(filename)

In [44]:
# Load the training CSV and the public test CSV into DataFrames.
train_data = get_data("../data/train/train.csv")
test_data = get_data("../data/test_public/test_public.csv")

In [45]:
def _segment_column(frame, stop_words, column="content"):
    """
    Segment every sentence in ``frame[column]`` and overwrite the column with
    the space-joined tokens.

    The column is replaced in place, so this should be run once per freshly
    loaded frame; running it again would re-segment already segmented text.

    :param frame: DataFrame holding the raw sentences
    :param stop_words: collection of words to drop during segmentation
    :param column: name of the text column to segment
    :return: list with one token list per row, in row order
    """
    # Membership tests against a set are constant-time; the filtering result is
    # the same as with the original list.
    stop_lookup = set(stop_words)
    last_position = len(frame) - 1
    tokenized = []
    for position, sentence in enumerate(frame[column]):
        tokenized.append(cut(sentence, stop_words=stop_lookup))
        # "\r" rewinds to the start of the line so the counter updates in place.
        print("\rProcess: {:5d}/{:5d}".format(position, last_position), end="")
    # Single column assignment instead of one .loc write per row.
    frame[column] = [" ".join(words) for words in tokenized]
    return tokenized


stop_words = get_stop_word_list()
cut_words_train = _segment_column(train_data, stop_words)

print('\n')

cut_words_test = _segment_column(test_data, stop_words)

Process:  9946/ 9946

Process:  2363/ 2363

In [46]:
# Subject names; passed as target_names to the classification report.
categories = ['价格', '内饰', '动力', '外观', '安全性', '操控', '油耗', '空间', '舒适性', '配置']

# Hold out 20% of the training rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    train_data['content'].values,
    train_data['subject'].values,
    test_size=0.2,
    random_state=2018,
)

# Flatten the label arrays to one dimension.
y = train_data['subject'].values.ravel()
y_train, y_test = y_train.ravel(), y_test.ravel()

print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test)))

Train/Test split: 7957/1990


In [52]:
""" Support Vector Machine (SVM) classifier"""
# Pipeline: raw word counts -> TF-IDF weighting -> linear model trained by SGD
# with hinge loss and an L2 penalty.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so single-character words produced by the segmenter
# are dropped — confirm that this is intended.
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
])
svm_clf.fit(x_train, y_train)

# Evaluate the fitted SVM pipeline on the held-out split.
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))

print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

SVM correct prediction: 0.6392
             precision    recall  f1-score   support

         价格       0.87      0.78      0.82       241
         内饰       0.59      0.43      0.50       102
         动力       0.51      0.87      0.64       532
         外观       0.70      0.34      0.46        91
        安全性       0.73      0.44      0.55       126
         操控       0.60      0.46      0.52       228
         油耗       0.86      0.68      0.76       225
         空间       0.54      0.43      0.47        87
        舒适性       0.78      0.43      0.55       184
         配置       0.85      0.64      0.73       174

avg / total       0.68      0.64      0.63      1990

Confusion Matrix:
[[188   1  37   1   2   6   0   1   0   5]
 [  2  44  35   4   3   6   0   2   4   2]
 [ 12   4 465   0   4  14  13   6   9   5]
 [  0   5  50  31   0   2   0   3   0   0]
 [  0   0  53   0  56  14   1   0   1   1]
 [  4   6  78   3  10 106   4  10   3   4]
 [  3   2  53   0   0   6 154   4   3   0]
 [  2   2  

In [50]:
# Bare expression: shows the predictions for the held-out split as the cell output.
predicted

array(['配置', '内饰', '动力', ..., '操控', '配置', '配置'], dtype=object)

In [41]:
""" 10-fold cross vaildation """
# Fresh, unfitted copy of the count -> TF-IDF -> hinge-loss SGD stack, scored on
# the subject labels over 10 cross-validation folds; the mean fold score is printed.
clf_s = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42),
)
svm_10_fold = cross_val_score(
    clf_s,
    train_data['content'].values,
    train_data['subject'].values,
    cv=10,
)
print('SVM 10-fold correct prediction: {:4.4f}'.format(svm_10_fold.mean()))

SVM 10-fold correct prediction: 0.8826


In [42]:
# Predict a label for every test sentence with whichever pipeline `svm_clf` is
# currently bound to.
# NOTE(review): `svm_clf` is rebound to the sentiment model further down, and the
# recorded output of this variable is an int64 array rather than subject names,
# which suggests this cell was last run after that rebinding — re-run the
# notebook top to bottom and confirm the values are subjects.
test_subject_pred = svm_clf.predict(test_data['content'].values)

In [43]:
# Bare expression: shows the test-set predictions stored in test_subject_pred.
test_subject_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
# Report names for the sentiment task.
# NOTE: this rebinds categories, x_train, x_test, y_train and y_test, which the
# subject cells above also use.
categories = ['-1', '0', '1']
# Seed the split (same value as the subject split) so the hold-out metrics are
# reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    train_data["content"].values,
    train_data["sentiment_value"].values,
    test_size=0.2,
    random_state=2018,
)

In [33]:
""" Support Vector Machine (SVM) classifier"""
# Pipeline: raw word counts -> TF-IDF weighting -> linear model trained by SGD
# with hinge loss and an L2 penalty, fitted here on the sentiment labels.
# NOTE: this rebinds `svm_clf`; any earlier model stored under that name is lost.
# NOTE(review): the recorded confusion matrix for this cell has every prediction
# in the middle class, i.e. the model never predicts the other two labels —
# consider class weighting or re-tuning before relying on these predictions.
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
])
svm_clf.fit(x_train, y_train)

# Evaluate the fitted SVM pipeline on the held-out split.
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))

print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

SVM correct prediction: 0.6668
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00       323
          0       0.67      1.00      0.80      1327
          1       0.00      0.00      0.00       340

avg / total       0.44      0.67      0.53      1990

Confusion Matrix:
[[   0  323    0]
 [   0 1327    0]
 [   0  340    0]]




  'precision', 'predicted', average, warn_for)


In [34]:
""" 10-fold cross vaildation """
# Fresh, unfitted copy of the count -> TF-IDF -> hinge-loss SGD stack, scored on
# the sentiment labels over 10 cross-validation folds; the mean fold score is printed.
clf_s = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42),
)
svm_10_fold = cross_val_score(
    clf_s,
    train_data['content'].values,
    train_data['sentiment_value'].values,
    cv=10,
)
print('SVM 10-fold correct prediction: {:4.4f}'.format(svm_10_fold.mean()))

SVM 10-fold correct prediction: 0.6697


In [35]:
# Predict a label for every test sentence with the pipeline currently bound to
# `svm_clf` (the sentiment model when the notebook is run top to bottom).
test_sentiment_value_pred = svm_clf.predict(test_data['content'].values)

In [36]:
# Bare expression: shows the test-set predictions stored in test_sentiment_value_pred.
test_sentiment_value_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
# Write one CSV row per test sample: content id, predicted subject, predicted
# sentiment value, and an empty sentiment_word field (hence the trailing comma).
with codecs.open("../output/output.csv", "w", "utf-8") as outfile:
    outfile.write("content_id,subject,sentiment_value,sentiment_word\n")
    # Use the row count for the progress total; the last index label only equals
    # it when the frame still has its default 0..n-1 index.
    n_test_rows = len(test_data)
    rows = zip(test_data["content_id"], test_subject_pred, test_sentiment_value_pred)
    for cnt, (content_id, subject, sentiment_value) in enumerate(rows, start=1):
        outfile.write("{},{},{},\n".format(content_id, subject, sentiment_value))
        print("\rProcess: {}/{}".format(cnt, n_test_rows), end="")

Process: 1/2900Process: 2/2900Process: 3/2900Process: 4/2900Process: 5/2900Process: 6/2900Process: 7/2900Process: 8/2900Process: 9/2900Process: 10/2900Process: 11/2900Process: 12/2900Process: 13/2900Process: 14/2900Process: 15/2900Process: 16/2900Process: 17/2900Process: 18/2900Process: 19/2900Process: 20/2900Process: 21/2900Process: 22/2900Process: 23/2900Process: 24/2900Process: 25/2900Process: 26/2900Process: 27/2900Process: 28/2900Process: 29/2900Process: 30/2900Process: 31/2900Process: 32/2900Process: 33/2900Process: 34/2900Process: 35/2900Process: 36/2900Process: 37/2900Process: 38/2900Process: 39/2900Process: 40/2900Process: 41/2900Process: 42/2900Process: 43/2900Process: 44/2900Process: 45/2900Process: 46/2900Process: 47/2900Process: 48/2900Process: 49/2900Process: 50/2900Process: 51/2900Process: 52/2900Process: 53/2900Process: 54/2900Process: 55/2900Process: 56/2900Process: 57/2900Process: 58/2900Process: 59/2900Proce

Process: 1158/2900Process: 1159/2900Process: 1160/2900Process: 1161/2900Process: 1162/2900Process: 1163/2900Process: 1164/2900Process: 1165/2900Process: 1166/2900Process: 1167/2900Process: 1168/2900Process: 1169/2900Process: 1170/2900Process: 1171/2900Process: 1172/2900Process: 1173/2900Process: 1174/2900Process: 1175/2900Process: 1176/2900Process: 1177/2900Process: 1178/2900Process: 1179/2900Process: 1180/2900Process: 1181/2900Process: 1182/2900Process: 1183/2900Process: 1184/2900Process: 1185/2900Process: 1186/2900Process: 1187/2900Process: 1188/2900Process: 1189/2900Process: 1190/2900Process: 1191/2900Process: 1192/2900Process: 1193/2900Process: 1194/2900Process: 1195/2900Process: 1196/2900Process: 1197/2900Process: 1198/2900Process: 1199/2900Process: 1200/2900Process: 1201/2900Process: 1202/2900Process: 1203/2900Process: 1204/2900Process: 1205/2900Process: 1206/2900Process: 1207/2900Process: 1208/2900Process: 1209/2900Process: 12