In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

from wordcloud import WordCloud,STOPWORDS
import jieba
import matplotlib.pyplot as plt

- 从CSV文件中导入数据

In [2]:
data = pd.read_csv('dianping.csv')
data.head()

Unnamed: 0,comment,star
0,口味：不知道是我口高了，还是这家真不怎么样。??我感觉口味确实很一般很一般。上菜相当快，我敢...,2
1,菜品丰富质量好，服务也不错！很喜欢！,4
2,说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！,2
3,菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...,5
4,先说我算是娜娜家风荷园开业就一直在这里吃??每次出去回来总想吃一回??有时觉得外面的西式简餐...,4


* 数据预处理：通过star属性，准备sentiment属性

In [3]:
def make_label(star):
    if star > 3:
        return 1
    else:
        return 0
    
data['sentiment'] = data.star.apply(make_label)
data = data[['comment' , 'sentiment']]
data.head()

Unnamed: 0,comment,sentiment
0,口味：不知道是我口高了，还是这家真不怎么样。??我感觉口味确实很一般很一般。上菜相当快，我敢...,0
1,菜品丰富质量好，服务也不错！很喜欢！,1
2,说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！,0
3,菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...,1
4,先说我算是娜娜家风荷园开业就一直在这里吃??每次出去回来总想吃一回??有时觉得外面的西式简餐...,1


* 简单使用snownlp包进行情感分析
  - 优点：简单方便，即调即用
  - 缺点：不结合具体数据集，准确率不高

In [4]:
from snownlp import SnowNLP

text1 = '这个东西难道真的不错吗'
text2 = '这个东西很垃圾'

s1 = SnowNLP(text1)
s2 = SnowNLP(text2)

print(s1.sentiments,s2.sentiments)

#def snow_result(comemnt):
#    s = SnowNLP(comemnt)
#    if s.sentiments >= 0.6:
#        return 1
#    else:
#        return 0
#    
#data['snlp_result'] = data.comment.apply(snow_result)
#data.head()

0.7253682031772586 0.21406279508712744


* 简单版分词

In [5]:
def simple_word_cut (texts):
    return ','.join(jieba.cut(texts, cut_all=False))

data['simple_cut_comment'] = data.comment.apply(simple_word_cut)
data.head()

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/8d/62lh93xj5lv9v1r3f8qg6m6r0000gn/T/jieba.cache
Loading model cost 1.045 seconds.
Prefix dict has been built successfully.


Unnamed: 0,comment,sentiment,simple_cut_comment
0,口味：不知道是我口高了，还是这家真不怎么样。??我感觉口味确实很一般很一般。上菜相当快，我敢...,0,"口味,：,不,知道,是,我口,高,了,，,还是,这家,真,不怎么样,。,?,?,我,感觉,口..."
1,菜品丰富质量好，服务也不错！很喜欢！,1,"菜品,丰富,质量,好,，,服务,也,不错,！,很,喜欢,！"
2,说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！,0,"说真的,，,不,晓得,有人,排队,的,理由,，,香精,香精,香精,香精,，,拜拜,！"
3,菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...,1,"菜量,实惠,，,上菜,还,算,比较,快,，,疙瘩汤,喝出,了,秋日,的,暖意,，,烧茄子,吃..."
4,先说我算是娜娜家风荷园开业就一直在这里吃??每次出去回来总想吃一回??有时觉得外面的西式简餐...,1,"先说,我,算是,娜娜,家风,荷园,开业,就,一直,在,这里,吃,?,?,每次,出去,回来,总..."


In [7]:


def word_cut (texts):
    words_list = []
    word_generator = jieba.cut(texts, cut_all=False)  # 返回的是一个迭代器
    with open('hit_stopwords.txt') as f:
        str_text = f.read()
    for word in word_generator:
        if word.strip() not in str_text:
            words_list.append(word)
            #print ('1')
    return ' '.join(words_list)  # 注意是空格

data['cut_comment'] = data.comment.apply(word_cut)

data.head()

Unnamed: 0,comment,sentiment,simple_cut_comment,cut_comment
0,口味：不知道是我口高了，还是这家真不怎么样。??我感觉口味确实很一般很一般。上菜相当快，我敢...,0,"口味,：,不,知道,是,我口,高,了,，,还是,这家,真,不怎么样,。,?,?,我,感觉,口...",口味 知道 我口 高 这家 不怎么样 感觉 口味 确实 很 很 上菜 相当 快 我敢 菜 都...
1,菜品丰富质量好，服务也不错！很喜欢！,1,"菜品,丰富,质量,好,，,服务,也,不错,！,很,喜欢,！",菜品 丰富 质量 服务 不错 很 喜欢
2,说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！,0,"说真的,，,不,晓得,有人,排队,的,理由,，,香精,香精,香精,香精,，,拜拜,！",说真的 晓得 有人 排队 理由 香精 香精 香精 香精 拜拜
3,菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...,1,"菜量,实惠,，,上菜,还,算,比较,快,，,疙瘩汤,喝出,了,秋日,的,暖意,，,烧茄子,吃...",菜量 实惠 上菜 算 比较 快 喝出 秋日 暖意 烧茄子 吃 出 大阪 烧 味道 想 吃 土...
4,先说我算是娜娜家风荷园开业就一直在这里吃??每次出去回来总想吃一回??有时觉得外面的西式简餐...,1,"先说,我,算是,娜娜,家风,荷园,开业,就,一直,在,这里,吃,?,?,每次,出去,回来,总...",先说 算是 娜娜 家风 荷园 开业 吃 每次 出去 回来 总想 吃 一回 有时 觉得 外面 ...


In [8]:
X = data['cut_comment']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# CountVectorizer

vect = TfidfVectorizer(max_df = 0.8, 
                       min_df = 3, 
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b'
                      )

features = pd.DataFrame(vect.fit_transform(X_train).toarray(), columns=vect.get_feature_names())
features.head()

Unnamed: 0,ipad,ok,ps,一下,一下子,一个个,一个半,一个多,一人,一份,...,麻婆豆腐,麻将,麻烦,麻辣,麻酱,麻麻,黄瓜,黄盖,黄色,齐全
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors=5) 

X_train_vect = vect.fit_transform(X_train)
knn.fit(X_train_vect, y_train)

train_accuracy = knn.score(X_train_vect, y_train)
print (train_accuracy)

0.8


In [16]:
X_test_vect = vect.transform(X_test)
test_accuracy = knn.score(X_test_vect, y_test)

y_predict = knn.predict(X_test_vect)

print('测试准确率', test_accuracy)

from sklearn.metrics import classification_report
print("测试集上其他指标：\n",classification_report(y_test, y_predict))

测试准确率 0.67
测试集上其他指标：
               precision    recall  f1-score   support

           0       0.63      0.89      0.73       206
           1       0.79      0.44      0.56       194

    accuracy                           0.67       400
   macro avg       0.71      0.66      0.65       400
weighted avg       0.70      0.67      0.65       400



In [17]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

X_train_vect = vect.fit_transform(X_train)
nb.fit(X_train_vect, y_train)

train_accuracy = nb.score(X_train_vect, y_train)
print (train_accuracy)

0.90625


In [18]:
X_test_vect = vect.transform(X_test)
test_accuracy = nb.score(X_test_vect, y_test)

y_predict = nb.predict(X_test_vect)

print('测试准确率', test_accuracy)

from sklearn.metrics import classification_report
print("测试集上其他指标：\n",classification_report(y_test, y_predict))

测试准确率 0.8425
测试集上其他指标：
               precision    recall  f1-score   support

           0       0.87      0.81      0.84       206
           1       0.81      0.88      0.84       194

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



In [19]:
X_vec = vect.transform(X)
nb_result = nb.predict(X_vec)
data['nb_result'] = nb_result

data.head()

Unnamed: 0,comment,sentiment,simple_cut_comment,cut_comment,nb_result
0,口味：不知道是我口高了，还是这家真不怎么样。??我感觉口味确实很一般很一般。上菜相当快，我敢...,0,"口味,：,不,知道,是,我口,高,了,，,还是,这家,真,不怎么样,。,?,?,我,感觉,口...",口味 知道 我口 高 这家 不怎么样 感觉 口味 确实 很 很 上菜 相当 快 我敢 菜 都...,0
1,菜品丰富质量好，服务也不错！很喜欢！,1,"菜品,丰富,质量,好,，,服务,也,不错,！,很,喜欢,！",菜品 丰富 质量 服务 不错 很 喜欢,1
2,说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！,0,"说真的,，,不,晓得,有人,排队,的,理由,，,香精,香精,香精,香精,，,拜拜,！",说真的 晓得 有人 排队 理由 香精 香精 香精 香精 拜拜,0
3,菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...,1,"菜量,实惠,，,上菜,还,算,比较,快,，,疙瘩汤,喝出,了,秋日,的,暖意,，,烧茄子,吃...",菜量 实惠 上菜 算 比较 快 喝出 秋日 暖意 烧茄子 吃 出 大阪 烧 味道 想 吃 土...,1
4,先说我算是娜娜家风荷园开业就一直在这里吃??每次出去回来总想吃一回??有时觉得外面的西式简餐...,1,"先说,我,算是,娜娜,家风,荷园,开业,就,一直,在,这里,吃,?,?,每次,出去,回来,总...",先说 算是 娜娜 家风 荷园 开业 吃 每次 出去 回来 总想 吃 一回 有时 觉得 外面 ...,0


In [20]:
nb_probs = nb.predict_proba (X_vec)
print (nb_probs[0:5])

[[0.59840531 0.40159469]
 [0.2164415  0.7835585 ]
 [0.79246598 0.20753402]
 [0.13920812 0.86079188]
 [0.63556393 0.36443607]]


In [21]:
import math

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (math.exp(coef_1), fn_1, math.exp(coef_2), fn_2))

show_most_informative_features(vect, nb)

	0.0002	一副             		0.0162	不错             
	0.0002	一句             		0.0128	味道             
	0.0002	一声             		0.0097	好吃             
	0.0002	一根             		0.0072	环境             
	0.0002	一点儿            		0.0064	喜欢             
	0.0002	一脸             		0.0054	菜品             
	0.0002	一身             		0.0051	口味             
	0.0002	一边             		0.0048	排队             
	0.0002	一锅             		0.0047	很多             
	0.0002	七个             		0.0047	比较             
	0.0002	三口             		0.0044	有点             
	0.0002	三次             		0.0040	没有             
	0.0002	上去             		0.0040	服务             
	0.0002	上吐下泻           		0.0039	感觉             
	0.0002	上当             		0.0038	推荐             
	0.0002	不上             		0.0033	烤肉             
	0.0002	不去             		0.0031	东西             
	0.0002	不差             		0.0030	排骨             
	0.0002	不带             		0.0030	人太多            
	0.0002	不惯             		0.0030	特别             


