# 导入库

In [1]:
import pandas as pd
import numpy as np
from snownlp import SnowNLP
from snownlp import sentiment
from zhon.hanzi import punctuation
import string
import re
import jieba
import copy
from gensim.corpora import Dictionary
from pprint import pprint
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import warnings
import dataframe_image as dfi
warnings.filterwarnings("ignore")

# 定义景区名称、加载停用词

In [2]:
# Attractions grouped by category.
sandbeach = [
    '第一海水浴场',
    '黄岛金沙滩',
    '石老人海水浴场',
    '第二海水浴场',
    '第三海水浴场',
]
aquarium = [
    '青岛水族馆',
    '青岛海底世界',
    '青岛海昌极地海洋公园',
]
skywheel = [
    '青岛海泉湾海洋之星摩天轮',
    '琴岛之眼摩天轮',
]
park = [
    '栈桥',
    '青岛奥帆中心',
    '小青岛',
    '琅琊台风景区',
]

# Every attraction in one list. Its ordering is its own and is NOT the
# concatenation of the category lists above.
name = [
    '第一海水浴场',
    '青岛水族馆',
    '青岛海底世界',
    '青岛海昌极地海洋公园',
    '青岛海泉湾海洋之星摩天轮',
    '黄岛金沙滩',
    '石老人海水浴场',
    '第二海水浴场',
    '第三海水浴场',
    '琴岛之眼摩天轮',
    '栈桥',
    '青岛奥帆中心',
    '小青岛',
    '琅琊台风景区',
]

In [1]:
# Load the stop-word list, one word per line.
# The file is opened with Python's default universal-newline handling so that
# "\n", "\r\n" and "\r" terminated files all yield one entry per line; the
# earlier split('\r\n') turned an LF-only file into a single giant entry.
# Empty lines (e.g. the one produced by a trailing newline) are dropped.
# `stopwords` stays a list because it is later passed as `stop_words=` to the
# scikit-learn vectorizer.
with open('stopwords.txt', 'r', encoding='utf-8') as txtfile:
    stopwords = [word for word in (line.rstrip('\n') for line in txtfile) if word]

# 贝叶斯分类模型训练

In [4]:
def load_corpus(path):
    """Read a labelled corpus file and return ``(text, label)`` pairs.

    Every non-blank line is split on its first two commas into three fields:
    the first field is discarded, the second is parsed as an integer label,
    and the remainder (which may itself contain commas) is cleaned and
    tokenised by ``processing``.

    Args:
        path: Location of the UTF-8 encoded corpus file.

    Returns:
        A list with one ``(processed_text, label)`` tuple per non-blank line,
        in file order.

    Raises:
        ValueError: If a non-blank line has fewer than three comma-separated
            fields, or its label field is not an integer.
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # A blank line (typically a trailing newline at end of file)
            # cannot be unpacked into three fields, so skip it instead of
            # aborting the whole load.
            if not line.strip():
                continue
            _, sentiment, content = line.split(",", 2)
            data.append((processing(content), int(sentiment)))
    return data


def processing(text):
    """Clean one raw text and return its tokens joined by single spaces.

    Steps, in order:
      1. Replace ``{%...%}`` spans, ``@...`` mentions (up to the next space or
         the end of the string), ``【...】`` spans and zero-width spaces with
         a space.
      2. Tokenise with ``jieba.lcut`` and keep only tokens for which
         ``str.isalpha()`` is true (drops digits, punctuation, whitespace).
      3. Glue every stand-alone "不" token onto the token that follows it.
         A "不" that is the very last token is left unchanged.

    Args:
        text: The raw string to clean.

    Returns:
        str: The surviving tokens separated by single spaces.
    """
    # Raw strings keep the regex escapes (\{ \}) away from Python's own
    # string-escape processing, which warns about them in non-raw literals.
    text = re.sub(r"\{%.+?%\}", " ", text)
    text = re.sub(r"@.+?( |$)", " ", text)
    text = re.sub(r"【.+?】", " ", text)
    text = re.sub("\u200b", " ", text)
    tokens = [w for w in jieba.lcut(text) if w.isalpha()]

    # Single left-to-right pass. A merged token is always longer than "不",
    # so it can never be merged again; this matches repeatedly merging the
    # left-most stand-alone "不" without rescanning the list each time.
    merged = []
    i = 0
    total = len(tokens)
    while i < total:
        if tokens[i] == "不" and i + 1 < total:
            merged.append(tokens[i] + tokens[i + 1])
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return " ".join(merged)

In [5]:
TRAIN_PATH = "train.txt"
train_data = load_corpus(TRAIN_PATH)
df_train = pd.DataFrame(train_data, columns=["words", "label"])

# TF-IDF features over the space-separated tokens. The token pattern accepts a
# run of word characters optionally wrapped in square brackets; it is a raw
# string so the regex escapes are not treated as (invalid) Python escapes.
# A CountVectorizer used to be built on the line before and was immediately
# overwritten by this assignment, so it has been dropped.
vectorizer = TfidfVectorizer(token_pattern=r'\[?\w+\]?', stop_words=stopwords)
X_train = vectorizer.fit_transform(df_train["words"])
y_train = df_train["label"]

# Multinomial naive Bayes classifier fitted on the TF-IDF matrix.
clf = MultinomialNB()
clf.fit(X_train, y_train)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ASUS\AppData\Local\Temp\jieba.cache
Loading model cost 0.583 seconds.
Prefix dict has been built successfully.


MultinomialNB()

# 评论读取以及标签

In [6]:
# Gather every review of the attractions in `sandbeach`, together with the
# SnowNLP sentiment value of each review. `comments` and `score` stay aligned
# index by index.
comments = []
score = []
for place in sandbeach:
    path = 'clean_data/' + place + 'new_.csv'
    data = pd.read_csv(path)
    reviews = [data['评论'][row] for row in range(len(data['评论']))]
    comments.extend(reviews)
    score.extend(SnowNLP(review).sentiments for review in reviews)

In [7]:
def process(comments):
    """Strip punctuation from one review and return space-separated tokens.

    Despite the plural parameter name, ``comments`` is a single string.

    Steps:
      1. Delete every character contained in zhon's ``punctuation``,
         ``string.punctuation`` or equal to a space.
      2. Replace anything still matching the pattern below with a space.
         Because step 1 already removed ``. - : ; ) ( ? "``, only tabs and
         newlines can still match.
      3. Cut the result with jieba (``cut_all=False``) and join the tokens
         with single spaces.

    Args:
        comments: The review text to clean.

    Returns:
        str: The jieba tokens joined by single spaces.
    """
    # One translate() pass deletes all unwanted characters at once instead of
    # calling str.replace once per punctuation character.
    punctuation_list = punctuation + string.punctuation + " "
    comments = comments.translate(str.maketrans('', '', punctuation_list))
    # Raw string: \. \) \( \? are regex escapes, not Python string escapes.
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
    comments = re.sub(pattern, ' ', comments)
    seg_list_exact = jieba.lcut(comments, cut_all=False)
    return ' '.join(seg_list_exact)

In [8]:
# Tokenise each review, project it into the fitted vectorizer's feature space
# and let the classifier predict a label per review.
# NOTE(review): the training corpus is cleaned with `processing` while these
# reviews are cleaned with `process` - confirm the differing preprocessing is
# intended.
words = list(map(process, comments))
vec = vectorizer.transform(words)
grade = clf.predict(vec)

In [9]:
# One row per review: raw text, SnowNLP sentiment value, classifier prediction.
df = pd.DataFrame(dict(comments=comments, sentiments=score, grade=grade))
df

Unnamed: 0,comments,sentiments,grade
0,不错接着期待还带孩子来,9.828904e-01,1
1,好,6.558628e-01,1
2,方便快捷，实惠,8.213507e-01,0
3,还要押金十块。。对象不知道押金的事，洗完出门的时候看前面女的把钥匙给门口大姨，她也把钥匙给门...,1.053449e-05,0
4,服务不好！环境不好！洗澡水太冷！,4.819800e-03,0
...,...,...,...
3107,这里的海水比较干净适合洗海澡。,5.837657e-01,1
3108,千万别再对面的街上吃饭，宰客很严重，一斤半的鱼能称重3斤。一个馒头两块,2.332884e-07,1
3109,春光乍好，和恋人一起去一个向往已久的地方，抛开工作和学业，开启无忧无虑的嗨嗨嗨模式吧～\n这...,1.000000e+00,1
3110,最近几天去过，很肮脏！垃圾遍布沙滩，人屎狗屎很多！搭的大小帐篷到处都是！整个浴场弥漫着垃圾的...,6.031790e-02,0


In [10]:
#dfi.export(df,'1.png',max_rows=10)

In [11]:
def convert_grades(score, threshold=0.5):
    """Map a numeric sentiment score to a binary label.

    Args:
        score: The sentiment score to classify.
        threshold: Largest score that is still labelled 0. Defaults to 0.5,
            the cut-off that used to be hard-coded.

    Returns:
        int: 0 when ``score <= threshold``, otherwise 1. A NaN score compares
        false against any threshold and therefore yields 1.
    """
    return 0 if score <= threshold else 1

In [12]:
# Derive a binary 'labels' column from the 'sentiments' column via
# convert_grades, then display the frame.
df['labels'] = df['sentiments'].map(convert_grades)
df

Unnamed: 0,comments,sentiments,grade,labels
0,不错接着期待还带孩子来,9.828904e-01,1,1
1,好,6.558628e-01,1,1
2,方便快捷，实惠,8.213507e-01,0,1
3,还要押金十块。。对象不知道押金的事，洗完出门的时候看前面女的把钥匙给门口大姨，她也把钥匙给门...,1.053449e-05,0,0
4,服务不好！环境不好！洗澡水太冷！,4.819800e-03,0,0
...,...,...,...,...
3107,这里的海水比较干净适合洗海澡。,5.837657e-01,1,1
3108,千万别再对面的街上吃饭，宰客很严重，一斤半的鱼能称重3斤。一个馒头两块,2.332884e-07,1,0
3109,春光乍好，和恋人一起去一个向往已久的地方，抛开工作和学业，开启无忧无虑的嗨嗨嗨模式吧～\n这...,1.000000e+00,1,1
3110,最近几天去过，很肮脏！垃圾遍布沙滩，人屎狗屎很多！搭的大小帐篷到处都是！整个浴场弥漫着垃圾的...,6.031790e-02,0,0


In [13]:
# Number of reviews per predicted grade.
df.groupby('grade')['grade'].count()

grade
0     255
1    2857
Name: grade, dtype: int64

# 评论分类

In [14]:
# Independent deep copies of the rows whose 'labels' value is 1 / 0.
# NOTE(review): the cell that follows rebinds both names using the 'grade'
# column, so this split is superseded there - confirm which one is intended.
df_postive = df.loc[df['labels'] == 1].copy(deep=True)
df_negative = df.loc[df['labels'] == 0].copy(deep=True)

In [15]:
# Independent deep copies of the rows whose classifier 'grade' is 1 / 0.
# This rebinds df_postive / df_negative, overriding the 'labels'-based split
# made in the preceding cell.
df_postive = df.loc[df['grade'] == 1].copy(deep=True)
df_negative = df.loc[df['grade'] == 0].copy(deep=True)

In [16]:
# Preview the first rows of df_postive.
df_postive.head()

Unnamed: 0,comments,sentiments,grade,labels
0,不错接着期待还带孩子来,0.98289,1,1
1,好,0.655863,1,1
5,99号小伙子，手法很好，超酸爽。过瘾，支持！,0.997289,1,1
6,强烈推荐。多运动 多睡觉 少吃零食 后脑勺不要冒泡 [笑哈哈]嗯😊我最喜欢三月的风，四月的雨...,0.996967,1,1
7,其实已经过了很久了美团还是让评价，很不错哟，跟闺蜜玩得很开心，是个好地方，就是要做好防晒哟，推荐～,0.999643,1,1


In [17]:
# Plain Python lists of the review texts in each subset.
comments_postive = list(df_postive['comments'])
comments_negative = list(df_negative['comments'])

In [18]:
def commens_processing(comments):
    """Tokenise a collection of reviews and drop stop words.

    For every review the function deletes all characters found in zhon's
    ``punctuation``, ``string.punctuation`` or equal to a space, removes
    whatever still matches the pattern below (after the deletion only tabs
    and newlines can), cuts the text with jieba (``cut_all=False``) and
    discards tokens present in the module-level ``stopwords`` collection.
    Reviews left without any token are omitted from the result.

    Args:
        comments: Iterable of review strings,
            e.g. ``['xcbak', 'hxail sfv ssf', 'xanil dqa']``.

    Returns:
        list[list[str]]: One token list per review that kept at least one
        token, in input order.
    """
    # Raw string: \. \) \( \? are regex escapes, not Python string escapes.
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
    # Deleting all punctuation in one translate() pass replaces the former
    # per-character str.replace loop.
    strip_table = str.maketrans('', '', punctuation + string.punctuation + " ")
    # Hoisted set: constant-time stop-word lookups instead of scanning the
    # stop-word list once per token.
    stopword_set = set(stopwords)

    com = []
    for value in comments:
        value = re.sub(pattern, '', value.translate(strip_table))
        comments_seg = [
            word
            for word in jieba.lcut(value, cut_all=False)
            if word not in stopword_set
        ]
        if comments_seg:
            com.append(comments_seg)
    return com

# LDA主题模型

In [19]:
def lda(text, topic=5, passes=10, num_words=10, random_state=None):
    """Fit an LDA topic model on tokenised documents and pretty-print its topics.

    The ``topic`` argument used to be ignored because the number of topics was
    hard-coded to 5. It is now honoured, and its default is 5 so that existing
    ``lda(text)`` calls keep producing five topics.

    Args:
        text: Tokenised documents, i.e. a list of token lists.
        topic: Number of topics to fit and to print.
        passes: Number of training passes handed to ``LdaModel``.
        num_words: Number of words shown per topic.
        random_state: Optional seed forwarded to ``LdaModel``; pass a fixed
            value to make the printed topics repeatable between runs.

    Returns:
        None. The topics are written to standard output via ``pprint``.
    """
    dictionary = Dictionary(text)
    # Bag-of-words representation of every document.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in text]
    ldamodel = LdaModel(doc_term_matrix, num_topics=topic,
                        id2word=dictionary, passes=passes,
                        random_state=random_state)
    pprint(ldamodel.print_topics(num_topics=topic, num_words=num_words))

In [20]:
# Tokenise the reviews in comments_postive and print the LDA topics fitted on them.
text = commens_processing(comments_postive)
lda(text)

[(0,
  '0.030*"海水浴场" + 0.021*"浴场" + 0.012*"八大关" + 0.011*"石老人" + 0.010*"沙滩" + '
  '0.009*"第一" + 0.008*"太平" + 0.008*"海边" + 0.007*"地方" + 0.006*"第三"'),
 (1,
  '0.033*"沙滩" + 0.022*"金沙滩" + 0.016*"沙子" + 0.009*"吹" + 0.008*"孩子" + 0.007*"海风" '
  '+ 0.007*"螃蟹" + 0.007*"细腻" + 0.006*"海边" + 0.006*"沙质"'),
 (2,
  '0.028*"不错" + 0.027*"沙滩" + 0.027*"海水" + 0.024*"沙子" + 0.023*"干净" + '
  '0.013*"海水浴场" + 0.013*"金沙滩" + 0.012*"特别" + 0.011*"海边" + 0.010*"地方"'),
 (3,
  '0.019*"海水浴场" + 0.012*"八大关" + 0.011*"石老人" + 0.011*"路" + 0.008*"石楼" + '
  '0.007*"海边" + 0.007*"海滩" + 0.007*"拍照" + 0.006*"花" + 0.005*"度假"'),
 (4,
  '0.026*"海水浴场" + 0.009*"八大关" + 0.008*"夏季" + 0.006*"不错" + 0.006*"相邻" + '
  '0.006*"游泳" + 0.006*"环境" + 0.005*"浴场" + 0.005*"别墅区" + 0.005*"季节"')]


In [21]:
# Tokenise the reviews in comments_negative and print the LDA topics fitted on them.
# Note: this rebinds `text`, replacing the token lists built from comments_postive.
text = commens_processing(comments_negative)
lda(text)

[(0,
  '0.011*"浴场" + 0.010*"说" + 0.008*"垃圾" + 0.008*"押金" + 0.007*"游客" + 0.007*"行" + '
  '0.007*"沙子" + 0.007*"感觉" + 0.007*"孩子" + 0.007*"凉"'),
 (1,
  '0.011*"点" + 0.009*"服务态度" + 0.008*"不好" + 0.008*"沙滩" + 0.007*"垃圾" + '
  '0.006*"金沙滩" + 0.006*"地方" + 0.006*"孩子" + 0.006*"一会" + 0.006*"环境"'),
 (2,
  '0.011*"感觉" + 0.009*"孩子" + 0.008*"凉水" + 0.007*"不好" + 0.007*"海水" + 0.007*"元" '
  '+ 0.007*"海边" + 0.006*"失望" + 0.006*"沙滩" + 0.005*"说"'),
 (3,
  '0.021*"收费" + 0.011*"说" + 0.011*"沙滩" + 0.011*"地方" + 0.009*"浴场" + '
  '0.008*"海水浴场" + 0.008*"游泳" + 0.007*"元" + 0.005*"失望" + 0.005*"垃圾"'),
 (4,
  '0.020*"说" + 0.013*"不好" + 0.010*"态度" + 0.008*"钱" + 0.008*"收费" + 0.007*"地方" + '
  '0.007*"沙滩" + 0.006*"贵" + 0.006*"坐" + 0.006*"洗澡"')]
