## 1.酒店评论情感分析
我们这里解决的问题，是一个具体场景下的情感分析。准确一点说，是我们想借助自然语言处理对文本的情感分类能力，自动对酒店评论数据进行情感分析，进而借助情感分析的结果完成酒店的筛选。

## 2. 数据读取


### 2.1工具库

In [1]:
import warnings
warnings.filterwarnings('ignore')
import jieba
import numpy as np
import codecs  ##codecs提供的open方法来指定打开的文件的语言编码，它会在读取的时候自动转换为内部unicode 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud   # 词云包

### 2.2 停用词

In [5]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = [entry.strip() for entry in stopword_file.readlines()]

### 2.3 评论数据处理

In [8]:
import os
import sys

def get_content(fullname):
    """Read a GBK-encoded text file and return its non-empty lines.

    Every line is stripped of leading/trailing whitespace, and lines that
    are empty after stripping are dropped. Bytes that cannot be decoded as
    GBK are skipped rather than raising (``errors='ignore'``).

    Args:
        fullname: Path of the file to read.

    Returns:
        list[str]: The stripped, non-empty lines in file order.
    """
    # The context manager closes the handle even if reading raises.
    with codecs.open(fullname, 'r', encoding='gbk', errors='ignore') as f:
        stripped = (raw.strip() for raw in f.readlines())
        return [text for text in stripped if text]

# Root directory of the raw corpus; it must contain one sub-folder per entry
# in `folders`, each holding the review files that get merged below.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
# Raw string: the backslashes are Windows path separators, not escape sequences.
inp = r'E:\下载\senti_analysis-master\data\ChnSentiCorp_htl_ba_2000'
folders = ['neg', 'pos']
for foldername in folders:
    outp = '1000_' + foldername + '.txt'   # merged output file for this folder
    rootdir = os.path.join(inp, foldername)

    # Write GBK explicitly: the merged files are read back with encoding='gbk'
    # later in this notebook, so relying on the platform default encoding would
    # break on systems where that default is not GBK. `with` guarantees the
    # handle is closed even if a read fails midway.
    with open(outp, 'w', encoding='gbk') as output:
        # os.listdir returns entries in arbitrary order; sort them so the
        # merged file (and everything derived from it) is reproducible.
        for each_file in sorted(os.listdir(rootdir)):
            contents = get_content(os.path.join(rootdir, each_file))
            # One review per output line: the file's non-empty lines are
            # concatenated without a separator.
            output.write(''.join(contents) + '\n')
        

### 2.4 读取评论数据

In [10]:
def read_file(in_f, sentiment, stopwords, words, sentences):
    """Tokenise each line of a GBK-encoded file and collect the results.

    Every line is stripped and segmented with ``jieba.lcut``. Tokens that
    appear in ``stopwords`` or are shorter than two characters are dropped.
    Results are accumulated in place; nothing is returned.

    Args:
        in_f: Path of the file to read (one review per line, GBK-encoded).
        sentiment: Label stored alongside every review read from this file.
        stopwords: Iterable of tokens to discard.
        words: List extended in place with every kept token.
        sentences: List extended in place with one ``(tokens, sentiment)``
            tuple per line.
    """
    # Build the lookup set once: membership tests on a set are O(1), whereas
    # scanning the stop-word list for every token is linear in its length.
    stopword_set = set(stopwords)

    with open(in_f, 'r', encoding='gbk') as f:
        for line in f:
            try:
                segs = jieba.lcut(line.strip())
                # Stop-word and minimum-length filtering.
                segs = [
                    word for word in segs
                    if len(word) > 1 and word not in stopword_set
                ]
                # Record the kept tokens.
                words.extend(segs)
                # Record the (tokenised review, sentiment) pair.
                sentences.append((segs, sentiment))
            except Exception:
                # Best effort: show the offending line and keep going.
                # `Exception` (instead of a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                print(line)
                continue
                
# Load the data: both lists are filled in place by read_file.
words = []
sentences = []
# (merged review file, label) pairs -- positive reviews (label 1) are read
# first, then negative reviews (label 0).
for in_f, sentiment in (('1000_pos.txt', 1), ('1000_neg.txt', 0)):
    read_file(in_f, sentiment, stopwords, words, sentences)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WANGZH~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.060 seconds.
Prefix dict has been built succesfully.


In [11]:
# Preview the first ten tokens collected in `words`.
words[:10]

['距离', '川沙', '公路', '较近', '公交', '指示', '蔡陆线', '非常', '麻烦', '建议']

In [12]:
# Preview the first two (token list, sentiment label) tuples.
sentences[:2]

[(['距离',
   '川沙',
   '公路',
   '较近',
   '公交',
   '指示',
   '蔡陆线',
   '非常',
   '麻烦',
   '建议',
   '路线',
   '房间',
   '较为简单'],
  1),
 (['商务', '大床', '房间', '很大', '床有', '2M', '整体', '感觉', '经济', '实惠', '不错'], 1)]

## 3.数据分析

In [15]:
# Count how often each token occurs across all reviews.
words_df = pd.DataFrame({'评论词语': words})
# groupby(...).size() replaces the former .agg({'计数': np.size}): dict-based
# renaming inside agg() was removed in pandas 1.0 and raises
# SpecificationError there. reset_index(name=...) yields the same two
# columns ('评论词语', '计数') on both old and new pandas versions.
words_stat = words_df.groupby(by=['评论词语']).size().reset_index(name='计数')
# Most frequent tokens first.
words_stat = words_stat.sort_values(by=['计数'], ascending=False)
words_stat.head(20)

Unnamed: 0,评论词语,计数
10533,酒店,2741
5629,房间,1899
6779,服务,950
7373,没有,875
2225,入住,764
1235,不错,758
7186,比较,552
2675,前台,509
5489,感觉,498
10930,非常,497


## 4.机器学习解决方案

In [16]:
# Split the data: hold out 10% of the reviews for testing (fixed seed).
from sklearn.model_selection import train_test_split
# Features: each token list joined into one space-separated string.
x = [' '.join(tokens) for tokens, _ in sentences]
# Labels: kept as a tuple, in the same order as the features.
y = tuple(label for _, label in sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)

In [17]:
# Number of samples in the training split.
len(x_train)

1800