In [49]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tourism-data/test/test.xlsx
/kaggle/input/tourism-data/test/result1.csv
/kaggle/input/tourism-data/train/2018-2019.xlsx
/kaggle/input/tourism-data/train/2020-2021.xlsx


In [50]:
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import jieba
import re
%matplotlib inline
warnings.filterwarnings('ignore')

# Task 1

In [None]:
# Load the article sheet from both training workbooks plus the test articles.
# All three files are addressed through one absolute dataset root so that the
# cell does not depend on the kernel's current working directory.
DATA_DIR = '/kaggle/input/tourism-data'
ARTICLE_SHEET = 4  # zero-based position of the article sheet in the training workbooks

data1 = pd.read_excel(f'{DATA_DIR}/train/2018-2019.xlsx', sheet_name=ARTICLE_SHEET)
data2 = pd.read_excel(f'{DATA_DIR}/train/2020-2021.xlsx', sheet_name=ARTICLE_SHEET)
test_data = pd.read_excel(f'{DATA_DIR}/test/test.xlsx')

In [None]:
# Preview the first rows of the 2018-2019 sheet to check its column layout.
data1.head()

In [None]:
# Preview the first rows of the 2020-2021 sheet to check its column layout.
data2.head()

In [None]:
# Give both training sheets the same English column names so they can be stacked.
columns = ['ID', 'Title', 'Date', 'Content']
for frame in (data1, data2):
    frame.columns = columns

In [None]:
# Stack both training periods, fold the title into the body text (title on the
# first line), and keep only the ID and the merged Content column.
data = (
    pd.concat([data1, data2], ignore_index=True)
    .assign(Content=lambda df: df['Title'] + '\n' + df['Content'])
    .drop(columns=['Title', 'Date'])
)
data.head()

In [None]:
# Discard training rows that have a missing value in any column.
data = data.dropna()

In [None]:
# Preview the first rows of the test sheet to check its column layout.
test_data.head()

In [None]:
# Same treatment as the training frame: English column names, title merged
# into the body text, title column removed.
test_data.columns = ['ID', 'Title', 'Content']
test_data = (
    test_data
    .assign(Content=lambda df: df['Title'] + '\n' + df['Content'])
    .drop(columns=['Title'])
)
test_data.head()

In [None]:
# Remember how many rows are training data; later cells slice on this boundary.
train_size = len(data)
print(f'Train data shape: {data.shape}')
print(f'Test data shape: {test_data.shape}')

In [None]:
# Append the test articles after the training rows so both go through the same
# preprocessing: rows [0, train_size) are training data, the rest are test data.
# Slicing to train_size first keeps this cell idempotent; without it, re-running
# the cell would append the test rows a second time.
data = pd.concat([data.iloc[:train_size], test_data], ignore_index=True)
data.shape

In [None]:
def preprocessing(data):
    """Clean one raw document and return its jieba tokens joined by single spaces.

    Steps:
      1. Split the text on whitespace into chunks.
      2. For every chunk, take its first non-word character that is not one of
         the sentence marks '。' / '，'; the union of these characters is
         removed from all chunks.
      3. From each chunk remove whitespace, runs of two or more non-word
         characters, digits and underscores; drop chunks that end up empty.
      4. Segment the remaining chunks with jieba, skipping the '，' token.

    Args:
        data: The document text. Anything that is not a ``str`` (for example a
            NaN cell coming from pandas) is treated as an empty document.

    Returns:
        A string of space-separated tokens; '' when nothing is left.
    """
    # Explicit type check instead of catching every exception around .split().
    if not isinstance(data, str):
        return ''
    lines = data.split()

    # Step 2: collect the leading "noise" symbol of each chunk.
    non_word = re.compile(r'\W')
    syms = set()
    for line in lines:
        punct = [ch for ch in non_word.findall(line) if ch not in '。，']
        if punct and re.match(r'\S', punct[0]):
            syms.add(punct[0])

    def strip_noise(line):
        """Remove collected symbols, punctuation runs, digits and underscores."""
        for sym in syms:
            line = line.replace(sym, '')
        line = re.sub(r'\s', '', line)
        line = re.sub(r'\W{2,}', '', line)
        line = re.sub(r'\d', '', line)
        return line.replace(' ', '').replace('_', '')

    # Step 3: digits are already gone here, so "empty" is the only thing left
    # to filter out.
    cleaned = [chunk for chunk in map(strip_noise, lines) if chunk]

    # Step 4: tokenize.
    tokens = [word
              for chunk in cleaned
              for word in jieba.cut(chunk)
              if word != '，']
    return ' '.join(tokens)

In [None]:
# Smoke-test the cleaner on the row labelled 0 before applying it to every row.
preprocessing(data.loc[0, 'Content'])

In [None]:
# Tokenize every document; 'Cut' holds the space-joined tokens.
data = data.assign(Cut=data['Content'].map(preprocessing))
data.head()

In [None]:
# Confirm the frame's dimensions after adding the 'Cut' column.
data.shape

## 1.1 By LDA

In [None]:
def _retokenize(cut_text):
    """Segment the space-joined text again and drop tokens that contain a space."""
    return [token for token in jieba.lcut(cut_text) if ' ' not in token]


data['AsList'] = data['Cut'].apply(_retokenize)
data['AsList'].head()

In [None]:
# Plain Python list of token lists, one entry per row of `data`, in row order.
text_data = data['AsList'].tolist()

In [None]:
from gensim import corpora
from gensim.models import TfidfModel

# Build the vocabulary, remove the 200 most frequent tokens, then encode every
# document as a bag of words.
dictionary = corpora.Dictionary(text_data)
dictionary.filter_n_most_frequent(200)
corpus = list(map(dictionary.doc2bow, text_data))

# tfidf = TfidfModel(corpus)
# tfidf.save('task_1_tfidf.model')
# # corpus = tfidf[corpus]

In [None]:
# Save the dictionary as 'task_1.dict' in the current working directory.
dictionary.save('task_1.dict')

In [None]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 200
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so the learned topics are reproducible across runs.

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

model.save('task_1.model')  # Persist the trained model to disk.

In [None]:
# Show the topics ordered by topic id.
topic_list = sorted(model.print_topics(), key=lambda pair: pair[0])
print(topic_list)

In [None]:
top_topics = model.top_topics(corpus)

# Average topic coherence: the coherence scores of all topics summed up and
# divided by the number of topics.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics
print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

# from pprint import pprint
# pprint(top_topics)

In [None]:
# Keep only the words of every topic, in the order returned by top_topics().
lda_topics = [
    [word for _, word in weighted_words]
    for weighted_words, _ in top_topics
]
lda_topics

In [None]:
# Spot check on one document (position 23): print its tokens, convert them to a
# bag of words and display what the trained LDA model returns for it.
test_doc = text_data[23]
print(test_doc)
doc_bow = dictionary.doc2bow(test_doc)
# doc_tfidf = tfidf[doc_bow]
doc_lda = model[doc_bow]
doc_lda

In [None]:
def judge_coherence(doc):
    """Decide whether a tokenized document is tourism-related via its LDA topics.

    The document is converted to a bag of words, the trained model assigns it a
    set of topics, and the document counts as related when any top word of any
    assigned topic contains one of the tourism seed keywords.

    Args:
        doc: List of tokens of one document.

    Returns:
        True if a seed keyword occurs inside a top word of an assigned topic,
        otherwise False.
    """
    coh_dict = ['旅游', '活动', '节庆', '特产', '交通', '酒店', '景区', '景点',
                '文创', '文化', '乡村旅游', '民宿', '假日', '假期', '游客', '采摘',
                '赏花', '春游', '踏青', '康养', '公园', '滨海游', '度假', '农家乐',
                '剧本杀', '旅行', '徒步', '工业旅游', '线路', '自驾游', '团队游',
                '攻略', '游记', '包车', '玻璃栈道', '游艇', '高尔夫', '温泉']
    # Use the document that was passed in, not a module-level sample document.
    doc_bow = dictionary.doc2bow(doc)
    topic_words = []
    for topic_id, _ in model[doc_bow]:
        # Ask the model for the words of this exact topic id. A list ordered by
        # coherence cannot be indexed by topic id, and topic ids start at 0.
        topic_words.extend(word for word, _ in model.show_topic(topic_id, topn=20))

    return any(keyword in word for word in topic_words for keyword in coh_dict)

In [None]:
# Everything after the first train_size documents belongs to the test set.
test_text = text_data[train_size:]
test_res = list(map(judge_coherence, test_text))

## 1.2 By TF-IDF and TextRank

In [None]:
import jieba.analyse

# Tourism seed keywords shared by both keyword-based classifiers below.
_TOURISM_KEYWORDS = (
    '旅游', '活动', '节庆', '特产', '交通', '酒店', '景区', '景点',
    '文创', '文化', '乡村旅游', '民宿', '假日', '假期', '游客', '采摘',
    '赏花', '春游', '踏青', '康养', '公园', '滨海游', '度假', '农家乐',
    '剧本杀', '旅行', '徒步', '工业旅游', '线路', '自驾游', '团队游',
    '攻略', '游记', '包车', '玻璃栈道', '游艇', '高尔夫', '温泉',
)


def _mentions_tourism(keywords):
    """Return True if any extracted keyword contains a tourism seed keyword."""
    return any(seed in keyword
               for keyword in keywords
               for seed in _TOURISM_KEYWORDS)


def judge_by_tfidf(text):
    """Return True if one of the top-50 TF-IDF keywords of `text` is tourism-related."""
    keywords = jieba.analyse.extract_tags(text, topK=50, withWeight=False)
    return _mentions_tourism(keywords)


def judge_by_textrank(text):
    """Return True if one of the top-50 TextRank keywords of `text` is tourism-related."""
    keywords = jieba.analyse.textrank(text, topK=50, withWeight=False)
    return _mentions_tourism(keywords)


def judge(text):
    """Label `text` '相关' only when both the TF-IDF and TextRank checks agree, else '不相关'."""
    return '相关' if judge_by_tfidf(text) and judge_by_textrank(text) else '不相关'

# Classify the test rows (everything from position train_size onward).
test_res = [judge(text) for text in data['Cut'].iloc[train_size:]]

In [None]:
# Task 1 submission: one row per test article with its predicted label.
submission_columns = {
    '文章ID': test_data['ID'].tolist(),
    '分类标签': test_res,
}
task1_res = pd.DataFrame(submission_columns)
task1_res.to_csv('result1.csv', index=False)

# Task 2

In [60]:
def read_in(path):
    """Read the four review sheets of one workbook into a single ID/Content frame.

    Sheets 0-3 are read in one pass. Each sheet is reduced to an ``ID`` and a
    ``Content`` column, and every ID gets a prefix naming its source
    ('酒店评论-', '景区评论-', '游记-', '餐饮评论-') so IDs stay distinguishable
    after stacking.

    Sheet-specific handling:
      * sheet 1: rows whose Name contains '湛江', '广东海洋大学' or '南极长城站'
        are dropped; rows with a missing Name are kept.
      * sheet 3: the title is put in front of the review text, separated by a
        newline.

    Args:
        path: Path of the Excel workbook.

    Returns:
        DataFrame with columns ``ID`` and ``Content`` and a fresh 0..n-1 index.
    """
    # One parse of the workbook instead of one per sheet.
    sheets = pd.read_excel(path, sheet_name=[0, 1, 2, 3])

    def tagged(frame, prefix):
        """Return a copy with only ID/Content, the ID prefixed with `prefix`."""
        out = frame[['ID', 'Content']].copy()
        out['ID'] = out['ID'].apply(lambda x: prefix + str(x))
        return out

    hotel = sheets[0].copy()
    hotel.columns = ['ID', 'City', 'Name', 'RevDate', 'Content', 'CheckDate', 'HouseType']

    scenic = sheets[1].copy()
    scenic.columns = ['ID', 'City', 'Name', 'Date', 'Content']
    # na=False: a missing name cannot match, and must not break the boolean mask.
    excluded = scenic['Name'].str.contains('湛江|广东海洋大学|南极长城站', na=False)
    scenic = scenic[~excluded]

    # .copy() so the column edits below act on an independent frame, not a slice.
    travel = sheets[2][['游记ID', '正文']].copy()
    travel.columns = ['ID', 'Content']

    dining = sheets[3][['餐饮评论ID', '评论内容', '标题']].copy()
    dining.columns = ['ID', 'Content', 'Title']
    dining['Content'] = dining['Title'] + '\n' + dining['Content']

    return pd.concat(
        [
            tagged(hotel, '酒店评论-'),
            tagged(scenic, '景区评论-'),
            tagged(travel, '游记-'),
            tagged(dining, '餐饮评论-'),
        ],
        ignore_index=True,
    )

In [52]:
# Task 2 corpus: read both workbooks with read_in() and stack them.
# NOTE: this rebinds data1, data2 and data, which held the Task 1 frames.
data1 = read_in('../input/tourism-data/train/2018-2019.xlsx')
data2 = read_in('../input/tourism-data/train/2020-2021.xlsx')
data = pd.concat([data1, data2], ignore_index=True)
data.head(), data.shape

(          ID                                            Content
 0  酒店评论-1001                                            干净卫生服务好
 1  酒店评论-1002                                           环境可以，干净！
 2  酒店评论-1003  环境不错，房间卫生都很好，生活也很方便，就是隔音效果不理想，有时太吵。我定的优惠价，性价比很...
 3  酒店评论-1004                                    很好.......舒服态度不错
 4  酒店评论-1005                                 #卫生# #设计风格# #酒店餐饮#,
 (9496, 2))

In [53]:
# Drop rows without text first: re.sub() cannot handle NaN, and a dropna()
# whose result is not assigned has no effect.
data = data.dropna(subset=['Content']).reset_index(drop=True)

# Remove blocks made of a date-like "d-d-d" line followed by an "http..." line
# (optionally with the text before them), and remove every space character.
header_or_space = re.compile(r'^.*?\n\d+\-\d+\-\d+.*?\nhttp.*?\n|\n.*?\d+\-\d+\-\d+.*?\nhttp.*?\n|\d+\-\d+\-\d+.*?\nhttp.*?\n| ')
data['Content'] = data['Content'].apply(lambda x: header_or_space.sub('', x))
data.head(), data.shape

(          ID                                            Content
 0  酒店评论-1001                                            干净卫生服务好
 1  酒店评论-1002                                           环境可以，干净！
 2  酒店评论-1003  环境不错，房间卫生都很好，生活也很方便，就是隔音效果不理想，有时太吵。我定的优惠价，性价比很...
 3  酒店评论-1004                                    很好.......舒服态度不错
 4  酒店评论-1005                                   #卫生##设计风格##酒店餐饮#,
 (9496, 2))

In [55]:
# !pip install pyhanlp
# !pip install foolnltk
# !pip install tensorflow==1.14

In [56]:
from pyhanlp import *
import fool

# Run every document through HanLP.convertToSimplifiedChinese (presumably
# Traditional -> Simplified Chinese, going by the method name).
data['Content'] = data['Content'].apply(HanLP.convertToSimplifiedChinese)

In [None]:
# Empty result table (corpus ID, product ID, entity name) and a counter that
# starts at 1 for numbering product IDs.
task2_1 = pd.DataFrame(columns=['ID', 'pID', 'Name'])
cnt = 1

# Characters that disqualify an entity when they appear anywhere in its name.
_REGION_MARKERS = '国市区州'

# Place names that are rejected on exact match.
_EXCLUDED_PLACES = frozenset([
    '茂名', '湛江', '河东', '粤西', '水东',
    '河北', '山西', '辽宁', '吉林', '黑龙江',
    '江苏', '浙江', '安徽', '福建', '江西',
    '山东', '河南', '湖北', '湖南', '广东',
    '海南', '四川', '贵州', '云南', '陕西',
    '甘肃', '青海', '台湾', '内蒙古', '广西',
    '西藏', '宁夏', '新疆', '北京', '天津',
    '上海', '重庆', '香港', '澳门',
])


def judge(entity: str):
    """Return True if a location entity should be kept as a product name.

    An entity is rejected when it contains one of the characters in
    ``_REGION_MARKERS`` or when it exactly equals one of ``_EXCLUDED_PLACES``.
    """
    if any(marker in entity for marker in _REGION_MARKERS):
        return False
    return entity not in _EXCLUDED_PLACES

# Extract location entities from every document and write them out as products.
# fool.analysis(text)[1][0] is used as the entity list of the document; each
# entity unpacks into four fields, the last two being its type and its text.
records = []
for ID, text in zip(data['ID'], data['Content']):
    res = fool.analysis(text)[1][0]
    for _, _, type_, entity in res:
        if type_ != 'location':
            continue
        # Normalize BEFORE filtering, so that e.g. ' 茂名' is compared against
        # the exclusion list as '茂名' instead of slipping through.
        entity = entity.strip().replace(' ', '').replace('\n', '')
        if entity and judge(entity):
            records.append({
                '语料ID': ID,
                '产品ID': 'ID' + str(len(records) + 1),  # numbered from 1 in output order
                '产品名称': entity,
            })

# Build the frame once from the collected rows instead of growing it row by row.
task2_1 = pd.DataFrame(records, columns=['语料ID', '产品ID', '产品名称'])
task2_1.to_csv('result2-1.csv', index=False)