In [2]:
import numpy as np
import pandas as pd
from snownlp import SnowNLP
import re

In [3]:
# Weekly test workbooks to process; each file name is the MMDD code of the crawl date.
test_data_paths = [
    './data/test_data/0926.xlsx',
    './data/test_data/1017.xlsx',
    './data/test_data/1107.xlsx',
    './data/test_data/1114.xlsx',
    './data/test_data/1128.xlsx',
    './data/test_data/1205.xlsx',
    './data/test_data/1212.xlsx',
]

In [4]:
def read_and_adjust_sheet(test_data_path, min_length=15):
    """Load the reply column of a workbook and strip auto-generated reply headers.

    Parameters
    ----------
    test_data_path : str
        Path of the Excel workbook; only its third column (``usecols=[2]``) is read.
    min_length : int, optional
        Rows whose cleaned text is shorter than this many characters are removed.
        Defaults to 15.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns ``clean_replies`` (position 0, text with the
        auto-reply header removed) and ``replies`` (the raw cell value). The
        original row labels of the surviving rows are kept.
    """
    # Only the reply column is read.
    sheet = pd.read_excel(test_data_path, usecols=[2], header=None)
    # The first row is dropped before cleaning (presumably the column title).
    sheet = sheet.drop([0])
    # Name the single column positionally. ``rename(columns={0: ...})`` silently
    # does nothing when the parser labels the column with something other than 0
    # (e.g. its original position 2), which may happen depending on the pandas
    # version.
    sheet.columns = ['replies']

    # Remove the auto-generated "reply to post #N" header. A raw string keeps the
    # pattern identical while avoiding the invalid ``\s`` escape in a plain literal.
    auto_reply = re.compile(r'(\s)*回复\s[0-9]*(\s)*#(\s)*的帖子(\n)*')
    # Missing cells become empty text (and are filtered out below); any non-string
    # cell is converted instead of raising.
    raw_text = ['' if pd.isna(value) else str(value) for value in sheet['replies']]
    clean_replies = [auto_reply.sub('', reply) for reply in raw_text]
    sheet.insert(0, 'clean_replies', clean_replies)

    # Keep only posts with at least ``min_length`` characters; one boolean mask
    # replaces the row-by-row ``drop`` that copied the frame on every removal.
    long_enough = np.array([len(reply) >= min_length for reply in clean_replies], dtype=bool)
    return sheet.loc[long_enough].copy()

In [5]:
def predict_sentiment(sheet, summary_threshold=30):
    """Compute a SnowNLP sentiment score for every text in the first column.

    Parameters
    ----------
    sheet : pandas.DataFrame
        Frame whose first column (by position) holds the texts to score.
    summary_threshold : int, optional
        Texts longer than this many characters are first reduced to a
        one-sentence SnowNLP summary, and the summary is scored instead.
        Defaults to 30.

    Returns
    -------
    list
        One ``SnowNLP(...).sentiments`` value per row, in row order.
    """
    if sheet.empty:
        return []

    sentiment = []
    # ``iloc`` selects the first column explicitly. ``row[0]`` inside ``iterrows``
    # depended on pandas' deprecated integer-key positional fallback and raises
    # KeyError as soon as any column label is itself an integer.
    for content in sheet.iloc[:, 0]:
        text = SnowNLP(content)
        if len(content) > summary_threshold:
            # Long posts are scored on their single-sentence summary.
            summary = "".join(text.summary(1))
            text = SnowNLP(summary)
        sentiment.append(text.sentiments)
    return sentiment

In [6]:
def extract_keywords(sheet, limit=2):
    """Extract SnowNLP keywords for every text in the first column.

    Parameters
    ----------
    sheet : pandas.DataFrame
        Frame whose first column (by position) holds the texts to analyse.
    limit : int, optional
        Number of keywords requested from SnowNLP per text. Defaults to 2.

    Returns
    -------
    list of str
        One comma-joined keyword string per row, in row order.
    """
    if sheet.empty:
        return []

    # ``iloc`` selects the first column explicitly; ``row[0]`` inside ``iterrows``
    # depended on pandas' deprecated integer-key positional fallback.
    return [
        ",".join(SnowNLP(content).keywords(limit))
        for content in sheet.iloc[:, 0]
    ]

In [7]:
def write(keywords, sentiment, test_data_path, sheet=None):
    """Add the result columns to ``sheet`` and save it as an Excel workbook.

    Parameters
    ----------
    keywords : sequence of str
        One keyword string per row of ``sheet``.
    sentiment : sequence of float
        One sentiment score per row of ``sheet``.
    test_data_path : str
        Path of the input workbook; its file stem names the output file.
    sheet : pandas.DataFrame, optional
        Frame to extend and save. When omitted, the module-level variable
        ``sheet`` is used, which is what the three-argument call relied on.

    Raises
    ------
    NameError
        If ``sheet`` is omitted and no module-level ``sheet`` exists.
    ValueError
        If ``keywords`` or ``sentiment`` does not have one entry per row.

    Notes
    -----
    ``sheet`` is modified in place: ``keywords`` and ``sentiment`` are inserted
    at positions 1 and 2. The workbook is written to
    ``./data/results_v2/<stem>_output_v2.xlsx``; the directory is created if
    it does not exist yet.
    """
    import os

    if sheet is None:
        # Backward-compatible fallback to the notebook-level frame.
        sheet = globals().get('sheet')
        if sheet is None:
            raise NameError("name 'sheet' is not defined; pass the DataFrame via the 'sheet' argument")

    # Validate both sequences first so a mismatch cannot leave the frame with
    # only one of the two result columns inserted.
    if len(keywords) != len(sheet) or len(sentiment) != len(sheet):
        raise ValueError('keywords and sentiment must contain one entry per row of sheet')

    # Insert the result columns; both go in at position 1, so ``keywords``
    # ends up directly before ``sentiment``.
    sheet.insert(1, 'sentiment', np.array(sentiment))
    sheet.insert(1, 'keywords', np.array(keywords))

    # Build the output path from the input file's stem instead of a fixed
    # character slice, which only worked for 4-character names ending in ".xlsx".
    root_path = './data/results_v2/'
    stem = os.path.splitext(os.path.basename(test_data_path))[0]
    back_path = root_path + stem + '_output_v2.xlsx'

    os.makedirs(root_path, exist_ok=True)
    sheet.to_excel(back_path)

In [8]:
# Driver cell: run the whole pipeline once for every workbook in `test_data_paths`.
if __name__ == '__main__':
    for test_data_path in test_data_paths:
        # `sheet` has to stay a notebook-level name: `write` is called without the
        # frame and picks it up from the global namespace.
        sheet = read_and_adjust_sheet(test_data_path)
        sentiment = predict_sentiment(sheet)
        keyword = extract_keywords(sheet)
        # Inserts the result columns into `sheet` and saves it as a new workbook.
        write(keyword,sentiment,test_data_path)


FileNotFoundError: [Errno 2] No such file or directory: './data/test_data/0926.xlsx'

In [None]:
# Display the DataFrame currently bound to the notebook-level name `sheet`.
# NOTE(review): presumably the frame from the last iteration of the driver loop;
# the name is undefined if that cell failed before its first assignment — confirm.
sheet