In [1]:
import pandas as pd
import numpy as np
import jieba
from snownlp import sentiment, SnowNLP
import re
import nlpir
from langconv import Converter
import glob
from utils import get_source_comments_data, read_list, sepSentences, GetWordDict, filter_cutting_sentence
from my_sentiment import cal_score, keyword_tag
from my_sentence_cut import cut_sentences
import itertools
import tqdm

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\rshe11\AppData\Local\Temp\jieba.cache
Loading model cost 0.607 seconds.
Prefix dict has been built successfully.


In [2]:
# snownlp给每条评论计算情感分数
def classify_sentiment(unique_paragraphs):
    positive_paragraphs = []
    negative_paragraphs = []
    neutral_paragraphs = []
    paragraph_sentiment_list = []
    for par in tqdm.tqdm(unique_paragraphs):
        s_par = SnowNLP(par)
        if s_par.sentiments > 0.6:
            positive_paragraphs.append(par)
            s = 'positive'
        elif s_par.sentiments < 0.3:
            negative_paragraphs.append(par)
            s = 'negative'
        else:
            neutral_paragraphs.append(par)
            s = 'neutral'
        paragraph_sentiment_list.append((par, s_par.sentiments, s))
    return positive_paragraphs, negative_paragraphs, neutral_paragraphs, paragraph_sentiment_list

### main

#### 读取原始评论

In [3]:
dataset = get_source_comments_data(is_first_time=False)
print("原始评价数：", len(dataset))
comments_list = dataset["评论内容"].dropna().tolist()
print("原始不为空评论数:", len(comments_list))

原始评价数： 911098
原始不为空评论数: 911057


#### Step1 给每条评论断句

In [4]:
# 简单断句--标点符号转折语断句
sentences = sepSentences(comments_list)
print("断句后的段落数:", len(sentences))
unique_sentences = pd.Series(sentences).drop_duplicates().tolist()
print("断句后的非重复段落数：", len(unique_sentences))

断句后的段落数: 3162954
断句后的非重复段落数： 1189184


In [None]:
# 用断句模型断句
# 1.挑出来一部分符合条件的断句
sentence_to_cut, remain = filter_cutting_sentence(unique_sentences)
pd.Series(remain).to_hdf(r"step_data\sentence_cut\sentence_cut.hdf", key='remain')
pd.Series(sentence_to_cut).to_hdf(r"step_data\sentence_cut\sentence_cut.hdf", key='sentence_to_cut')
# 2.调用断句模型断句
model_path = "断句/nike_comment-master/checkpoint/wwm_conv1d-all-000_modeling_comments.pkl"
bert_name = 'hfl/chinese-bert-wwm-ext'
cut_result = cut_sentences(sentence_to_cut, model_path, bert_name)
cut_result.to_hdf(r"step_data\sentence_cut\sentence_cut.hdf", key='cut_result')
sentence_to_cut = itertools.chain(*cut_result['cut_result'].tolist())
sentence_to_cut = [sentence for sentence in sentence_to_cut if (re.search('[\u4e00-\u9fa5]', sentence) != None) and (len(sentence) > 1)]
remain.extend(sentence_to_cut)         

# 首次运行并保存结果 时间很长
positive, negative, neutral, paragraph_sentiment_list = classify_sentiment(remain)
pd.DataFrame(paragraph_sentiment_list, columns=["text",'senti_score','sentiment']).to_hdf("step_data/snownlp_sentiments_result.hdf", "main", index=False)

#### Step2 snownlp情感判断

In [None]:
# positive, negative, neutral, paragraph_sentiment_list = classify_sentiment(unique_sentences)
# pd.DataFrame(paragraph_sentiment_list, columns=["text",'senti_score','sentiment']).to_hdf("step_data/snownlp_sentiments_result.hdf", "main", index=False)

In [12]:
# 读取结果
paragraph_sentiment_list = pd.read_hdf("step_data/snownlp_sentiments_result.hdf", "main") # snownlp的情感分析结果
positive = paragraph_sentiment_list.query("sentiment=='positive'")
negative = paragraph_sentiment_list.query("sentiment=='negative'")
neutral = paragraph_sentiment_list.query("sentiment=='neutral'")

#### Step3 修正情感判断

In [None]:
# 自己设计的情感分析算法给语句分正负面中性
neg_result, neg_nokeywords = cal_score(negative["text"].to_list(), orisen_tag="负面")
print("中性", len(neg_result))
pos_result, pos_nokeywords = cal_score(positive["text"].to_list(), orisen_tag="正面")
print('正面', len(pos_result))
neu_result, neu_nokeywords = cal_score(neutral["text"].to_list(), orisen_tag="中性")
print('负面', len(neu_result))
result = pd.concat([neg_result, neu_result, pos_result, neg_nokeywords, pos_nokeywords, pos_nokeywords])

In [None]:
# 把复购，人群，场景，精神认同的先筛选出来不进入到聚类了
psbe = result[result['tag_1'].isin(['复购', '人群', '适用场景', '精神认同'])]
psbe.to_excel(r"model_result/sentiment/rebuy_people_situation_endorsement.xlsx", index=False)

In [61]:
normal_result = result[~result['tag_1'].isin(['复购', '人群', '适用场景', '精神认同'])]
my_sentiment_result = normal_result[normal_result["keyword"].notnull()]  
nokeyword_result = normal_result[normal_result["keyword"].isnull()]      # 没有keyword的句子，包括只有comment的

In [65]:
# # 检查自主开发的情感分析和snownlp结果不一样的
# mismatch=nokeyword_result[(nokeyword_result['sentiment'] != nokeyword_result['orisen_tag']) & nokeyword_result['sentiment'].notnull()]
# mismatch.to_excel("check/mismatch_sentiment_v3.xlsx", index=False)

# 只有comment的句子，my sentiment是中性和负面的还用自己的，正面的用snownlp的,但是
# nokeyword_result['sentiment'] = np.where((nokeyword_result['sentiment'] == '正面') & (nokeyword_result['orisen_tag'].isin(['负面','中性'])), '中性', nokeyword_result['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nokeyword_result['sentiment'] = np.where((nokeyword_result['sentiment'] == '正面') & (nokeyword_result['orisen_tag'].isin(['负面','中性'])), '中性', nokeyword_result['sentiment'])


In [66]:
# 保存结果
my_sentiment_result.to_hdf("model_result/sentiment/comments_sentiment_result.hdf", "keyword")  # 有分数的为有keyword或者no_keyword里有情感词的即可以计算分数的
my_sentiment_result.to_csv(r"model_result/sentiment/comments_sentiment_keywords.csv", index=False, encoding='utf-8')
nokeyword_result.to_hdf("model_result/sentiment/comments_sentiment_result.hdf", "no_keyword")
nokeyword_result.to_csv(r"model_result/sentiment/comments_sentiment_no_keywords.csv", index=False, encoding='utf-8')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['sentence', 'segmentation', 'orisen_tag', 'sentiment', 'keyword',
       'tag_1', 'tag_2'],
      dtype='object')]

  pytables.to_hdf(


#### Step4 每类情感生成一个文件，为后续聚类做准备

In [67]:
def filter_useless_neutral_sentences(my_sentiment_result):
    """
        如果一句话已经有正面或者负面，那就没必要再在中性里出现了
    """
    sentence_sentiment_num = my_sentiment_result.groupby(['sentence','sentiment'])['keyword'].count()
    sentence_sentiment_num = sentence_sentiment_num.unstack()
    return set(sentence_sentiment_num[((sentence_sentiment_num['正面'] > 0)|(sentence_sentiment_num['负面'] > 0)) & (sentence_sentiment_num['中性'] > 0)].index)

In [68]:
my_sentiment_result = pd.read_hdf("model_result/sentiment/comments_sentiment_result.hdf", "keyword")
sentiment_en = {'正面': 'positive', '负面': 'negative', '中性':'neutral'}
for sentiment_type in ['正面', '负面', '中性']: 
    sentiments_data = my_sentiment_result[my_sentiment_result["sentiment"] == sentiment_type]
    sentiments_data = sentiments_data["sentence"].unique().tolist()
    print(f"{sentiment_type}的句子数量：{len(sentiments_data)}")
    if sentiment_type == '中性':
        remove_sentences = filter_useless_neutral_sentences(my_sentiment_result)
        sentiments_data = [sentence for sentence in set(sentiments_data) if sentence not in remove_sentences]
    simplified_sentences = [sentence for sentence in sentiments_data if ((len(sentence) <= 50) and (len(sentence) >= 2))]
    with open(f"model_result/sentiment/{sentiment_en[sentiment_type]}_sentence_list.txt", 'w', encoding='utf-8') as f:
        f.write("\n".join(simplified_sentences))
        
nokeyword_result = pd.read_hdf("model_result/sentiment/comments_sentiment_result.hdf", "no_keyword")
print("no keyword")
for sentiment_type in ['正面', '负面', '中性']:
    no_keyword_data = nokeyword_result.query("sentiment == @sentiment_type")
    others = no_keyword_data["sentence"].unique().tolist()
    print(sentiment_type, len(others))
    simplified_sentences = [sentence for sentence in others if ((len(sentence) <= 50) and (len(sentence) >= 2))]
    with open(f"model_result/sentiment/no_keyword_{sentiment_en[sentiment_type]}_sentence_list.txt", 'w', encoding='utf-8') as f:
        f.write("\n".join(simplified_sentences))

正面的句子数量：212047
负面的句子数量：104587
中性的句子数量：80929
no keyword
正面 86210
负面 111211
中性 86566


#####################         end         ##########################
再往下不用运行

In [21]:
# 以下不用运行
print(my_sentiment_result[my_sentiment_result['sentence'].str.len() <= 20].shape)
print(my_sentiment_result[my_sentiment_result['sentence'].str.len() > 20].shape)

(681968, 8)
(23592, 8)


my_sentiment_result[my_sentiment_result['sentence'].str.len() > 20]

In [16]:
with open('long_sentences.txt', 'w', encoding='utf-8') as f:
    for sentence in long_sentence.index.to_list():
        f.write(str(sentence))
        f.write('\n')
f.close()

In [None]:
sentence_sentiment_num2[(sentence_sentiment_num2['正面'] > 0) & (sentence_sentiment_num2['中性'] > 0)]

In [34]:
data = pd.read_hdf("from_Doris/comments_sentiment_result_with_tag_ABCDEF.hdf")