In [2]:
from collections import defaultdict
import itertools
import pandas as pd

In [85]:
# Load the manually reviewed cluster table and the two keyword sheets.
# Paths use forward slashes throughout: they work on Windows as well as on
# Linux/macOS, whereas backslash-separated paths only resolve on Windows.
origin_cluster_table = pd.read_excel("step_data/cluster_result_combine_all_sentiments_no_dup_tag4.xlsx")
keywords_table = pd.read_excel("model_input/keywords/keywords_sentimental_words.xlsx", sheet_name='情感分析关键词')
keywords_table_add = pd.read_excel("model_input/keywords/keywords_sentimental_words.xlsx", sheet_name='关键词补充')

In [86]:
# Promote the `new_sentiment` column to `sentiment` and drop the old name.
# The guard makes the cell safe to re-run: once the column has been moved,
# a second execution is a no-op instead of raising KeyError.
if 'new_sentiment' in origin_cluster_table.columns:
    origin_cluster_table['sentiment'] = origin_cluster_table.pop('new_sentiment')

In [87]:
# Stack the supplementary keywords under the main sheet, keep the first
# occurrence of every (keyword, first-level category) pair, and narrow the
# table to the three columns listed below.
keywords_table = (
    pd.concat([keywords_table, keywords_table_add])
    .drop_duplicates(subset=['关键字', '一级'])
    [['关键字', '一级', '二级']]
)

In [88]:
# Preview the first rows of the merged keyword table.
keywords_table.head()

Unnamed: 0,关键字,一级,二级
0,舒服,舒适,/
1,柔软,舒适,脚感软
2,缓震,性能,缓震
3,透气,舒适,透气
4,减震,性能,缓震


In [11]:
len(keywords_table["关键字"].str[0].unique())  # number of distinct first characters among the keywords

0      舒
1      柔
2      缓
3      透
4      减
      ..
154    宽
155    窄
157    贴
158    容
159    轻
Name: 关键字, Length: 500, dtype: object

In [77]:
class KeywordIndex:
    """First-character lookup index over the ``关键字`` column of a keyword table.

    Keywords are bucketed by their first character so that ``find`` only has
    to substring-test keywords whose first character occurs in the sentence.

    Attributes:
        table: the keyword table passed to the constructor.
        index: maps a first character to the positional row numbers
            (``iloc`` positions) of the keywords starting with it.
        index_keys: set of all first characters present in ``index``.
    """

    def __init__(self, table):
        self.table = table
        # Plain list of keywords by row position, so candidates can be tested
        # without materialising a Series for every non-matching row.
        self._keywords = list(table["关键字"])
        self.index = defaultdict(list)
        for i, ch in enumerate(self.table["关键字"].str[0]):
            self.index[ch].append(i)  # i is the positional row number of the keyword
        self.index_keys = set(self.index)

    def find(self, sentence):
        """Return the table rows whose keyword occurs in ``sentence``.

        Args:
            sentence: the text to scan.

        Returns:
            A list of rows (``pandas.Series``), ordered by row position in the
            table. Sorting the candidates makes the order independent of set
            iteration order, which varies between interpreter sessions.
        """
        possible_ch = set(sentence) & self.index_keys
        # Positional row numbers of every keyword that could occur in the sentence.
        candidates = sorted(i for ch in possible_ch for i in self.index[ch])
        return [self.table.iloc[i] for i in candidates if self._keywords[i] in sentence]

class KeywordFinder:
    """Keyword lookup grouped by first-level category (``一级``).

    ``tables`` maps a sentiment key (``"all"``, ``"pos"``, ``"neg"``) to a dict
    of ``{一级 value: KeywordIndex}``. Without ``sentiment_aware`` all three
    keys share the same unfiltered dict; with it, ``"pos"`` excludes keywords
    whose ``情感倾向`` is ``'负面'`` and ``"neg"`` excludes those that are
    ``'正面'`` (the table must then contain a ``情感倾向`` column).
    """

    def __init__(self, table, sentiment_aware):
        basic_table = self.build(table, None)
        self.tables = {
            "all": basic_table,
            "pos": basic_table,
            "neg": basic_table,
        }
        if sentiment_aware:
            self.tables["pos"] = self.build(table, "情感倾向 != '负面'")
            self.tables["neg"] = self.build(table, "情感倾向 != '正面'")

    def build(self, table, query):
        """Build ``{一级 value: KeywordIndex}`` from ``table``.

        Args:
            table: keyword table with at least ``关键字`` and ``一级`` columns.
            query: optional ``DataFrame.query`` expression applied before
                grouping; ``None`` means no filtering.
        """
        if query is not None:
            table = table.query(query)
        # Group the (possibly filtered) argument, not a notebook-level global,
        # so the query actually takes effect.
        return {key: KeywordIndex(group) for key, group in table.groupby("一级")}

    def find(self, sentence, spaces, sentiment="all"):
        """Collect keyword rows found in ``sentence`` within the given categories.

        Args:
            sentence: the text to scan.
            spaces: iterable of ``一级`` category names to search in.
            sentiment: which table to use: ``"all"``, ``"pos"`` or ``"neg"``.

        Returns:
            The concatenated results of each category's ``KeywordIndex.find``.

        Raises:
            KeyError: if a category does not exist in the unfiltered table.
        """
        result = []
        table = self.tables[sentiment]
        known = self.tables["all"]
        for space in spaces:
            index = table.get(space)
            if index is None:
                if space not in known:
                    raise KeyError(space)
                # The category exists but the sentiment filter removed all of
                # its keywords: nothing can match there.
                continue
            result.extend(index.find(sentence))
        return result

In [78]:
class Retagger:
    """Re-tag reviewed cluster rows according to their ``处理级别`` and ``备注`` columns.

    Calling the instance on a DataFrame processes it row by row with
    ``retag_row`` and returns the surviving / regenerated rows as a new
    DataFrame.
    """

    # Columns copied from the source row into every row produced by ``search``.
    columns = ["sentiment", "cluster_id", "cluster_size", "center_sentence", "tag_1", "中心语断句", "关键字"]

    def __init__(self, keywords, topics, sentiment_aware=False):
        """
        Args:
            keywords: keyword table handed to ``KeywordFinder``.
            topics: stored on the instance; not read by any method of this class.
            sentiment_aware: forwarded to ``KeywordFinder``.
        """
        self.topics = topics
        self.keyword_finder = KeywordFinder(keywords, sentiment_aware)

    def __call__(self, df):
        """Retag every row of ``df`` and return the results as a DataFrame."""
        result = []
        for _, row in df.iterrows():
            result.extend(self.retag_row(row))
        return pd.DataFrame(result)

    @staticmethod
    def _split_tokens(text):
        """Split on single spaces and drop empty tokens.

        Empty tokens appear with leading, trailing or doubled spaces. They
        must not survive: ``"" in sentence`` is always true, and ``""`` is not
        a valid category name.
        """
        return [token for token in text.split(" ") if token]

    def retag_row(self, row):
        """Return the list of output rows for one input row.

        * ``一级`` in the excluded categories -> dropped.
        * ``处理级别`` <= 1 -> kept unchanged.
        * ``处理级别`` <= 3 -> ``备注`` lists the categories to search.
        * ``处理级别`` == 4 -> ``备注`` holds a command, see ``command``.
        * anything else -> dropped.
        """
        if row["一级"] in ["适用场景", "人群", "复购", '精神认同']:
            return []
        level = row["处理级别"]
        if level <= 1:
            # Row is fine as it is (only trailing sentences were removed by hand).
            return [row]
        if level <= 3:
            # The classification inside the first-level category needs redoing.
            return self.search(row, self._split_tokens(row["备注"]))
        if level == 4:
            return self.command(row)
        return []

    def search(self, row, space):
        """Regroup the row's sentences by the keywords found in them.

        Args:
            row: source row; ``去重段落`` holds comma-separated sentences.
            space: list of ``一级`` categories to look for keywords in.

        Returns:
            One row per (``一级``, ``二级``) pair hit by at least one sentence,
            carrying ``self.columns`` from the source row plus the matching
            sentences and their count.
        """
        if "positive" in row["sentiment"]:
            sentiment = "pos"
        elif "negative" in row["sentiment"]:
            sentiment = "neg"
        else:
            sentiment = "all"
        cache = defaultdict(list)
        for sentence in row["去重段落"].split(","):
            # Several keywords can map to the same (一级, 二级) pair; record
            # the sentence only once per pair so the count is not inflated.
            seen = set()
            for keyword_row in self.keyword_finder.find(sentence, space, sentiment):
                key = (keyword_row["一级"], keyword_row["二级"])
                if key in seen:
                    continue
                seen.add(key)
                cache[key].append(sentence)
        result = []
        for (lv1, lv2), sentences in cache.items():
            result.append(pd.concat([
                row[self.columns],
                pd.Series({"一级": lv1, "二级": lv2, "去重段落": ",".join(sentences), "去重段落数": len(sentences)})
            ]))
        return result

    def command(self, row):
        """Execute the command stored in ``备注``.

        The first two characters are the command (``去除`` remove, ``保留``
        keep, ``补充`` add); the remainder is a space-separated argument list.

        Raises:
            ValueError: for any other command.
        """
        cmd = row["备注"][:2]
        topic = row["备注"][2:]
        topics = self._split_tokens(topic)
        if cmd == "去除":
            return self._do_remove(topics, row)
        if cmd == "保留":
            return self._do_filter(topics, row)
        if cmd == "补充":
            return self._do_add(topics, row)
        raise ValueError(cmd, topic, row)

    @staticmethod
    def _word_shows(sentence, words):
        """Return True if any of ``words`` is a substring of ``sentence``."""
        return any(word in sentence for word in words)

    def _select_sentences(self, words, row, keep_matching):
        """Copy ``row`` keeping the sentences whose match status equals ``keep_matching``."""
        sentences = [
            sentence
            for sentence in row["去重段落"].split(",")
            if self._word_shows(sentence, words) == keep_matching
        ]
        row = row.copy()
        row["去重段落"] = ",".join(sentences)
        row["去重段落数"] = len(sentences)
        return [row]

    def _do_filter(self, topic, row):
        """``保留``: keep only the sentences that contain one of the words in ``topic``."""
        return self._select_sentences(topic, row, True)

    def _do_remove(self, topic, row):
        """``去除``: drop every sentence that contains one of the words in ``topic``."""
        return self._select_sentences(topic, row, False)

    def _do_add(self, topics, row):
        """``补充``: keep the row and append rows found by searching the categories in ``topics``."""
        result = self.search(row, topics)
        result.insert(0, row)
        return result

In [79]:
# Each topic name maps to a one-element word list containing itself.
# This mapping is not used by the retagging logic for now.
topics = {
    phrase: [phrase]
    for phrase in ("不像正品", "不透气", "不磨脚", "有差异有差别难看")
}
retagger = Retagger(keywords_table, topics)

In [104]:
# Run the retagger over every row of the cluster table; the result is a new DataFrame.
normal_clusters = retagger(origin_cluster_table)

In [90]:
# Persist the retagged clusters as an intermediate result (index column omitted).
normal_clusters.to_excel("step_data/normal_cluster_cleaned.xlsx", index=False)

### 合并所有类的精调结果

In [113]:
# Separately cleaned special clusters: usage scenario, audience, brand identification, repurchase.
special_clusters = pd.read_csv('step_data/part4_clean_special_clusters_all.csv') # scenario / audience / brand identification / repurchase
# Column names are assigned by position, so this assumes the CSV has exactly
# these five columns in this order (TODO confirm against the file).
special_clusters.columns = ['一级', '二级', '去重段落数', '去重段落', '情感倾向']

In [110]:
# Keep the five corpus columns, order rows by category, and rename
# `sentiment` to match the column naming of the special clusters.
normal_clusters1 = (
    normal_clusters[['一级', '二级', '去重段落数', '去重段落', 'sentiment']]
    .sort_values(by=['一级', '二级'])
    .reset_index(drop=True)
    .rename(columns={'sentiment': '情感倾向'})
)

In [116]:
# Stack special and normal clusters into the final corpus and export it in both formats.
ttl_clusters = pd.concat([special_clusters, normal_clusters1])
ttl_clusters.to_csv("model_result/语料库/corpus_final.csv", index=False, encoding='utf-8')
# `to_excel` no longer accepts `encoding`: the keyword was unused by the xlsx
# writers, was deprecated in pandas 1.5 and removed in 2.0 (TypeError).
ttl_clusters.to_excel("model_result/语料库/corpus_final.xlsx", index=False)