In [16]:
import os
from collections import defaultdict
import itertools
import re
import pandas as pd
from langconv import Converter
from utils import sepSentence, get_source_comments_data
from my_sentiment import *

#### 为语料库里的句子生成一个字典：key 是句子，value 是该句子的所有标签列表 [(一级, 二级, 情感), (一级, 二级, 情感), ……]

In [2]:
def get_sentence_tags_dict(tags):
    """Map every corpus sentence to the list of tags attached to it.

    Args:
        tags: DataFrame with the four columns "一级" (level-1 tag), "二级"
            (level-2 tag), "去重段落" (comma-separated sentences) and
            "情感倾向" (sentiment).

    Returns:
        dict mapping each non-empty sentence to a list of
        ``(level-1, level-2, sentiment)`` tuples, one per occurrence of the
        sentence in the corpus, in row order.
    """
    # A row without a paragraph cannot be split, so drop it together with the
    # rows that lack a tag level instead of failing on ``NaN.split``.
    labelled = tags.dropna(subset=["一级", "二级", "去重段落"])
    tag_dict = {}
    rows = zip(labelled["一级"], labelled["二级"], labelled["去重段落"], labelled["情感倾向"])
    for lv1, lv2, paragraph, sentiment in rows:
        for sentence in paragraph.split(","):
            # Trailing or doubled commas yield empty fragments; an empty key
            # would attach these tags to every empty sentence looked up later.
            if not sentence:
                continue
            tag_dict.setdefault(sentence, []).append((lv1, lv2, sentiment))
    return tag_dict

### 1.给评论语句打标签

In [3]:
# Build the header (column layout) of the final result table.
def get_tag_table(dataset, tag_dict):
    """Create an all-zero tag table with one row per row of ``dataset``.

    Args:
        dataset: DataFrame whose index is reused as the table index.
        tag_dict: mapping sentence -> list of ``(level-1, level-2, sentiment)``
            tuples, as produced by ``get_sentence_tags_dict``.

    Returns:
        DataFrame filled with 0 whose columns are a sorted two-level
        MultiIndex ``(level-1, level-2)``. A level-2 value of "/" is shown as
        "其他". For level-1 tags other than '精神认同', '人群', '复购' and '首购'
        each level-2 tag is expanded into a "(正面)" and a "(负面)" column.
    """
    no_sentiment_lv1 = ('精神认同', '人群', '复购', '首购')
    # Collect into a set *after* normalising "/" to "其他": a corpus containing
    # both (lv1, "/") and (lv1, "其他") would otherwise yield duplicate columns.
    columns = set()
    for tag_list in tag_dict.values():
        for lv1, lv2, _ in tag_list:
            if lv2 == "/":
                lv2 = "其他"
            if lv1 in no_sentiment_lv1:
                columns.add((lv1, lv2))
            else:
                for sentiment in ("(正面)", "(负面)"):
                    columns.add((lv1, lv2 + sentiment))
    columns = pd.MultiIndex.from_tuples(sorted(columns))
    tag_table = pd.DataFrame(0, columns=columns, index=dataset.index)
    return tag_table

In [5]:
def get_tags(dataset, tag_dict, tag_table,
             no_tag_path="model_result/评论标签结果/notagged_comments_result.xlsx"):
    """Flag, for every comment, the tags of the corpus sentences it contains.

    Each comment in ``dataset["评论内容"]`` is split with ``sepSentence``
    (imported from ``utils``; its splitting rules are not visible here). For
    every resulting sentence that is a key of ``tag_dict``, the matching
    columns of ``tag_table`` are set to 1 on that comment's row.

    Args:
        dataset: DataFrame with at least the columns "评论内容", "品牌" and "单品".
        tag_dict: mapping sentence -> list of ``(level-1, level-2, sentiment)``.
        tag_table: zero-filled table from ``get_tag_table`` whose rows are in
            the same order as ``dataset``; it is updated in place.
        no_tag_path: Excel file receiving the (brand, item, text) of every
            comment in which no sentence matched the corpus.

    Returns:
        The same ``tag_table`` object, with matched cells set to 1.

    Raises:
        KeyError: if a tag in ``tag_dict`` has no column in ``tag_table``.

    Side effects: prints the share of comments without any tag and writes
    ``no_tag_path``, creating its directory if needed.
    """
    no_sentiment_lv1 = ('精神认同', '人群', '复购', '首购')
    col_of = {column: pos for pos, column in enumerate(tag_table.columns)}
    # Fill a private array and write it back once: assigning through
    # ``tag_table.values`` only works while that attribute is a writable view.
    flags = tag_table.to_numpy(copy=True)
    no_tag_comments = []
    # Rows are addressed by position, so the index labels of ``dataset`` do
    # not have to be 0..n-1.
    for row_pos, (_, comment) in enumerate(dataset.iterrows()):
        text = comment["评论内容"]
        is_tagged = False
        for sentence in sepSentence(text):
            if sentence not in tag_dict:
                continue
            for lv1, lv2, sentiment in tag_dict[sentence]:
                # Normalise "/" for every level-1 tag, exactly as
                # get_tag_table does, so the column lookup cannot miss.
                if lv2 == "/":
                    lv2 = "其他"
                if lv1 not in no_sentiment_lv1:
                    lv2 = lv2 + ("(正面)" if sentiment == "positive" else "(负面)")
                flags[row_pos, col_of[(lv1, lv2)]] = 1
            is_tagged = True
        if not is_tagged:
            no_tag_comments.append((comment['品牌'], comment['单品'], text))
    tag_table.iloc[:, :] = flags

    total = len(dataset)
    print("没有标签评论数占比", len(no_tag_comments) / total if total else 0.0)
    # Comments that received no tag at all.
    out_dir = os.path.dirname(no_tag_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # ``encoding`` is not passed: pandas 2.0 removed that to_excel argument.
    pd.DataFrame(no_tag_comments).to_excel(no_tag_path, index=False)
    return tag_table

In [6]:
# Save the tagged results, one brand at a time.
def save_tagged_result_by_brand(result, save_folder):
    """Write the rows of each brand to an Excel file and to a shared HDF5 file.

    For every distinct value of ``result['品牌']`` the matching rows go to
    ``<save_folder>/excel版/tagged_comments_<brand>_result.xlsx`` and to
    ``<save_folder>/tagged_comments_result.h5`` under the key ``brand<i>``,
    where ``i`` is the position of the brand in the order of first appearance.
    Both folders are created if missing. Rows without a brand are skipped.

    Args:
        result: DataFrame with a '品牌' column.
        save_folder: output directory.
    """
    save_folder_excel = os.path.join(save_folder, 'excel版')
    # Creating the nested folder also creates save_folder itself.
    os.makedirs(save_folder_excel, exist_ok=True)
    hdf_path = os.path.join(save_folder, "tagged_comments_result.h5")

    for i, brand in enumerate(result['品牌'].dropna().unique()):
        print(i, brand)
        data = result[result['品牌'] == brand]
        # A brand written as "X/Y" uses the part after the first slash in the
        # file name; non-string brands are used as they are.
        if isinstance(brand, str) and '/' in brand:
            brand = brand.split('/')[1]

        # ``encoding`` is not passed: pandas 2.0 removed that to_excel argument.
        data.to_excel(os.path.join(save_folder_excel, f"tagged_comments_{brand}_result.xlsx"), index=False)
        data.to_hdf(hdf_path, key=f"brand{i}")

In [None]:
# Load the raw comment data from the cached HDF5 copy (used on every run but the first).
dataset = pd.read_hdf('step_data/comments_dataset.h5', key='src')
# # First run only: load the raw comment data from source, then cache it as HDF5.
# dataset = get_source_comments_data(is_first_time=False)
# dataset = dataset.reset_index(drop=True)
# dataset.to_hdf('step_data/comments_dataset.h5', key='src')
# print("原始评价数：", len(dataset))

In [None]:
# Corpus: clustered paragraphs annotated with level-1 and level-2 tags.
# Rows without a paragraph ("去重段落") are discarded straight after loading.
tags = pd.read_excel(r"model_result/语料库/corpus_final_updated.xlsx").dropna(subset=['去重段落'])
# Show which sentiment labels occur in the corpus.
print(tags['情感倾向'].unique())

In [None]:
# Sentence -> list of (level-1, level-2, sentiment) tags taken from the corpus.
tag_dict = get_sentence_tags_dict(tags)
# Zero-filled table: one row per comment, one column per tag.
tag_table = get_tag_table(dataset, tag_dict)
# Set the matched tag cells to 1; this also writes the untagged comments to Excel.
tag_table = get_tags(dataset, tag_dict, tag_table)

In [None]:
# Identifying columns copied from the raw dataset; the generated tag columns follow them.
index_columns = ['品牌', '单品', '链接', 'MSRP', '评论序号', '评论内容']
# The raw data has a '男女' column; this result does not.
# ``axis`` must be passed by keyword: pandas 2.0 rejects it as a positional argument.
result = pd.concat([dataset[index_columns], tag_table], axis=1)
tag_columns = set(result.columns) - set(index_columns)        # all tag columns

In [None]:
# Export the tagged comments: one Excel file per brand plus a shared HDF5 file.
save_tagged_result_by_brand(result, save_folder='model_result/评论标签结果')

In [None]:
# Tag coverage of the 燃数 labels already present in the raw dataset, leaving out
# the '男女' column (75% according to the original note).
# set('男女') splits the string into {'男', '女'} and would keep the '男女' column,
# so the names are listed explicitly; '男' and '女' stay excluded as before.
# 'is_tagged' is excluded too, so re-running this cell does not count its own output.
rs_tag_columns = set(dataset.columns) - set(index_columns) - {'男女', '男', '女', 'is_tagged'}
# pandas 2.0 rejects a set as a column indexer, hence the list().
dataset['is_tagged'] = dataset[list(rs_tag_columns)].sum(axis=1) >= 1
dataset['is_tagged'].sum() / len(dataset)

In [None]:
# Tag coverage of my own result: share of comments that received at least one tag.
# tag_columns is a set, which pandas 2.0 rejects as a column indexer, hence the list().
result['is_tagged'] = result[list(tag_columns)].sum(axis=1) >= 1
result['is_tagged'].sum() / len(result)

In [23]:
# Preview the final column-name format, taking '性能' as the example.
# The check is a substring test on the first element of each column label, so
# level-1 names that merely contain '性能' are listed as well.
a = list(filter(lambda column: '性能' in column[0], result.columns))
a

[('性能', '其他(正面)'),
 ('性能', '其他(负面)'),
 ('性能', '回弹(正面)'),
 ('性能', '回弹(负面)'),
 ('性能', '提速(正面)'),
 ('性能', '提速(负面)'),
 ('性能', '缓震(正面)'),
 ('性能', '缓震(负面)'),
 ('性能', '轻量(正面)'),
 ('性能', '轻量(负面)'),
 ('运动性能', '回弹(正面)'),
 ('运动性能', '回弹(负面)'),
 ('运动性能', '提速(正面)'),
 ('运动性能', '提速(负面)')]

In [26]:
# Display every column label of the result table.
result.columns

Index([               '品牌',                '单品',                '链接',
                    'MSRP',              '评论序号',              '评论内容',
            ('人群', '兄弟姐妹'),      ('人群', '其他长辈'),    ('人群', '女朋友/老婆'),
              ('人群', '奶奶'),
       ...
        ('适用场景', '跑跳(负面)'),  ('适用场景', '跳绳(正面)'),  ('适用场景', '跳绳(负面)'),
        ('适用场景', '跳舞(正面)'),  ('适用场景', '跳舞(负面)'), ('适用场景', '马拉松(正面)'),
       ('适用场景', '马拉松(负面)'),      ('首购', '其他首购'),      ('首购', '品牌首购'),
               'is_tagged'],
      dtype='object', length=145)