In [1]:
import pandas as pd
import numpy as np
import jieba
import re
import itertools
from collections import defaultdict

In [2]:
def read_list(path):
    """Read a UTF-8 text file and return its distinct lines.

    Parameters
    ----------
    path : str or path-like
        File containing one entry per line.

    Returns
    -------
    numpy.ndarray
        The whitespace-stripped lines with duplicates removed, in the sorted
        order produced by ``np.unique``.
    """
    # The context manager closes the handle as soon as the lines are read;
    # the previous bare open(...).readlines() left that to the garbage collector.
    with open(path, encoding='UTF-8') as handle:
        words = [line.strip() for line in handle]
    return np.unique(words, axis=0)  # drop duplicate entries

In [41]:
# Forward slashes work on Windows as well as POSIX (and match cluster_input_file);
# the previous backslash-separated raw string only resolved on Windows.
stopwords_path = 'model_input/keywords/cn_stopwords.txt'
stopwords = read_list(stopwords_path)
# These short words are looked up as tokens by RebuyCleaner (e.g. in neu_begin /
# opp_begin and the "previous word" checks), so they must not be filtered out
# as stop words.
remove = np.array(['好','不','不是','可','如果','不知','有','不如','一般','再','可以','还要','要','便于','也','又','还','比','和'])
stopwords = np.setdiff1d(stopwords, remove)

In [42]:
# Workbook with the keyword lists (sheets '人群', '品牌', '复购', '场景' are read below).
# Forward slashes keep the path portable; the backslash form only worked on Windows.
keyword_input_file = 'model_input/keywords/keywords_sentimental_words.xlsx'
# Workbook with the clustered sentences (sheets '精神认同', '人群', '首购复购', '适用场景').
cluster_input_file = r'场景人群代言复购语料库/场景人群复购精神认同语料库4.xlsx'

# 一、精神认同

### 1.1 明星代言

In [47]:
def clean_endorsement(df, column_name='段落'):
    """Clean the celebrity-endorsement clusters.

    Each cell of ``df[column_name]`` is a comma-joined cluster of sentences.
    A sentence stays in its cluster when it contains none of the flagged
    terms, or when it mentions '代言' / '粉丝'. Flagged sentences that mention
    the livestream terms are pooled into one '李佳琦' row; the remaining
    flagged sentences are pooled into one 'useless' row.

    Returns a DataFrame with columns tag1, tag2, 评论数, 段落: one row per
    input cluster followed by the '李佳琦' row and the 'useless' row.
    """
    # Terms that mark a sentence as unrelated to celebrity endorsement
    noise_terms = ['双十一','视频','二代','硝化细菌','正品店铺','态度好','广场舞','跑步','耐心',
       '发微信','独具慧眼','笔名病魔改名','良心商家','值得回购','淘宝首页','中底科技革命','纳爱斯集团']
    # Terms that identify Li Jiaqi livestream sentences
    livestream_terms = ['直播间','佳琦']
    flagged_terms = noise_terms + livestream_terms

    discarded = []      # flagged sentences without a livestream term
    livestream = []     # Li Jiaqi livestream sentences
    records = []        # one output row per cluster

    for paragraph in df[column_name]:
        kept = []
        for piece in paragraph.split(','):
            is_flagged = any(term in piece for term in flagged_terms)
            if not is_flagged or '代言' in piece or '粉丝' in piece:
                kept.append(piece)
            elif any(term in piece for term in livestream_terms):
                livestream.append(piece)
            else:
                discarded.append(piece)
        records.append(['精神认同', '明星代言', len(kept), ",".join(kept)])

    records.append(['精神认同', '李佳琦', len(livestream), ",".join(livestream)])
    records.append(['精神认同', 'useless', len(discarded), ",".join(discarded)])
    return pd.DataFrame(records, columns=["tag1", 'tag2', '评论数', '段落'])

In [48]:
# Celebrity-endorsement cleaning: read the '精神认同' sheet, keep the rows whose
# tag_2 is '明星代言', and run them through clean_endorsement.
data = pd.read_excel(cluster_input_file, sheet_name='精神认同')
endorsement = data[data['tag_2'] == '明星代言']
endorsement_result = clean_endorsement(endorsement, column_name='段落')

In [49]:
# Show the cleaned endorsement clusters (one row per cluster plus the pooled '李佳琦' and 'useless' rows)
endorsement_result

Unnamed: 0,tag1,tag2,评论数,段落
0,精神认同,明星代言,465,"因为黄景瑜代言fila果断下单, 耐克宣称王一博为官方合作伙伴, 跟宋亚轩穿同款好开心, 小..."
1,精神认同,明星代言,604,"代言, 时代少年团代言了, zici时代少年团的代言, 自从俊代言以后, 这次宣了俊代言, ..."
2,精神认同,明星代言,549,"主持龚俊, 时代少年团宋亚轩, 永远期待宋亚轩, 永远支持宋亚轩, 也会一直支持龚俊, 本来..."
3,精神认同,明星代言,611,"听到消息代言真的是又激动又开心, 和代言人一起加油哦, 顺便说一下代言人真的很帅哈, 和代言..."
4,精神认同,明星代言,583,"博君一肖百香果为王一博, 龚俊, 博君一肖为王一博而来, 顺便说一句我爱龚俊, 博君一肖百香..."
5,精神认同,明星代言,569,"支持王一博代言颜色真滴好看, 小飞侠支持肖战代言鞋子超好看, 我家哥很喜欢百香果支持新代言人..."
6,精神认同,明星代言,0,
7,精神认同,明星代言,519,"肖战代言以后买了几件李宁产品,王一博代言安踏后线上线下买了不少安踏的产品,因为肖战代言开始入..."
8,精神认同,李佳琦,138,"佳琦直播间优惠力度太大了, 之前李佳琦直播间609入了一双小码数的, 佳琦直播间下单, 佳琦..."
9,精神认同,useless,4,"因为态极科技的成功引发了国内运动品牌全行业的中底科技革命,因为态极科技的成功引发了国内运动..."


### 1.2 国货

In [50]:
def clean_domestic(df, column_name='段落'):
    """Clean the domestic-brand ('国货') clusters.

    Each cell of ``df[column_name]`` is a comma-joined cluster of sentences.
    A sentence is kept when its lower-cased text mentions at least one
    domestic-brand term and none of the negative terms; every other sentence
    is pooled into a single trailing 'useless' row.

    Returns a DataFrame with columns tag1, tag2, 评论数, 段落.
    """
    # Search terms for domestic brands / "made in China" wording
    origin_terms = ['anta','do-win','erke','lining','peak','xtep','安踏','匹克','鸿星尔克','李宁','多威','特步','361','乔丹','dowin',
       '国货','国牌','国产','国内品牌','国民品牌','国有品牌','我国的品牌','国潮','国内的','中国','我们国家','自己国家',
       '自主品牌','民族品牌','国鞋','本土品牌','自己的品牌','国内制造','国家的品牌','国家产品','尔克','国乔','中乔','奇弹','太极','态极']
    # Terms that disqualify an otherwise matching sentence
    negative_terms = ['差','不敢恭维','不是','武汉加油']

    def mentions(text, vocabulary):
        """True when `text` contains at least one entry of `vocabulary`."""
        return any(term in text for term in vocabulary)

    rejected = []   # sentences pooled into the 'useless' row
    records = []    # one output row per cluster

    for paragraph in df[column_name]:
        accepted = []
        for piece in paragraph.split(','):
            lowered = piece.lower()
            if mentions(lowered, origin_terms) and not mentions(lowered, negative_terms):
                accepted.append(piece)
            else:
                rejected.append(piece)
        records.append(['精神认同', '国货', len(accepted), ",".join(accepted)])

    records.append(['精神认同', 'useless', len(rejected), ",".join(rejected)])
    return pd.DataFrame(records, columns=["tag1", 'tag2', '评论数', '段落'])

In [51]:
# Domestic-brand cleaning: read the '精神认同' sheet again, keep the rows whose
# tag_2 is '国货', and run them through clean_domestic.
data = pd.read_excel(cluster_input_file, sheet_name='精神认同')
domestic = data[data['tag_2'] == '国货']
domestic_result = clean_domestic(domestic, column_name='段落')

In [52]:
# Show the cleaned domestic-brand clusters and the pooled 'useless' row
domestic_result

Unnamed: 0,tag1,tag2,评论数,段落
0,精神认同,国货,382,"国货没毛病,国产无敌和30万的丰田对比30万的奔驰一个道理,支持国货结果国货一点不给力,支持..."
1,精神认同,国货,378,"祝一博和安踏都越来越好,想信特步越来越好,摩托姐祝安踏越来越好,特步加油越来越好,以后都会首..."
2,精神认同,国货,822,"国产品牌确实越做越好了,国产品牌真的是越做越好了,国产品牌越做越好了,国产品牌真的越做越好了..."
3,精神认同,国货,137,"希望鸿星尔克越做越好支持国货,国牌会越来越好的,希望奇弹下一代能做的更好,最后希望国货能做得..."
4,精神认同,useless,1327,"比同等价位品牌都好,有对比才有差距,首先盒子就非常的高级,没想到升级款全方位不如,当年县城还..."


# 二、人群

In [53]:
def clean_people(df, key_dict, column_name='段落'):
    """Assign audience ('人群') sentences to person tags by keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` cells are comma-joined clusters of sentences.
    key_dict : dict
        Maps a tag_2 name to the list of keywords that select it.
    column_name : str
        Column holding the clusters.

    Returns
    -------
    pandas.DataFrame
        Columns tag1, tag2, 评论数, 段落: one row per tag that received at least
        one sentence (in order of first hit), followed by a 'useless' row.

    A sentence may be filed under several tags. It goes to 'useless' when no
    tag accepts it or when a tag-specific drop rule rejects it; in either case
    it is recorded there only once (the original appended it once per rejecting
    tag and then again in the no-match fallback, inflating the count).
    """
    # Phrases containing an "other elders" keyword that do not refer to a person
    dropwords = ['老爹鞋','门卫大叔','麻烦','快递小哥','拿快递的小哥','办公室','公子','公主','公举','公仔','脑公','公司']

    def _first_hit_is_inside(sentence, phrase, fragment):
        """True when the first occurrence of `fragment` is the one inside `phrase`."""
        position = sentence.find(phrase)
        return position >= 0 and position == sentence.find(fragment)

    # tag -> (drop test or None, words that hand the sentence over to another tag)
    special_rules = {
        # '公' / '叔' also appear in 老公 / 外公 / 相公 and in the dropwords
        '其他长辈': (lambda s: any(word in s for word in dropwords), ['老公','外公','相公']),
        # '麻' hit that is really '麻烦'; 爸妈 / 爹妈 are handled by another tag
        '母亲': (lambda s: _first_hit_is_inside(s, '麻烦', '麻'), ['爸妈','爹妈']),
        # '老爹' hit that is really '老爹鞋'; 爸妈 / 爹妈 are handled by another tag
        '父亲': (lambda s: _first_hit_is_inside(s, '老爹鞋', '老爹'), ['爸妈','爹妈']),
        # '婆' also appears in 老婆 / 外婆
        '婆婆': (None, ['老婆','外婆']),
        # '爷' also appears in 姥爷 / 老爷子
        '爷爷': (None, ['姥爷','老爷子']),
    }

    useless = []
    people_result = defaultdict(list)

    def people_parser(row):
        """Distribute every sentence of one cluster into people_result / useless."""
        for sentence in row.split(','):
            accepted = False     # filed under at least one tag
            discarded = False    # already recorded in useless by a drop rule
            for tag, keywords in key_dict.items():
                if not any(word in sentence for word in keywords):
                    continue
                is_noise, handover_words = special_rules.get(tag, (None, ()))
                if is_noise is not None and is_noise(sentence):
                    if not discarded:
                        useless.append(sentence)
                        discarded = True
                elif any(word in sentence for word in handover_words):
                    # Belongs to a different tag; let the remaining keys decide
                    continue
                else:
                    accepted = True
                    people_result[tag].append(sentence)
            if not accepted and not discarded:
                useless.append(sentence)
        return people_result

    for row in df[column_name]:
        people_parser(row)

    people_result['useless'] = useless

    final_result = [
        ['人群', tag2, len(tag2_sentences), ','.join(tag2_sentences)]
        for tag2, tag2_sentences in people_result.items()
    ]
    return pd.DataFrame(final_result, columns=['tag1', 'tag2', '评论数', '段落'])

In [54]:
# Load the audience keyword list and build {tag_2: [keyword, ...]}
tag = pd.read_excel(keyword_input_file, sheet_name='人群')
tag_keyword = tag.groupby('tag_2').apply(lambda x: list(itertools.chain(x['keyword'].tolist())))
people_tag_dict = dict(tag_keyword)
# Audience cleaning: keep the rows whose tag_1 is '人群' and run them through clean_people
data = pd.read_excel(cluster_input_file, sheet_name='人群')
people = data[data['tag_1'] == '人群']
people_result = clean_people(people, people_tag_dict, column_name='段落')

In [55]:
# Preview the first audience tags and their sentence counts
people_result.head()

Unnamed: 0,tag1,tag2,评论数,段落
0,人群,父亲,2350,"老爸穿着很合适,爸穿着很合适,老爸穿得很合适,老爸穿着也很合适,老爸穿上很合适,老爸穿很合适..."
1,人群,母亲,1444,"老妈穿着很合适,妈穿着很合适,妈穿着很合脚,老妈穿着挺合适,老妈穿上很好看,老妈穿上很合适,..."
2,人群,对象,126,"老伴穿着很合适,爱人穿着上班很合适,对象穿着挺舒服的,对象穿着很舒服,我家那位穿上很好看,对..."
3,人群,男朋友/老公,2535,"老公穿着很合适,老公穿着很合脚,老公穿得很合适,老公穿着挺合适,老公说他穿着很合适,我老公穿..."
4,人群,兄弟姐妹,933,"我弟穿着很合适,弟穿着很合适,大哥穿上很合适,弟穿的非常合适,弟穿上很合适,弟穿很合适,弟说..."


# 三、复购

#### 3.1 更新换代复购

In [56]:
def clean_update(df, column_name='段落'):
    """Clean the product-generation ('更新换代复购') clusters.

    Each cell of ``df[column_name]`` is a comma-joined cluster of sentences.
    A sentence is kept when it contains a generation/upgrade search term and
    none of the excluded terms; all other sentences are pooled into a single
    trailing 'useless' row.

    Returns a DataFrame with columns tag1, tag2, 评论数, 段落.
    """
    generation_terms = ['代','旧款','旧版','升级','相对以前']   # search terms
    excluded_terms = ['希望','替代','代替','代步']              # false hits for '代' etc.

    def is_update_sentence(text):
        """True when `text` talks about product generations and has no excluded term."""
        if not any(term in text for term in generation_terms):
            return False
        return not any(term in text for term in excluded_terms)

    leftovers = []   # sentences pooled into the 'useless' row
    table = []       # one output row per cluster

    for paragraph in df[column_name]:
        pieces = paragraph.split(',')
        kept = [piece for piece in pieces if is_update_sentence(piece)]
        leftovers.extend(piece for piece in pieces if not is_update_sentence(piece))
        table.append(['复购', '更新换代复购', len(kept), ",".join(kept)])

    table.append(['复购', 'useless', len(leftovers), ",".join(leftovers)])
    return pd.DataFrame(table, columns=["tag1", 'tag2', '评论数', '段落'])

#### 3.2 品牌复购

In [57]:
def clean_brand_rebuy(df, brand_list, column_name='段落'):
    """Split brand clusters into brand repurchase, brand affection and useless.

    Only sentences whose lower-cased text mentions a brand from ``brand_list``
    or one of the generic words 品牌 / 牌子 / 产品 / 系列 are considered; the
    rest go to 'useless'. A considered sentence is classified, in this order:

    1. brand repurchase - contains a repeat-purchase word or matches the
       repeat-purchase regex;
    2. brand affection  - contains a liking word and no negation character;
    3. brand repurchase - contains '失望' together with a negation character
       ("never disappointed");
    4. useless otherwise.

    Returns a three-row DataFrame (columns tag1, tag2, 评论数, 段落) pooling all
    clusters: '品牌复购', '品牌热爱', 'useless'.
    """
    generic_terms = ['品牌','牌子','产品','系列']
    brand_terms = brand_list + generic_terms
    repeat_terms = ['好几','好多','一如既往','一直','不是第一次','没得说','没的说','一如继往', '向来', '习惯穿', '长期穿']
    liking_terms = ['喜欢','值得','信得过','有保障','赞赏','认可','不愧','不错']
    letdown_terms = ['失望']
    negation_chars = ['没','不','未','少']
    repeat_pattern = r'([穿买].+年)|(.+年.+[穿买])|(买.+[次双])|([^1一][双次])'

    def contains_any(text, vocabulary):
        """True when `text` contains at least one entry of `vocabulary`."""
        return any(term in text for term in vocabulary)

    repurchased = []   # brand repurchase sentences
    liked = []         # brand affection sentences
    leftovers = []     # everything else

    def bucket_for(text):
        """Pick the list the sentence belongs to."""
        if not contains_any(text.lower(), brand_terms):
            return leftovers
        if contains_any(text, repeat_terms) or re.search(repeat_pattern, text) is not None:
            return repurchased
        negated = contains_any(text, negation_chars)
        if contains_any(text, liking_terms) and not negated:
            return liked
        if contains_any(text, letdown_terms) and negated:
            return repurchased
        return leftovers

    for paragraph in df[column_name]:
        for piece in paragraph.split(','):
            bucket_for(piece).append(piece)

    summary = [
        ['复购', '品牌复购', len(repurchased), ",".join(repurchased)],
        ['精神认同', '品牌热爱', len(liked), ",".join(liked)],
        ['复购', 'useless', len(leftovers), ",".join(leftovers)],
    ]
    return pd.DataFrame(summary, columns=["tag1", 'tag2', '评论数', '段落'])

#### 3.3 常规复购

In [58]:
# Custom search words for the repurchase classifier; RebuyCleaner also registers them with jieba.
# neu_begin: tokens that, directly before a repurchase keyword, mark a possible future repurchase.
neu_begin = ['会','考虑','贵','还会','再来','值得','直得','下次','下一次','以后','以后再','以后在','还','还来','会再',
          '值得再次','再','值得无限','有需要','有机会','有活动','还会再次','会在','还会再','还会在','会一直','打算再','打算在',
          '会来','想','还想','要','想再','想在','应该会','还要','下次再来','可以一直','准备','下次定','下次一定','下次优惠',
          '下次再','下次在','会持续','便于','可以','如果','会终身','会反复','打算','期待','必须','会再次','还会有','还会进行',
          '有待','继','回来','必','回再','也要','就','说要','回头再','回头在','推荐','建议','值的']
# opp_begin: negating tokens that, directly before a repurchase keyword, mark "will not repurchase".
opp_begin = ['不','不会','不会再','不会在','不愿意','再不','在不','不会再来','再也不','不是','不敢再','不敢','绝对不','本想再',
           '本想','不再','不在','不想','不太会','永不','再也不会','不太想再','不可能','再也没','没有']
# will_search=['活动','准备','冲动','机会','下次','等有货']
# will_rebuy_words: substrings that send a sentence through RebuyCleaner.is_will_rebuy.
will_rebuy_words = ['需要','如果','有合适的','以后','合适时机','下次','会回购','值得','有活动','想法','意愿','理由','打算',
             '合适价格','合适价','会再来','继续支持','会','准备','再购一双','需时','想','过几天','回购清单','指的',
             '关注','之后','期待换季','期待','要万年回购','好的话','有好价','冲动','考虑','合适的话','继续买']
# no_rebuy_words: substrings that classify a sentence as "will not repurchase".
no_rebuy_words = ['不要','再也不','下次不买','再没买','再不买','没有再买','以后不','以后都不','不会再','没有再']
first_buy_words = ['第一次买','第一双','第一款','头一次','首次','第一回','第一次入手','第一次穿','第一次购买']   # first purchase

In [59]:
class RebuyCleaner:
    """Rule-based classifier for repurchase ('复购') clusters.

    Every sentence of a cluster is sorted into one or more of these buckets:
    already repurchased (``have``), may repurchase (``will``), will not
    repurchase (``wont``), first purchase of a brand (``first_brand``), other
    first purchase (``first``), or the shared ``useless`` list.

    Usage: construct, ``load_data(df)``, then ``parse_cluster()``.

    Notes
    -----
    * ``parse_sentence`` tokenises with jieba and filters tokens against the
      module-level ``stopwords`` collection, which must exist before parsing.
    * ``useless`` is shared by all rows and is only reset by constructing a
      new instance.
    """

    def __init__(self, first_buy_words, brand_list, rebuy_words, neu_begin_list, opp_begin_list, no_rebuy_words, will_rebuy_words):
        """Store the keyword lists (all plain lists of str) and register them with jieba."""
        self.first_buy_words = first_buy_words
        self.brand_list = brand_list
        self.rebuy_words = rebuy_words
        self.neu_begin_list = neu_begin_list
        self.opp_begin_list = opp_begin_list
        self.no_rebuy_words = no_rebuy_words
        self.will_rebuy_words = will_rebuy_words
        self.useless = []

        addwords = first_buy_words + brand_list + rebuy_words + neu_begin_list + opp_begin_list + no_rebuy_words + will_rebuy_words
        # Add the keywords to jieba's dictionary as custom segmentation words;
        # the token-level checks below look these phrases up in word_list.
        for word in addwords:
            jieba.add_word(word)

    def load_data(self, df, parse_column='段落'):
        """Remember the frame to parse and the column holding the comma-joined clusters."""
        self.df = df
        self.col_name = parse_column

    def parse_cluster(self):
        """Parse every cluster and return the result table.

        Returns a DataFrame (columns tag1, tag2, 评论数, 段落) with five rows per
        input cluster (已经复购, 可能复购, 不会复购, 品牌首购, 其他首购) followed by
        one 'useless' row pooled over all clusters.
        """
        rebuy_result = []
        for row in self.df[self.col_name]:
            self.parse_row(row)
            rebuy_result.append(['复购', '已经复购', len(self.have), ",".join(self.have)])
            rebuy_result.append(['复购', '可能复购', len(self.will), ",".join(self.will)])
            rebuy_result.append(['复购', '不会复购', len(self.wont), ",".join(self.wont)])
            rebuy_result.append(['首购', '品牌首购', len(self.first_brand), ",".join(self.first_brand)])
            rebuy_result.append(['首购', '其他首购', len(self.first), ",".join(self.first)])
        rebuy_result.append(['复购', 'useless', len(self.useless), ",".join(self.useless)])
        return pd.DataFrame(rebuy_result, columns=['tag1', 'tag2', '评论数', '段落'])

    def parse_row(self, row):
        """Reset the per-cluster buckets and classify each sentence of `row`."""
        self.have = []             # already repurchased
        self.will = []             # may repurchase
        self.wont = []             # will not repurchase
        self.first_brand = []      # first purchase of a brand
        self.first = []            # other first purchase

        for sentence in row.split(','):
            self.parse_sentence(sentence)

    def parse_sentence(self, sentence):
        """Classify one sentence; results land in the bucket lists."""
        self.sentence = sentence
        self.word_list = [x for x in jieba.cut(self.sentence) if x not in stopwords]
        self.flag = 0  # set to 1 once a rule has made a final decision

        has_purchase_cue = re.search(
            r'购|买|来|光顾|选择|光临|次|失望|支持|关注|入|继续|下单|双|订|安排|推荐|有需要|牌|这家|店|屯|囤|收|体验|尝试|下手',
            self.sentence) is not None
        mentions_brand = any(word in self.sentence.lower() for word in self.brand_list)

        if has_purchase_cue or mentions_brand:
            # First-purchase detection never sets the flag, so the sentence
            # always continues through the repurchase rules as well.
            self.is_first_rebuy()
            # Call the rule methods directly (previously dispatched via exec on
            # strings); stop at the first rule that decides.
            for rule in (self.is_wont_rebuy, self.is_will_rebuy, self.final_rebuy_search):
                if self.flag != 0:
                    break
                rule()
        else:
            self.useless.append(self.sentence)

    # Brand first purchase vs. other first purchase
    def is_first_rebuy(self):
        """Handle first-purchase wording; only the first matching keyword is used."""
        for first_word in self.first_buy_words:
            if first_word not in self.word_list:
                continue
            mentions_brand = any(word in self.sentence.lower() for word in self.brand_list)
            if first_word == '第一次穿':
                # "first time wearing <brand>" counts; "first time wearing it
                # for a run" etc. is left to the later rules.
                if mentions_brand:
                    self.first_brand.append(self.sentence)
            else:
                index = self.word_list.index(first_word)
                # A negating/comparing token right before the keyword
                # ("not the first time I bought") means a repeat purchase.
                if index > 0 and self.word_list[index - 1] in ['没','没有','不是','要','比','和']:
                    self.have.append(self.sentence)
                elif mentions_brand:
                    self.first_brand.append(self.sentence)
                else:
                    self.first.append(self.sentence)
            break

    # Explicit "will not repurchase" phrases
    def is_wont_rebuy(self):
        """File the sentence under `wont` when it contains a no-repurchase phrase."""
        if any(word in self.sentence for word in self.no_rebuy_words):
            self.wont.append(self.sentence)
            self.flag = 1

    # Explicit "may repurchase" phrases (which can still turn out negative)
    def is_will_rebuy(self):
        """Decide sentences that contain a may-repurchase phrase."""
        if not any(word in self.sentence for word in self.will_rebuy_words):
            return

        if '都会回购' in self.sentence:
            # "I repurchase whenever I need it" => already repurchased
            self.have.append(self.sentence)
            self.flag = 1
        elif re.search(r'还|如果|的话', self.sentence) is not None:
            # e.g. "will come again" => certainly a possible repurchase
            self.will.append(self.sentence)
            self.flag = 1
        elif '不会后悔' in self.sentence:
            self.useless.append(self.sentence)
            self.flag = 1
        else:
            negated = (any(word in self.opp_begin_list for word in self.word_list)
                       or any(word in self.sentence for word in self.no_rebuy_words))
            if not negated:
                self.will.append(self.sentence)
                self.flag = 1
            elif '不会失望' not in self.sentence:
                self.wont.append(self.sentence)
                self.flag = 1
            # negated but "不会失望" ("won't be disappointed"): leave undecided
            # so final_rebuy_search gets a chance.

    # General keyword search
    def final_rebuy_search(self):
        """Fallback: classify by repurchase keyword and two regex patterns."""
        already_in_have = False
        for index, word in enumerate(self.word_list):
            if word not in self.rebuy_words:
                continue
            self.flag = 1
            if word in ['买了一双','买一双','一双', '买了一件','买了双']:
                # "bought a pair" only signals a repeat purchase together with
                # words such as "again" / "also" / "before".
                if any(cue in self.sentence for cue in ['又','也','回购','各','再','再买', '也是','还','和','次','之前']):
                    self.have.append(self.sentence)
                    already_in_have = True
                else:
                    self.useless.append(self.sentence)
            # Otherwise the token before the keyword decides the bucket
            elif index > 0 and self.word_list[index - 1] in self.neu_begin_list:
                self.will.append(self.sentence)
            elif index > 0 and self.word_list[index - 1] in self.opp_begin_list:
                if '不会失望' not in self.sentence:
                    self.wont.append(self.sentence)
            else:
                self.have.append(self.sentence)
                already_in_have = True
            break

        # Character classes were written as [帮|给] / [穿|买], which also matched
        # a literal '|'; the intended alternatives are just the two characters.
        brand_terms = self.brand_list + ['品牌','牌子']
        if re.search(r'又[帮给].+买', self.sentence) is not None:
            pattern_hit = True
        else:
            pattern_hit = (re.search(r'一直.+[穿买]', self.sentence) is not None
                           and any(term in self.sentence for term in brand_terms))
        if pattern_hit:
            # Do not count the sentence twice when the keyword loop already
            # filed it under `have`.
            if not already_in_have:
                self.have.append(self.sentence)
            self.flag = 1

        if self.flag == 0 and self.sentence not in (self.first + self.first_brand):
            self.useless.append(self.sentence)

In [60]:
# Load the first-purchase / repurchase clusters; `data` is reused by the brand and rebuy cells below
data = pd.read_excel(cluster_input_file, sheet_name='首购复购')
# Product-generation repurchase: rows whose tag_2 is '产品'
update = data[data['tag_2'] == '产品']
update_result = clean_update(update, column_name='段落')

In [61]:
# Preview the product-generation result
update_result.head()

Unnamed: 0,tag1,tag2,评论数,段落
0,复购,更新换代复购,195,"没有上次14代的那个设计合理,感觉这一代不如上一代用料足啊,感觉没有上一代用心了,这一代实物..."
1,复购,useless,674,"这个880替代真的不错,没有之前的那种高科技盒子,但相比alphafly那种后掌没有橡胶还是..."


In [62]:
# Load the brand list; names are lower-cased because the cleaners match against sentence.lower()
data_brand = pd.read_excel(keyword_input_file, sheet_name='品牌')
brand = data_brand['brand'].astype(str).str.lower().tolist()
# Brand repurchase: rows whose tag_2 is '品牌'
brand_data = data[data['tag_2'] == '品牌']
brand_result = clean_brand_rebuy(brand_data, brand_list=brand, column_name='段落',)

In [63]:
# Preview the brand repurchase / brand affection / useless rows
brand_result.head()

Unnamed: 0,tag1,tag2,评论数,段落
0,复购,品牌复购,1010,"迪卡侬的鞋没失望过,迪卡侬的鞋没的说,迪卡侬没失望过,迪卡侬的鞋子没的说,乔丹的鞋子没让我失..."
1,复购,品牌热爱,1300,"就喜欢斯凯奇的鞋底,太喜欢买的斯凯奇旅游鞋,就喜欢斯凯奇家的鞋,就喜欢斯凯奇的鞋子,就喜欢斯..."
2,复购,useless,2950,"对迪卡侬这个鞋失望极了,斯凯奇的鞋子没问题,安踏的鞋子原来越失望了,斯凯奇的鞋真心好穿,斯凯..."


In [64]:
# Generic repurchase clusters: will-not-repurchase, first purchase, may repurchase, already repurchased
# Load the repurchase keywords
rebuy_keyword_table = pd.read_excel(keyword_input_file, sheet_name='复购')
rebuy_keywords = rebuy_keyword_table['keyword'].tolist()
# Build a new list instead of brand.append('碳板'): the in-place append grew
# `brand` by one more '碳板' every time this cell was re-run.
rebuy_brand_list = brand + ['碳板']

rebuy_clusters = data[data['tag_2'] == '复购']
clean_rebuy = RebuyCleaner(first_buy_words, rebuy_brand_list, rebuy_keywords, neu_begin, opp_begin, no_rebuy_words, will_rebuy_words)
clean_rebuy.load_data(rebuy_clusters, parse_column='段落')
result = clean_rebuy.parse_cluster()

In [65]:
# Combine the cleaned results of all repurchase categories and drop rows with no sentences
rebuy_result = pd.concat([update_result, brand_result, result])
rebuy_result = rebuy_result.query("评论数 > 0")

In [66]:
# Preview the combined repurchase result (the index repeats because concat keeps each frame's own index)
rebuy_result.head()

Unnamed: 0,tag1,tag2,评论数,段落
0,复购,更新换代复购,195,"没有上次14代的那个设计合理,感觉这一代不如上一代用料足啊,感觉没有上一代用心了,这一代实物..."
1,复购,useless,674,"这个880替代真的不错,没有之前的那种高科技盒子,但相比alphafly那种后掌没有橡胶还是..."
0,复购,品牌复购,1010,"迪卡侬的鞋没失望过,迪卡侬的鞋没的说,迪卡侬没失望过,迪卡侬的鞋子没的说,乔丹的鞋子没让我失..."
1,复购,品牌热爱,1300,"就喜欢斯凯奇的鞋底,太喜欢买的斯凯奇旅游鞋,就喜欢斯凯奇家的鞋,就喜欢斯凯奇的鞋子,就喜欢斯..."
2,复购,useless,2950,"对迪卡侬这个鞋失望极了,斯凯奇的鞋子没问题,安踏的鞋子原来越失望了,斯凯奇的鞋真心好穿,斯凯..."


In [198]:
# Export the combined repurchase result to an Excel workbook in the working directory
rebuy_result.to_excel("复购全部.xlsx")

# 四、场景

In [95]:
def clean_situation(df, key_dict, column_name='段落'):
    """Assign usage-scenario ('适用场景') sentences to scenario tags with a polarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` cells are comma-joined clusters of sentences.
    key_dict : dict
        Maps a tag_2 scenario name to the list of keywords that select it.
    column_name : str
        Column holding the clusters.

    Returns
    -------
    pandas.DataFrame
        Columns tag1, tag2, 情感倾向, 评论数, 段落: the 'positive' rows, then the
        'negative' rows, then one 'useless' row (情感倾向 is None) with the
        sentences that matched no scenario.

    Fixes over the previous version:
    * keyword hits that are only part of an unrelated word (街舞, 运动鞋,
      健步如飞, 体育用品) are now really treated as "not found"; the old helper
      decremented a local copy of the counter, so such sentences silently
      vanished instead of reaching 'useless';
    * an empty ``df`` no longer raises UnboundLocalError.
    """
    # Negation / complaint words
    none = ['不合适','不适合','不能','不太','无法','没法','还是算了','就算了','不推荐','不能作为','不能拿来','免了','别说了','别想了','差',
      '更别说','千万别','别说','不舒服','慎重','疼','痛','累','烧','磨','硌','不是太好','重','硬','受伤','不好','不太能','挤','沉','顿',
      '自求多福','别考虑','三思','捂','顶脚','崴','压脚','夹脚','臭','不透气','难受','不舒适','勒','不是很合适','不太行','闷','不行',
      '更不要说','不太好','不要','不跟脚','磕','别扭','咯吱','嘎吱','滑','不是很好','不建议','不适宜','不够','不足','不是很舒服','不稳定',
      '不怎么舒服','泡','失望','问题','不用来','不指望','不是','不可以','不大行','不大舒服','不方便','有点影响','不是很方便','不好控制','没有提升',
      '不适宜']
    # Negative-feeling words that turn positive when preceded by a negation
    neg_sentiment = ['疼','痛','累','烧','磨','硌','重','硬','受伤','挤','沉','顿','捂','顶脚','崴','压脚','夹脚','臭','难受','勒','闷',
               '磕','别扭','咯吱','嘎吱','滑','泡','失望','问题','差']

    useless = []                                   # sentences without any scenario
    situation_positive = defaultdict(list)         # tag -> positive sentences
    situation_negative = defaultdict(list)         # tag -> negative sentences

    mild_rejections = ['不太能','不合适','不是很合适','不适合','不是很适合']

    def refresh_keyword(k, sentence, key, dropword, searchword):
        """Return 1 when, for tag `key`, the first hit of `searchword` is the one
        inside `dropword` (e.g. '街' found inside '街舞'), i.e. a false hit."""
        if k != key:
            return 0
        position = sentence.find(dropword)
        return int(position >= 0 and sentence.find(searchword) == position)

    def divide_sentiment(k, forsearch_sentence):
        """File the sentence under tag `k` as positive or negative."""
        def has_any(words):
            return any(word in forsearch_sentence for word in words)

        # No negation word at all => positive
        if not has_any(none):
            situation_positive[k].append(forsearch_sentence)
            return

        # Special case 1: '不太' / '不是' are negative only next to a suitability word
        if has_any(['不太','不是']):
            if has_any(['适合','合适','建议','推荐','舒服','不太行','不太能','不太好','不是专业']):
                situation_negative[k].append(forsearch_sentence)
            else:
                situation_positive[k].append(forsearch_sentence)
        # Special case 2: '差' inside 出差 / 差不多 carries no negative meaning
        elif has_any(['出差','差不多']):
            situation_positive[k].append(forsearch_sentence)
        # Special case 3: negative-feeling words
        elif has_any(neg_sentiment):
            # 3-1 a negating / protective character flips the meaning to positive
            if has_any(['不','没','无','防','耐']):
                situation_positive[k].append(forsearch_sentence)
            # 3-2 '重' inside a neutral word
            elif has_any(['重新','重力','体重','重量','负重','重庆']):
                situation_positive[k].append(forsearch_sentence)
            # 3-3 '滑' inside a sport name
            elif has_any(['滑板','滑雪']):
                situation_positive[k].append(forsearch_sentence)
            # 3-4 everything else is negative
            else:
                situation_negative[k].append(forsearch_sentence)
        # Remaining hits of the negation list
        else:
            situation_negative[k].append(forsearch_sentence)

    def situation_parser(row):
        """Distribute every sentence of one cluster into the result containers."""
        for sentence in row.split(','):
            keynum = 0    # number of scenario tags found in this sentence
            # Keyword-based tagging
            for k, v in key_dict.items():
                if not any(word in sentence for word in v):
                    continue

                keynum += 1

                # Special case 1: outfit matching has its own polarity rules
                if k == '搭配':
                    if '不好搭配' in sentence:
                        situation_negative[k].append(sentence)
                    elif '难搭' in sentence and '不' not in sentence:
                        situation_negative[k].append(sentence)
                    else:
                        situation_positive[k].append(sentence)

                # Special case 2: daily leisure
                elif k == '日常休闲':
                    if refresh_keyword(k, sentence, '日常休闲', '街舞', '街'):
                        keynum -= 1   # '街' inside '街舞' counts as not found
                        continue
                    if any(word in sentence for word in mild_rejections):
                        situation_negative[k].append(sentence)
                    else:
                        situation_positive[k].append(sentence)

                # Special case 3: '跑步' found only as part of '跑步鞋' => skip this tag
                elif k == '跑步' and '跑步鞋' in sentence and sentence.find('跑步') == sentence.find('跑步鞋'):
                    continue

                # Everything else
                else:
                    false_hit = max(
                        refresh_keyword(k, sentence, '其他运动', '运动鞋', '运动'),
                        refresh_keyword(k, sentence, '健步', '健步如飞', '健步'),
                        refresh_keyword(k, sentence, '体育课及考试', '体育用品', '体育'),
                    )
                    if false_hit:
                        keynum -= 1   # keyword was only part of an unrelated word
                    else:
                        divide_sentiment(k, sentence)

            # Regex / phrase based tagging
            # 1 - "跳X舞" (dancing), except square dancing
            if re.search(r'跳.+舞', sentence) is not None and '跳广场舞' not in sentence:
                keynum += 1
                divide_sentiment('跳舞', sentence)
            # 2 - "wear it day to day"
            if any(word in sentence for word in ['平时穿','平常穿']):
                keynum += 1
                if any(word in sentence for word in mild_rejections):
                    situation_negative['日常休闲'].append(sentence)
                else:
                    situation_positive['日常休闲'].append(sentence)

            if keynum == 0:
                useless.append(sentence)
        return situation_positive, situation_negative

    # Clean row by row; the containers are filled in place
    for row in df[column_name]:
        situation_parser(row)

    # Assemble the result table
    final_result = []
    for tag2, tag2_sentences in situation_positive.items():
        final_result.append(('适用场景', tag2, 'positive', len(tag2_sentences), ",".join(tag2_sentences)))

    for tag2, tag2_sentences in situation_negative.items():
        final_result.append(('适用场景', tag2, 'negative', len(tag2_sentences), ",".join(tag2_sentences)))

    final_result.append(('适用场景', "useless", None, len(useless), ",".join(useless)))

    final_result = pd.DataFrame(final_result, columns=['tag1', 'tag2', '情感倾向', '评论数', '段落'])
    return final_result

In [68]:
# Load the usage-scenario clusters and keep the rows whose tag_1 is '适用场景'
# (this rebinds `data`, previously the '首购复购' sheet)
data = pd.read_excel(cluster_input_file, sheet_name='适用场景')
data = data[data['tag_1'] == '适用场景']

In [69]:
# Load the usage-scenario keyword list and build {tag_2: [keyword, ...]}
tag = pd.read_excel(keyword_input_file, sheet_name='场景')
tag_keyword=tag.groupby('tag_2').apply(lambda x: list(itertools.chain(x['keyword'].tolist())))
tag_keyword = dict(tag_keyword)

In [70]:
# Tag every scenario sentence and split it into positive / negative / useless
situation_result = clean_situation(data, tag_keyword, column_name='段落')

In [225]:
# Export the scenario result to an Excel workbook in the working directory
situation_result.to_excel("场景清理.xlsx")

In [71]:
# Preview the scenario result
situation_result.head()

Unnamed: 0,tag1,tag2,情感倾向,评论数,段落
0,适用场景,跑步,正面,27881,"跑步起来脚感很棒,跑起步来脚感很好,跑步脚感很棒,跑起来脚感超棒,跑步脚感非常棒,跑起来脚感..."
1,适用场景,走路,正面,6575,"走路脚感也很棒,走路脚感很棒,走路的脚感也非常棒,走路跑步脚感不错,走路脚感也很好,跑步走路..."
2,适用场景,跳绳,正面,1059,"跳绳跑步轻盈感挺好,跑步跳绳都特别轻便,跑步跳绳都很轻便,跑步跳绳刚好而且很轻,跑步走路跳绳..."
3,适用场景,其他运动,正面,3047,"运动起来很跟脚,跑步运动很轻便,运动起来很舒适,跑步锻炼时感觉很有力,跑步的时候略微的弹跳力..."
4,适用场景,健身,正面,674,"跑步的时候略微的弹跳力很适合健身运动,健身休闲百搭,感觉挺适合跑步健身的,跑步健身挺舒服的,..."


In [88]:
# Merge all categories; only situation_result has the '情感倾向' column, so it is NaN for the other frames
all_result = pd.concat([endorsement_result, domestic_result, people_result, rebuy_result, situation_result])
# Drop every pooled 'useless' row before exporting
all_result['is_useless'] = all_result['tag2'].str.contains('useless')
all_result = all_result[all_result['is_useless'] == False]
all_result = all_result.drop(['is_useless'], axis=1)
# NOTE(review): the step_data/ directory must already exist - to_csv does not create it
all_result.to_csv('step_data/part4_clean_special_clusters_all.csv', index=False)