In [144]:
import os
import json
import itertools
import re
from collections import defaultdict, Counter

import jieba
import jieba.analyse as analyse
import numpy as np
import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=16)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [169]:
# Chinese stop-word list, gathered from every file in STOPWORDS_DIR whose name contains '.txt'.
STOPWORDS_DIR = '/data/daveting/other_project/explore/tfidf/stopwords'
chn_stop_words = set()

for file in os.listdir(STOPWORDS_DIR):
    if '.txt' not in file:
        continue
    # Decode explicitly instead of relying on the locale's default encoding.
    # NOTE(review): assumes the stop-word files are UTF-8 — confirm.
    with open(os.path.join(STOPWORDS_DIR, file), 'r', encoding='utf-8') as f:
        # One stop word per line; strip the trailing newline and surrounding blanks.
        chn_stop_words.update(line.strip() for line in f)

# English function words (plus the URL fragments "http" and "cn") that are
# merged into the same stop-word set as the Chinese entries.
STOP_WORDS = {
    "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
    "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
    "this", "then", "at", "have", "all", "not", "one", "has", "or", "that", "http", "cn",
}

# In-place union: chn_stop_words keeps its identity and gains the English entries.
chn_stop_words |= STOP_WORDS
len(chn_stop_words)

2315

In [186]:
def clean_cut(x, mode='nodup'):
    """Tokenise ``x`` with jieba and drop stop words and uninformative tokens.

    Args:
        x: Text to segment.
        mode: ``'nodup'`` collapses the tokens into a set first, so each
            distinct token appears at most once (in set iteration order).
            Any other value keeps every occurrence in segmentation order.

    Returns:
        A list of tokens that are not in ``chn_stop_words`` and that contain
        at least one character in U+4E00..U+9FFF or one ASCII digit.
    """
    tokens = jieba.lcut(x)
    if mode == 'nodup':
        tokens = set(tokens)

    kept = []
    for token in tokens:
        if token in chn_stop_words:
            continue
        # Tokens made only of letters, punctuation, emoji, etc. are discarded.
        if re.search(u'[\u4e00-\u9fff0-9]', token):
            kept.append(token)
    return kept

# 正则字符范围说明（clean_cut 目前只保留含中文或数字的词）：
# \u4e00-\u9fff 中文
# a-zA-Z 英文（当前正则未包含，纯英文词会被过滤掉）
# 0-9 数字

In [187]:
def gen_idf(texts):
    """Build an inverse-document-frequency table from a collection of texts.

    Each text is tokenised by ``clean_cut`` in its default ``'nodup'`` mode,
    so a token is counted at most once per text and the counts below are
    document frequencies.

    Args:
        texts: Sequence of strings, one per document.

    Returns:
        Dict mapping token -> ``log(n_docs / (doc_freq + 1))``. The value is
        negative for a token that occurs in every document.
    """
    # Tokenise in parallel through pandarallel's Series.parallel_apply.
    cuts = pd.Series(texts).parallel_apply(clean_cut).values
    n_docs = len(cuts)

    # Accumulate, per token, the number of documents it occurs in.
    doc_freq = Counter()
    for tokens in cuts:
        doc_freq.update(tokens)

    return {word: np.log(n_docs / (freq + 1)) for word, freq in doc_freq.items()}

$$
\mathrm{idf}(w) = \log\left(\frac{\text{文档数}}{\text{出现该词的文档数} + 1}\right)
$$

In [216]:
# Spot-check log(N / (df + 1)) at two boundary cases.
# NOTE(review): the trailing labels are the author's example tokens; how they map to these numbers is not shown — confirm.
np.log(600000 / 600001)  # df == N, ratio below 1, so idf is slightly negative (labelled "礼物", "gift")
np.log(100 / 100)  # df + 1 == N, so idf is exactly 0 (labelled "拓哉", "Takuya")

-1.6666652777853802e-06

0.0

In [218]:
# Invert idf = log(N / (df + 1)): with N = 600000 and idf = 0.91 this gives df + 1.
600000 / np.exp(0.91) 

241514.5344201816

In [188]:
def tfidf(sentence, idf, topK=5):
    """Rank the tokens of ``sentence`` by TF-IDF.

    Args:
        sentence: Text to analyse; tokenised with ``clean_cut(mode='dup')``
            so repeated tokens raise the term frequency.
        idf: Mapping token -> idf weight. Tokens missing from it are ignored,
            although they still count towards the total token count.
        topK: Maximum number of results to return.

    Returns:
        List of ``(token, score)`` pairs, highest score first; ties are
        ordered by token in descending order.
    """
    counts = Counter(clean_cut(sentence, mode='dup'))
    total = sum(counts.values())

    scored = []
    for word, freq in counts.items():
        if word not in idf:
            continue
        # The score leads the tuple so that tuple ordering sorts by score.
        scored.append((freq / total * idf[word], word))

    ranked = sorted(scored, reverse=True)
    return [(word, score) for score, word in ranked[:topK]]

# 木村拓哉

In [203]:
# Collect the 'text_raw' field of every crawled post stored as JSON under the account directory.
texts = []
for root, dirs, files in os.walk("/data/daveting/other_project/ad_hoc/Tiger/2022_weibo_crawler/posts/6883966016/"):
    for name in files:
        # JSON interchange text is UTF-8; do not depend on the locale's default encoding.
        with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
            s = json.load(f)
        texts.append(s['text_raw'])

len(texts)

977

In [204]:
# Peek at the first five posts.
texts[:5]

['已经收到了祥君的礼物〜！谢谢🤙✨  拓哉 \u200b\u200b\u200b',
 'CM拍摄结束！很开心〜！👍✨ 拓哉 \u200b\u200b\u200b',
 'change‼️  TAK \u200b\u200b\u200b',
 '早上好！今天的拍摄也要全力以赴……。希望，大家也是一样！ 拓哉 \u200b\u200b\u200b',
 '啊〜‼️\n用了肌肉〜❗️🤪✨ 拓哉 \u200b\u200b\u200b']

In [205]:
# Build the idf table from the crawled posts, then look up one token.
idf = gen_idf(texts)
w = '拓哉'  # token that closes most of the previewed posts
idf[w]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=62), Label(value='0 / 62'))), HBox…

0.7554364419822376

In [208]:
# Compare the idf of two tokens that both occur in the first post.
idf['礼物']
idf['拓哉']

5.498192290922892

0.7554364419822376

In [211]:
# Custom idf: score the first post against the table built by gen_idf.
# '拓哉' is appended four extra times, which raises its term frequency.
tfidf(texts[0] + '拓哉拓哉拓哉拓哉', idf, topK=10)

[('了祥君', 0.6879266079425375),
 ('礼物', 0.610910254546988),
 ('收到', 0.5208068971896181),
 ('拓哉', 0.4196869122123542),
 ('谢谢', 0.3764421211751447)]

In [200]:
# Default idf: baseline from jieba's own keyword extractor on the unmodified first post.
# NOTE(review): presumably this uses jieba's bundled idf table — confirm against jieba docs.
analyse.extract_tags(texts[0], topK=10, withWeight=True)

[('了祥君', 1.9924612504833332),
 ('拓哉', 1.9924612504833332),
 ('礼物', 1.2777111543766666),
 ('谢谢', 1.2287866223916668),
 ('收到', 1.1132620922983334),
 ('已经', 0.57009144094)]

# 云南白药

In [212]:
# Load the Weibo post export and parse the 'shijian' column into datetimes.
df = pd.read_csv('/data/daveting/other_project/explore/tfidf/weibo_posts_云南白药_2013_2019.csv', low_memory=False)
df['shijian'] = pd.to_datetime(df['shijian'])
df.shape
df.head(2)

(1078359, 7)

Unnamed: 0,id,bid,zhengwen,zhuanfa,pinglun,dianzan,shijian
0,3535607235832469,zeWj1oti5,感情是长在身上的一块肉，我很爱护这块肉，有天，你硬生生的割下这块肉，我很痛，眼泪和血都模糊了...,2,4,0.0,2013-01-17 23:51:00
1,3535245321834772,zeMTi7Hj6,话说我好像后知后觉的发现了一件不得了的事情。云南白药广告的主题歌“云南白药创可贴，伤口好得快...,0,0,0.0,2013-01-16 23:53:00


In [213]:
# Post bodies ('zhengwen') with missing values and exact duplicates removed.
tmp = df['zhengwen'].dropna().drop_duplicates()
tmp.shape

(601253,)

In [214]:
# Rebuild the idf table from the de-duplicated post bodies; this rebinds `idf`,
# so every later tfidf call scores against this table.
idf = gen_idf(tmp.values)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=37579), Label(value='0 / 37579')))…

In [220]:
# Spot-check idf values for a few tokens in the table just built.
idf['云南白药']
idf['股价']
idf['云南']
idf['白药']

0.9110868579591743

6.050473850303969

4.443155331104031

4.770363679654608

In [222]:
# De-duplicated post bodies dated 2014-04-01 through 2014-04-20.
# NOTE(review): pd.to_datetime('2014-4-20') is midnight at the start of that day and
# 'shijian' carries a time of day, so posts later on 2014-04-20 are excluded — confirm intended.
df2 = df.loc[df['shijian'].between(pd.to_datetime('2014-4-1'), pd.to_datetime('2014-4-20')), 'zhengwen'].dropna().drop_duplicates()
df2.shape
df2.sample(3)  # no random_state is passed, so the rows shown differ between runs

(13841,)

68548    【云南白药正式承认其配方含断肠草之前对此讳莫如深】|中医讲究阴阳平衡以毒攻毒，是药三分毒。这...
67195    #云南白药含断肠草#我只说一个事情，比云南白药含断肠草知名度更高的：火腿肠含亚硝酸钠详情:h...
57795    发表了博文《云南白药中药守秘难解有毒质疑国家机密成挡箭牌》-云南白药中药守秘难解有毒质疑国家...
Name: zhengwen, dtype: object

In [223]:
# Join every post in the current df2 window into one document and list its
# ten highest-scoring terms against the current idf table.
tfidf('\n'.join(df2.values), idf, topK=10)

[('断肠草', 0.07794465723887414),
 ('含', 0.06708027737561896),
 ('配方', 0.054606919696410225),
 ('云南白药', 0.05254741777513055),
 ('草乌', 0.03699895559555723),
 ('成分', 0.032707275234727703),
 ('毒性', 0.03059372328422574),
 ('8sJF6bq', 0.02782037163972843),
 ('国家', 0.027022033671129764),
 ('公开', 0.024695028186425026)]

In [224]:
# De-duplicated post bodies dated 2016-12-01 through 2016-12-31.
# NOTE(review): pd.to_datetime('2016-12-31') is midnight at the start of that day and
# 'shijian' carries a time of day, so posts later on 2016-12-31 are excluded — confirm intended.
df2 = df.loc[df['shijian'].between(pd.to_datetime('2016-12-1'), pd.to_datetime('2016-12-31')), 'zhengwen'].dropna().drop_duplicates()
df2.shape
df2.sample(3)  # no random_state is passed, so the rows shown differ between runs

(4930,)

680710    早上给胖子擦屁股，不知道怎么就出血了，一屁股血，吓得只想马上送她去医院。可是一堆逼逼歪歪的人...
678677    老妈太拼，发过来的锅中藏药，这药是1990年的云南白药还有纱布，爷爷在卫生局工作时医院送的，...
681315           上个班手机电脑看了一天了，眼睛👀好酸好累啊，赶快戴个云南白药眼罩睡觉觉了#叫我仙女#
Name: zhengwen, dtype: object

In [225]:
# Join every post in the current df2 window into one document and list its
# ten highest-scoring terms against the current idf table.
tfidf('\n'.join(df2.values), idf, topK=10)

[('云南白药', 0.04179077750735984),
 ('控股', 0.03850680925102201),
 ('混改', 0.035238800115016664),
 ('新华', 0.03357750254931906),
 ('漱口水', 0.02769692346611385),
 ('微商', 0.02395615686988324),
 ('白药', 0.02267237263057605),
 ('股权', 0.02261067240609939),
 ('眼罩', 0.02135390131733582),
 ('链接', 0.019501096933561914)]

In [228]:
# De-duplicated post bodies dated 2018-10-20 through 2018-11-10.
# NOTE(review): pd.to_datetime('2018-11-10') is midnight at the start of that day and
# 'shijian' carries a time of day, so posts later on 2018-11-10 are excluded — confirm intended.
df2 = df.loc[df['shijian'].between(pd.to_datetime('2018-10-20'), pd.to_datetime('2018-11-10')), 'zhengwen'].dropna().drop_duplicates()
df2.shape
df2.sample(3)  # no random_state is passed, so the rows shown differ between runs

(10765,)

856587    云南白药配方不明，存在潜在隐患，应予以公布。可口可乐配方不明，存在潜在隐患，应予以公布。不过...
847529    //@白菜活动分享大V:反馈还可以叠加云南白药牙膏满99减3049-48券可叠加资生堂99-...
843665    //分享网易新闻:《女医生曝光云南白药牙膏含处方药疑似已经辞职》O女医生曝光云南白药牙膏含处...
Name: zhengwen, dtype: object

In [229]:
# Join every post in the current df2 window into one document and list its
# ten highest-scoring terms against the current idf table.
tfidf('\n'.join(df2.values), idf, topK=10)

[('牙膏', 0.07854289646656587),
 ('环酸', 0.06194941791652266),
 ('成分', 0.05646593153151725),
 ('氨甲', 0.048081808935425845),
 ('云南白药', 0.0464573058578903),
 ('西药', 0.03898107454646201),
 ('止血', 0.03582830950885373),
 ('配方', 0.02922376238315701),
 ('添加', 0.027398042362447916),
 ('处方药', 0.02692016416067864)]