In [8]:
import os
import re
import json
import pandas as pd
import numpy as np
import jieba
import jiagu
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# 1. 对新闻文本数据进行预处理

In [2]:
# 获得新闻文本数据
# 获取新闻文本数据文件路径
news_path_ori = './news/data'
news_data = []
# 只提取跟评论数据相同时间段的新闻文本数据
for i in os.listdir(news_path_ori):
    news_path = os.path.join(news_path_ori, i)
    with open(news_path, 'r', encoding='utf-8') as f:
        datas = json.load(f)
        for data in datas:
            dic = {}
            dic['新闻发布日期'] = '2020-' + data['time'].split()[0]
            dic['新闻发布时间'] = data['time'].split()[1]
            dic['新闻url'] = data['url']
            dic['新闻标题'] = data['meta']['title'].replace(' ', '')
            dic['新闻正文'] = data['meta']['content'].replace(' ', '')
            dic['新闻描述'] = data['meta']['description'].replace(' ', '')
            dic['新闻关键词'] = ' '.join(data['meta']['keyword'])
            dic['新闻类型'] = data['meta']['type'].replace(' ', '')
            news_data.append(dic)
print('合计获取到{}条新闻文本数据'.format(len(news_data)))

合计获取到502550条新闻文本数据


In [3]:
# 转成DataFrame格式，剔除content无空的数据，并进行去重
news_data_df = pd.DataFrame(news_data)
news_data_df = news_data_df.fillna('')
news_data_df = news_data_df[news_data_df['新闻正文']!='']
news_data_df = news_data_df[news_data_df['新闻url']!=''] 
news_data_df.drop_duplicates(subset=['新闻url'], inplace=True)
print('去重后，剩余{}条新闻文本数据'.format(len(news_data_df)))
news_data_df.head()

去重后，剩余499649条新闻文本数据


Unnamed: 0,新闻发布日期,新闻发布时间,新闻url,新闻标题,新闻正文,新闻描述,新闻关键词,新闻类型
0,2020-02-27,00:00,https://finance.sina.com.cn/money/future/roll/...,弱成本&amp;弱供需PTA顺势下行,热点栏目自选股数据中心行情中心资金流向模拟交易客户端原标题：【PTA】弱成本&弱供需PTA顺...,弱成本&amp;弱供需PTA顺势下行,PTA 下行,news
1,2020-02-27,02:32,https://finance.sina.com.cn/chanjing/cyxw/2020...,英媒：中国养蜂人遭“封锁”新冠病毒“蜇疼”蜂蜜生产国,原标题：英媒：新冠病毒“蜇疼”中国养蜂人路透社2月26日文章，原题：随着中国养蜂人遭“封锁”...,英媒：中国养蜂人遭“封锁”新冠病毒“蜇疼”蜂蜜生产国,疫情 新冠肺炎,news
2,2020-02-27,02:32,https://finance.sina.com.cn/world/gjjj/2020-02...,美媒：中国超级富豪人数超过美印之和,原标题：美媒：中国超级富豪人数超过美印之和雅虎财经网2月26日文章，原题：胡润全球富豪榜：中...,美媒：中国超级富豪人数超过美印之和,,news
3,2020-02-27,02:32,https://finance.sina.com.cn/world/gjjj/2020-02...,英媒：疫情导致中国的野生动物养殖面临困境,原标题：英媒：中国的野生动物养殖面临困境英国《卫报》2月25日文章，原题：因新冠病毒导致的野...,英媒：疫情导致中国的野生动物养殖面临困境,野生动物 新冠肺炎,news
4,2020-02-27,02:33,https://news.sina.com.cn/w/2020-02-27/doc-iimx...,印度：停职官员要坐“反省椅”等待最后“判决”,原标题：印度：停职官员要坐“反省椅”[环球时报综合报道]“停职不是带薪假。”印度中央邦的瓜廖...,停职官员要坐“反省椅”,印度,news


## 1.1为没有关键词的新闻提取关键词
- 对关键词为空的新闻正文，使用jiagu的findword接口进行新词发现
- 将新词加载到jieba的用户词典
- 使用jiagu的keywords接口进行关键词的提取

In [7]:
# 查看有多少新闻的关键词为空
print('缺失关键词的数据量为:', len(news_data_df[news_data_df['新闻关键词']=='']))

缺失关键词的数据量为: 130451


In [16]:
# 新词发现
no_keyword_news_data = list(news_data_df[news_data_df['新闻关键词']=='']['新闻正文'])
no_keyword_news_data = list(news_data_df[news_data_df['新闻关键词']=='']['新闻正文'])
with open('news/preprocess_data/no_keywords_text.txt', 'w', encoding='utf8') as f:
    for i in range(len(no_keyword_news_data)):
        f.write(no_keyword_news_data[i])
        f.write('\n')

In [17]:
jiagu.findword('news/preprocess_data/no_keywords_text.txt', 'news/preprocess_data/new_words.txt')

In [55]:
# 读取jieba的词典数据
jieba_lst = []
with open('/Users/apple/opt/anaconda3/envs/pytorch/lib/python3.7/site-packages/jieba/dict.txt', 'r') as f:
    for i in f.readlines():
        jieba_lst.append(i.strip().split(' ')[0])
jieba_lst = set(jieba_lst)
# 取获得前3000个新词
new_words = []
with open('news/preprocess_data/new_words.txt', 'r') as f:
    for i in f.readlines():
        new_words.append(i.strip().split('\t')[0])
new_words = new_words[:3000]
new_words = set(new_words)
# 把新词中在jieba词典里的词剔除
diff = []
for i in new_words:
    if i not in jieba_lst:
        diff.append(i)
print(len(diff))
# 将获得新词保存
with open('./news/preprocess_data/new_word_dict.txt', 'w', encoding='utf-8') as f:
    for i in diff:
        f.write(i + '\n')

625


In [57]:
# 对发现的新词进行筛选后，得到261个新词，导入jieba的用户词典中
jieba.load_userdict('./news/preprocess_data/new_word_dict.txt')
# 测试
for word in jieba.cut('受新冠疫情影响，宁德时代董事长说，本次经济下行已成定局'):
    print(word)

受
新冠疫情
影响
，
宁德时代
董事长
说
，
本次
经济
下行
已成定局


In [58]:
# 查看存在的关键词的数量分布，发现中位数以及平均数为7，因此选择7作为关键词数量进行提取
news_data_df[news_data_df['新闻关键词']!='']['新闻关键词'].apply(lambda x: len(x)).describe()

count    369198.000000
mean          7.586447
std           4.132212
min           1.000000
25%           4.000000
50%           7.000000
75%          10.000000
max          56.000000
Name: 新闻关键词, dtype: float64

In [80]:
# 选择数量7进行关键词数量进行提取
# 加载停用词表
stop_words = set()

with open('utils/stop_words.txt', 'r', encoding='utf8') as f:
    for line in f:
        word = line.strip()
        if not word:
            continue
        stop_words.add(word)


def textrank_keyword(text, top_k=7):
    '''使用jiagu的textrank接口进行关键词提取'''
    text = text.replace(' ', '')
    text = [word for word in jieba.cut(text) if word not in stop_words]
    keywords = jiagu.keywords(''.join(text), top_k)
    keywords = ' '.join(keywords)
    return keywords

In [82]:
# 使用pandarallel进行加速
news_data_df.loc[news_data_df['新闻关键词']=='', '新闻关键词'] = \
                news_data_df[news_data_df['新闻关键词']=='']['新闻正文'].parallel_apply(textrank_keyword)
print(len(news_data_df[news_data_df['新闻关键词']=='']))

0


In [83]:
# 将新闻发布日期、新闻发布url、新闻关键词进行保存
news_data_save = news_data_df[['新闻发布日期', '新闻url', '新闻标题', '新闻关键词']]
news_data_save.to_csv('./news/preprocess_data/新闻正文关键词.csv', index=False)

### 查看目前所有关键词，并对出现较高的那1000个关键词提取出来进行人工筛查，将一些有问题的关键词剔除

In [84]:
top1000_keyword = {}
for keywords in news_data_save['新闻关键词']:
    for i in keywords.split():
        top1000_keyword[i] = top1000_keyword.get(i, 0) + 1
top1000_keyword = sorted(top1000_keyword.items(), key=lambda d: d[1], reverse=True)[:1000]

In [85]:
with open('./news/preprocess_data/关键词-待筛选.txt', 'w', encoding='utf8') as f:
    for i in top1000_keyword:
        f.write('{}\t{}\n'.format(i[0], i[1]))

In [86]:
# 经过筛选后，有749个关键词保留下来
keyword_need = []
with open('./news/preprocess_data/关键词-已筛选.txt', 'r') as f:
    for line in f:
        keyword_need.append(line.split('\t')[0])

In [87]:
keyword_not_need = np.setdiff1d(np.array([word[0] for word in top1000_keyword]), np.array(keyword_need))
print('需要剔除的关键词有{}个'.format(len(keyword_not_need)))

需要剔除的关键词有330个


In [89]:
with open('./news/preprocess_data/关键词-待剔除.txt', 'w') as f:
    for i in keyword_not_need:
        f.write('{}\n'.format(i))

### 将news_data_save中的需要剔除的关键词剔除

In [90]:
def filter_keyword(keywords):
    keywords = keywords.split()
    keywords = np.setdiff1d(np.array(keywords), np.array(keyword_not_need))
    return ' '.join(keywords)

news_data_save['新闻关键词'] = news_data_save['新闻关键词'].map(filter_keyword)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# 2. 对新闻关键词计算权值、排序，并制作词云图所需数据

In [91]:
# 考虑到每条新闻可能存在多个关键词，如果直接对关键词进行计数，会使得一些新闻关键词的权值降低，不能更好的反映这个关键词的重要性
# 因此我们设定每条新闻的关键词的总权值为1，每条新闻中每个关键词的权值为1/该条新闻的关键词数量
# 另外，按照日期进行关键词统计
date_num = news_data_save['新闻发布日期'].unique()
news_data_save['关键词长度'] = news_data_save['新闻关键词'].apply(lambda x: len(x.split()))
keywords_dict_date = {}
for news_date in date_num:
    keywords_dict_date[news_date] = {}
    news_data_day = news_data_save[news_data_save['新闻发布日期']==news_date]
    for i in range(len(news_data_day['新闻关键词'])):
        keywords = news_data_day['新闻关键词'].iloc[i].split()
        keyword_len = news_data_day['关键词长度'].iloc[i]
        for keyword in keywords:
            keywords_dict_date[news_date][keyword] = keywords_dict_date[news_date].get(keyword, 0) + 1/keyword_len

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [92]:
# 对每个日期的关键词按照权重进行倒序排序
for news_date in keywords_dict_date.keys():
    keywords_dict_date[news_date] = sorted(keywords_dict_date[news_date].items(), 
                                           key=lambda d: d[1], reverse=True)

In [93]:
# 由于每个日期的每个关键词的权重都不一样，需要进行归一化操作
def normalization_keyword(keyword_lst):
    # 以最大值作为1值，其他值 / 最大值 * 10
    rank_1 = keyword_lst[0][1]
    keyword_lst_norm = []
    for i in range(len(keyword_lst)):
        keyword_lst_norm.append((keyword_lst[i][0], keyword_lst[i][1] / rank_1 * 10))
    return keyword_lst_norm

for news_date in keywords_dict_date.keys():
    keywords_dict_date[news_date] = normalization_keyword(keywords_dict_date[news_date])

In [97]:
# 仅提取每天前10个权重值的关键词进行保存，以便于绘制词云图
top_k=10
keywords_dict_date_topk = {}
for news_date in keywords_dict_date.keys():
    keywords_dict_date_topk[news_date] = [{'name': data[0], 'value': data[1]} for data in keywords_dict_date[news_date][:top_k]]

with open('./report/echarts图表/js/news_keywords_top{}.js'.format(top_k), 'w', encoding='utf8') as f:
    f.write('const keywords = ')
    f.write(str(keywords_dict_date_topk))

In [105]:
# 提取日期，以便于绘制词云图
with open('./report/echarts图表/js/news_date.js', 'w', encoding='utf8') as f:
    f.write('const news_date = ')
    f.write(str(sorted(list(keywords_dict_date.keys()))))

- 可以看到在2020-01-19之前，新闻关键词基本以伊朗、特朗普为主，说明当时的新闻以国际形势为主
- 但是从2020-01-19开始，关于疫情、湖北、新冠的关键词开始慢慢增多，并开始占据较大范围
- 通过新闻关键词，深刻反映了那段时期，中国新闻报道的主流趋势
- 具体见`report/echarts图表/新闻文本关键词词云.html`
![](report/gif/新闻文本关键词词云.gif)