In [1]:
import math
import jieba
from io import BytesIO
import base64
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Use the SimHei font for sans-serif text (presumably so Chinese labels render
# in matplotlib figures -- confirm the font is installed on the target machine).
plt.rcParams['font.sans-serif'] = [u'SimHei']
# Turn off matplotlib's Unicode minus-sign setting for axes.
plt.rcParams['axes.unicode_minus'] = False
# Font file handed to WordCloud in gen_wordcloud_and_html.
# NOTE(review): absolute, Windows-specific path; it will not resolve on other platforms.
font_path = "C:/Windows/Fonts/simfang.ttf"  

In [2]:
# Read one report file.
def read_txt(year):
    """Return the full text of the government work report for ``year``.

    The report is read as UTF-8 from ``reports/<year>年政府工作报告.txt``,
    relative to the current working directory.
    """
    report_path = 'reports/' + str(year) + '年政府工作报告.txt'
    with open(report_path, 'r', encoding='utf-8') as report_file:
        return report_file.read()

In [3]:
# Tokenize every report; the result is a list of token lists, one per report.
def content_cut(start_year=2015, end_year=2025):
    """Segment each yearly report into tokens with jieba.

    Args:
        start_year: First report year to load (inclusive). Defaults to 2015.
        end_year: Year at which to stop (exclusive, as in ``range``).
            Defaults to 2025, i.e. ten reports with the default start year.

    Returns:
        A list with one entry per year in ``range(start_year, end_year)``;
        each entry is the token list returned by ``jieba.lcut`` for that
        year's report text.
    """
    # Load the project's user dictionary before segmenting.
    jieba.load_userdict('self_userdict.txt')
    return [jieba.lcut(read_txt(year)) for year in range(start_year, end_year)]

In [4]:
# Load the base stop words (the original note says this is Baidu's stop-word table).
def stopwords_load():
    """Read ``baidu_stopwords.txt`` and return its lines as a set.

    Returns:
        A set containing every line of the file with surrounding whitespace
        stripped (a blank line therefore contributes the empty string).
    """
    with open('baidu_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
        return {entry.strip() for entry in stopword_file.readlines()}

In [5]:
# Extend the stop words with a hand-maintained list that fills gaps in the base table.
def stopwords_add(stopwords):
    """Add every line of ``self_stopwords.txt`` to ``stopwords``.

    Args:
        stopwords: Set of stop words; it is modified in place.

    Returns:
        The same set object that was passed in, now including the stripped
        lines of ``self_stopwords.txt``.
    """
    with open('self_stopwords.txt', 'r', encoding='utf-8') as extra_file:
        extra_lines = extra_file.readlines()
    stopwords.update(entry.strip() for entry in extra_lines)
    return stopwords

In [6]:
# Drop stop words and count term frequencies, one dict per report.
def form_dict_list(txt_content_cuts_all):
    """Build a term-frequency dict for each tokenized report.

    Args:
        txt_content_cuts_all: List of token lists, one per report.

    Returns:
        A list of dicts in the same order as the input; each dict maps a
        token that is not a stop word to the number of times it occurs in
        that report.
    """
    excluded = stopwords_add(stopwords_load())
    # Also drop newline, space and en-space (U+2002) tokens.
    excluded.update(('\n', ' ', '\u2002'))

    keywords_dict_list = []
    for tokens in txt_content_cuts_all:
        frequencies = {}
        for token in tokens:
            if token in excluded:
                continue
            frequencies[token] = frequencies.get(token, 0) + 1
        keywords_dict_list.append(frequencies)
    return keywords_dict_list

In [7]:
# Sort each frequency dict by count, descending; every dict becomes a list of tuples.
# Historical note from the author: this step exists because the assignment was
# first misread as "sort by frequency". It turned the per-report dicts into
# lists of (word, count) tuples, and the later steps were written against that
# tuple layout, so removing it was judged too much trouble.
def sorted_by_value(keywords_dict_list):
    """Convert each frequency dict into a list of pairs ordered by count.

    Args:
        keywords_dict_list: List of ``{word: count}`` dicts.

    Returns:
        A list of lists of ``(word, count)`` tuples, highest count first.
        Words with equal counts keep their dict order.
    """
    def frequency_of(pair):
        return pair[1]

    return [
        sorted(frequencies.items(), key=frequency_of, reverse=True)
        for frequencies in keywords_dict_list
    ]

In [8]:
# Vocabulary: the de-duplicated set of words across all reports.
def keywords_sum(keywords_dict_list):
    """Collect every distinct word that appears in any report.

    Args:
        keywords_dict_list: List of per-report lists of ``(word, count)``
            tuples (despite the name, the entries are indexed as sequences).

    Returns:
        A set holding the first element of every tuple.
    """
    return {
        entry[0]
        for entries in keywords_dict_list
        for entry in entries
    }

In [9]:
# Count how many reports contain a given word.
def count_word_occurrences(keywords_dict_list, keyword):
    """Count the entries whose word equals ``keyword`` across all reports.

    Args:
        keywords_dict_list: List of per-report lists of ``(word, count)`` tuples.
        keyword: The word to look for.

    Returns:
        The number of tuples whose first element equals ``keyword``. When
        each report lists a word at most once this is the number of reports
        that contain the word.
    """
    return sum(
        1
        for entries in keywords_dict_list
        for entry in entries
        if entry[0] == keyword
    )

In [10]:
# For every word in the vocabulary, count how many reports contain it.
def generate_word_count_dict(keywords_dict_list, keyword_set):
    """Map each word in ``keyword_set`` to its number of occurrences as an entry.

    The counts are gathered in a single pass over the reports instead of
    rescanning every report once per vocabulary word, which was quadratic in
    the vocabulary size. The resulting values are unchanged.

    Args:
        keywords_dict_list: List of per-report lists of ``(word, count)`` tuples.
        keyword_set: Words to report on.

    Returns:
        A dict with exactly the words of ``keyword_set`` as keys. Each value
        is the number of tuples across all reports whose first element is
        that word (0 if it never appears).
    """
    word_count_dict = dict.fromkeys(keyword_set, 0)
    for entries in keywords_dict_list:
        for entry in entries:
            word = entry[0]
            # Words outside keyword_set are ignored, matching the original output.
            if word in word_count_dict:
                word_count_dict[word] += 1
    return word_count_dict

In [11]:
# Compute TF-IDF for every word of every report.
def tf_idf_calculate():
    """Run the pipeline and return per-report TF-IDF scores, scaled by 1000.

    For a word ``w`` in report ``d``::

        score = 1000 * (count of w in d / total counted words in d)
                     * ln(number of reports / number of reports containing w)

    The term-frequency denominator is the total number of counted word
    occurrences in the report. It was previously the number of *distinct*
    words, which is not a term frequency; the change rescales each report's
    scores by a constant, so the ordering within a report is unaffected.

    Returns:
        A list with one ``{word: score}`` dict per report, in report order.
    """
    keywords_dict_list = sorted_by_value(form_dict_list(content_cut()))
    keyword_set = keywords_sum(keywords_dict_list)
    word_count_dict = generate_word_count_dict(keywords_dict_list, keyword_set)

    report_total = len(keywords_dict_list)
    tf_idf_list = []
    for entries in keywords_dict_list:
        # Total occurrences of all counted words in this report (loop-invariant).
        total_terms = sum(count for _, count in entries)
        tf_idf = {}
        for keyword, keyword_count in entries:
            idf = math.log(report_total / word_count_dict[keyword])
            tf_idf[keyword] = 1000 * (keyword_count / total_terms) * idf
        tf_idf_list.append(tf_idf)
    return tf_idf_list

In [12]:
# Sort each report's words by TF-IDF score.
def sort_tf_idf_list(tf_idf_list):
    """Order every report's scores from highest to lowest.

    Args:
        tf_idf_list: List of ``{word: score}`` dicts.

    Returns:
        A list of lists of ``(word, score)`` tuples, highest score first.
        Words with equal scores keep their dict order.
    """
    def score_of(pair):
        return pair[1]

    return [
        sorted(scores.items(), key=score_of, reverse=True)
        for scores in tf_idf_list
    ]

In [13]:
# Pick the keywords of each report: the words scoring above a per-report cut-off.
def find_keywords(sorted_tf_idf_list, ratio=0.015):
    """Keep, for every report, the words whose score exceeds that report's cut-off.

    The cut-off is the score found at position ``int(len(report) * ratio)``
    of the report's descending list; only strictly larger scores are kept,
    so words tied with the cut-off score are dropped. (The author notes the
    ratio was originally 1% but that yielded too few words.)

    Each report is compared against its own cut-off. Previously the index
    used to look up the cut-off was reset on every iteration, so all reports
    were filtered with the first report's threshold.

    Args:
        sorted_tf_idf_list: List of per-report lists of ``(word, score)``
            tuples, each sorted by score in descending order.
        ratio: Fraction of the report length that locates the cut-off entry.
            Defaults to 0.015.

    Returns:
        A list of lists of ``(word, score)`` tuples, one per report. An
        empty report yields an empty list.
    """
    result = []
    for sublist in sorted_tf_idf_list:
        if not sublist:
            # Nothing to rank; avoid indexing into an empty list.
            result.append([])
            continue
        # Clamp so an out-of-range ratio cannot index past either end.
        cutoff_index = min(max(int(len(sublist) * ratio), 0), len(sublist) - 1)
        threshold = sublist[cutoff_index][1]
        result.append([item for item in sublist if item[1] > threshold])
    return result

In [14]:
# Turn each keyword list back into a dict, the form the word cloud expects.
def list_to_dict(result):
    """Convert per-report keyword tuples into per-report dicts.

    Args:
        result: List of per-report lists of ``(word, score)`` tuples.

    Returns:
        A list of ``{word: score}`` dicts in the same order as ``result``.
    """
    return [
        {word: freq for word, freq in pairs}
        for pairs in result
    ]

In [15]:
# Render one word cloud per report, embed it as base64 PNG, and write an HTML page.
def gen_wordcloud_and_html(word_freq_list, start_year=2015, output_path='wordclouds.html'):
    """Write an HTML report with a word cloud and keyword list for each year.

    Args:
        word_freq_list: List of ``{word: score}`` dicts, one per report, in
            chronological order.
        start_year: Year that the first entry of ``word_freq_list`` is
            labelled with. Defaults to 2015.
        output_path: File to write. Defaults to ``'wordclouds.html'``.

    The page is written as UTF-8 and declares that charset, so the Chinese
    headings and keywords do not depend on the platform's default encoding.
    Keyword text is HTML-escaped before being embedded.
    """
    from html import escape  # stdlib; imported locally so the block is self-contained

    parts = ['<html><head><meta charset="utf-8"></head><body>']

    for offset, word_freq in enumerate(word_freq_list):
        year = start_year + offset
        parts.append(f'<h2>{year}年词云图</h2>')

        # WordCloud raises ValueError when given no words, so a report with
        # no keywords gets its heading and (empty) list but no image.
        if word_freq:
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                font_path=font_path,
            ).generate_from_frequencies(word_freq)

            img_buffer = BytesIO()
            wordcloud.to_image().save(img_buffer, format='PNG')
            img_str = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
            parts.append(f'<img src="data:image/png;base64,{img_str}" alt="{year}年词云图">')

        keywords_text = '\n'.join(f"{word}: {score}" for word, score in word_freq.items())
        parts.append('<p style="font-size: 20px">关键词和TF-IDF值(扩大一千倍)：</p>')
        parts.append(f'<pre>{escape(keywords_text)}</pre>')

    parts.append('</body></html>')

    with open(output_path, 'w', encoding='utf-8') as html_file:
        html_file.write(''.join(parts))

In [16]:
# Run the full pipeline (read, tokenize, filter, count) and score every word per report.
tf_idf_list = tf_idf_calculate()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\47226\AppData\Local\Temp\jieba.cache
Loading model cost 0.783 seconds.
Prefix dict has been built successfully.


In [17]:
# Order each report's words by TF-IDF score, highest first.
sorted_tf_idf_list = sort_tf_idf_list(tf_idf_list)

In [18]:
# Sanity check: number of scored words in the first report.
len(sorted_tf_idf_list[0])

2469

In [19]:
# Keep only the words whose score is above the cut-off.
result = find_keywords(sorted_tf_idf_list)

In [20]:
# Sanity check: number of keywords kept for the first report.
len(result[0])

19

In [21]:
# Convert the keyword tuples into {word: score} dicts for the word cloud.
word_freq_list = list_to_dict(result)

In [22]:
# Sanity check: the dict for the first report should have as many entries as its keyword list.
len(word_freq_list[0])

19

In [23]:
# Render the word clouds and write wordclouds.html to the working directory.
gen_wordcloud_and_html(word_freq_list)