In [3]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer  # 使用SnowballStemmer处理中文
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF vectorizer built with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

# Snowball stemmer configured for English.
# NOTE(review): the reports appear to be Chinese text; an English stemmer is
# presumably a no-op on such tokens -- confirm whether stemming is wanted at all.
snowball = SnowballStemmer(language='english')

# Make sure the NLTK "punkt" tokenizer data used by word_tokenize is available.
nltk.download('punkt')

# Raw text of every report, one list entry per .txt file found in 'reports'.
corpus = []

# os.listdir() returns names in arbitrary order, so sort them: the position of
# a file in `corpus` determines the "document N" label printed later, and it
# must be reproducible from run to run and machine to machine.
for filename in sorted(os.listdir('reports')):
    if not filename.endswith(".txt"):
        continue  # ignore anything that is not a plain-text report
    with open(os.path.join('reports', filename), 'r', encoding='utf-8') as file:
        text = file.read()
    corpus.append(text)

# Load the stop-word list: one entry per line, surrounding whitespace removed.
# Blank lines are skipped so the empty string never ends up in the set.
chinese_stop_words = set()
with open('baidu_stopwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        entry = line.strip()
        if entry:
            chinese_stop_words.add(entry)

# Preprocess every document: tokenize, drop stop words and punctuation, stem,
# then re-join the surviving tokens with spaces for the TF-IDF vectorizer.
# NOTE(review): word_tokenize is not a Chinese word segmenter -- the printed
# results contain whole clauses as single "words". Consider a dedicated
# Chinese segmenter if word-level keywords are required.
processed_corpus = []
for document in corpus:
    # Tokenize the lower-cased text.
    words = word_tokenize(document.lower())
    # Keep purely alphanumeric tokens (this drops punctuation) that are not stop words.
    words = [word for word in words if word.isalnum() and word not in chinese_stop_words]
    # Stemming with the English stemmer; per the original author it is not
    # needed for Chinese and is kept only as a demonstration step.
    words = [snowball.stem(word) for word in words]
    processed_corpus.append(' '.join(words))
    
# Fit the vectorizer on the preprocessed documents and get their TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

# Print, for each document, its ten highest-weighted terms.
feature_names = tfidf_vectorizer.get_feature_names_out()
for doc_number in range(1, len(processed_corpus) + 1):
    print(f"Top words in document {doc_number}:")
    # Slice the document's row once; .indices/.data hold its non-zero entries.
    row = tfidf_matrix[doc_number - 1]
    weighted_terms = [
        (feature_names[column], weight)
        for column, weight in zip(row.indices, row.data)
    ]
    # Stable descending sort by weight, so ties keep their original order.
    weighted_terms.sort(key=lambda pair: pair[1], reverse=True)
    for word, score in weighted_terms[:10]:
        print(f"\tWord: {word}, TF-IDF: {score:.4f}")


Top words in document 1:
	Word: 营改增, TF-IDF: 0.3440
	Word: 三公, TF-IDF: 0.3259
	Word: 三农, TF-IDF: 0.2403
	Word: 约法三章, TF-IDF: 0.2403
	Word: 舌尖上的安全, TF-IDF: 0.2023
	Word: 零就业, TF-IDF: 0.2023
	Word: 三个1亿人, TF-IDF: 0.2023
	Word: 空心村, TF-IDF: 0.2023
	Word: 四两拨千斤, TF-IDF: 0.2023
	Word: 三网融合, TF-IDF: 0.2023
Top words in document 2:
	Word: 十二五, TF-IDF: 0.3251
	Word: 营改增, TF-IDF: 0.2787
	Word: 三农, TF-IDF: 0.1947
	Word: 约法三章, TF-IDF: 0.1947
	Word: 三公, TF-IDF: 0.1760
	Word: 左右和5, TF-IDF: 0.1639
	Word: 草根, TF-IDF: 0.1639
	Word: 三个支撑带, TF-IDF: 0.1639
	Word: 独角戏, TF-IDF: 0.1639
	Word: 三网, TF-IDF: 0.1639
Top words in document 3:
	Word: 十三五, TF-IDF: 0.3986
	Word: 一带一路, TF-IDF: 0.3311
	Word: 三大战略, TF-IDF: 0.2726
	Word: 十二五, TF-IDF: 0.2703
	Word: 三严三实, TF-IDF: 0.2317
	Word: 三农, TF-IDF: 0.1619
	Word: 我们将全面准确贯彻, TF-IDF: 0.1363
	Word: 七五, TF-IDF: 0.1363
	Word: 和8, TF-IDF: 0.1363
	Word: 海外仓, TF-IDF: 0.1363
Top words in document 4:
	Word: 双创, TF-IDF: 0.5095
	Word: 一带一路, TF-IDF: 0.2774
	Word: 五险一金, TF-IDF: 0.

[nltk_data] Downloading package punkt to
[nltk_data]     E:\GameDownload\Anaconda\envs\personal\share\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
