In [1]:
import os
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def get_stopwords(stopword_path='stopwords.txt'):
    """Load the stop-word list from a GBK-encoded text file.

    Args:
        stopword_path: Path of the stop-word file, one entry per line.

    Returns:
        list[str]: One entry per line of the file, in file order, with
        leading/trailing whitespace removed. Blank lines are kept as
        empty strings.
    """
    entries=[]
    with open(stopword_path,mode='r',encoding='GBK') as handle:
        # Iterating the handle yields the same lines as readlines().
        for raw_line in handle:
            entries.append(raw_line.strip())
    return entries

def read_documents(folder_path='TFIDF',stopwords=None):
    """Read every ``.txt`` file in a folder as a space-joined string of jieba tokens.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose
            name ends in ``.txt``. Files are decoded as UTF-8.
        stopwords: Optional iterable of words to drop from every document.
            ``None`` (the default) disables stop-word filtering.

    Returns:
        tuple[list[str], list[str]]: ``(documents, filenames)`` as two
        parallel lists ordered by file name. Each document is the file's
        jieba tokens joined by single spaces, with whitespace-only tokens
        and stop words removed.
    """
    # Build a set once: membership tests become O(1), one-shot iterables are
    # consumed safely, and the ``None`` default no longer raises TypeError.
    stopword_set=set(stopwords) if stopwords is not None else set()

    documents=[]
    filenames=[]

    # os.listdir() returns entries in arbitrary order; sort so the report is
    # reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path=os.path.join(folder_path,filename)
        with open(file_path,'r',encoding='utf-8') as f:
            content=f.read()
        # The file is already closed here; tokenising needs only the text.
        filtered_words=[
            word for word in jieba.cut(content)
            if word.strip() and word not in stopword_set
        ]
        documents.append(' '.join(filtered_words))
        filenames.append(filename)
    return documents,filenames

In [None]:
# Number of highest-scoring terms reported per document.
TOP_N=50


def top_tfidf_terms(feature_names,scores,top_n=TOP_N):
    """Return the highest-scoring terms of one document.

    Args:
        feature_names: Sequence of vocabulary terms.
        scores: Sequence of scores, parallel to ``feature_names``.
        top_n: Maximum number of terms to return.

    Returns:
        list[tuple]: Up to ``top_n`` ``(term, score)`` pairs in descending
        score order, keeping only pairs whose score is greater than zero.
        Terms with equal scores keep their vocabulary order (the sort is
        stable).
    """
    ranked=sorted(zip(feature_names,scores),key=lambda pair: pair[1],reverse=True)
    return [(term,score) for term,score in ranked[:top_n] if score>0]


if __name__=='__main__':
    stopwords=get_stopwords()

    documents,filenames=read_documents(stopwords=stopwords)

    if not documents:
        print('未找到任何txt文件，请检查文件路径！')
    else:
        # NOTE: TfidfVectorizer's default token_pattern keeps only tokens of
        # two or more word characters, so single-character words produced by
        # jieba are dropped from the vocabulary.
        tfidf_vectorizer=TfidfVectorizer()
        tfidf_matrix=tfidf_vectorizer.fit_transform(documents)
        feature_names=tfidf_vectorizer.get_feature_names_out()

        for i,filename in enumerate(filenames):
            # Leading '\n' separates the per-document sections; the original
            # literal 'n' was a missing backslash.
            print(f'\n===== 文档{filename}的Top{TOP_N} TF-IDF词=====')

            # Row i of the sparse matrix, flattened to a 1-D score vector
            # aligned with feature_names.
            tfidf_scores=tfidf_matrix[i].toarray().flatten()
            for word,score in top_tfidf_terms(feature_names,tfidf_scores):
                print(f'{word}:{score:.4f}')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Lenovo\AppData\Local\Temp\jieba.cache


Loading model cost 0.925 seconds.
Prefix dict has been built successfully.


n===== 文档古代言情.txt的Top TF-IDF词=====
玫瑰:0.4119
666:0.3735
鼓掌:0.2535
得不到:0.2190
重生:0.2175
爱心:0.2016
微笑:0.1805
皇帝:0.1795
女主:0.1781
和亲:0.1774
流泪:0.1479
感谢:0.1431
男人:0.1402
好看:0.1378
短剧:0.1335
摄政王:0.1298
金币:0.1143
结局:0.1066
喜欢:0.1008
广告:0.0970
皇上:0.0943
将军:0.0930
男主:0.0922
公主:0.0898
发呆:0.0806
剧情:0.0802
孩子:0.0782
景甜:0.0703
想要:0.0667
不错:0.0658
永远:0.0610
两个:0.0605
加油:0.0552
裴时安:0.0541
可汗:0.0519
一圈:0.0517
一世:0.0502
红旗:0.0480
皇叔:0.0476
骚动:0.0465
穿越:0.0461
漂亮:0.0456
霜儿:0.0443
一集:0.0442
抱抱:0.0442
真的:0.0408
妹妹:0.0404
王爷:0.0404
发怒:0.0403
太后:0.0381
n===== 文档奇幻.txt的Top TF-IDF词=====
666:0.5677
玫瑰:0.5552
短剧:0.2626
鼓掌:0.2341
爱心:0.1986
好看:0.1878
感谢:0.1779
加油:0.1198
剧情:0.1133
喜欢:0.0787
推荐:0.0774
值得:0.0708
不错:0.0679
画面:0.0631
老祖宗:0.0617
这部:0.0554
一集:0.0535
真的:0.0521
制作:0.0511
停不下来:0.0498
精彩:0.0490
广告:0.0482
津津有味:0.0455
少帅:0.0430
评论:0.0428
雪球:0.0391
龙君:0.0365
情节:0.0363
互动:0.0347
金子:0.0332
救人:0.0320
欲罢不能:0.0297
能量:0.0295
为国争光:0.0295
每一集:0.0286
紧凑:0.0286
演员:0.0284
精良:0.0284
塑造:0.0278
下边:0.0276
充满:0.0274
演技:0.02