In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset


class FeatureNameDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path, sep='_!_', header=None, engine='python', names=['id','feature_id', 'feature_name', 'content'])
        self.data = self.data.drop_duplicates(subset=['feature_id'])
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature_id = self.data.iloc[idx, 0]
        feature_name = self.data.iloc[idx, 1]

        sample = {'feature_id': feature_id, 'feature_name': feature_name}

        if self.transform:
            sample = self.transform(sample)

        return sample

class FeatureContentDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path, sep='_!_', header=None, engine='python', names=['id','feature_id', 'feature_name', 'content'])
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature_id = self.data.iloc[idx, 0]
        content = self.data.iloc[idx, 2]

        sample = {'feature_id': feature_id, 'content': content}

        if self.transform:
            sample = self.transform(sample)

        return sample

# 示例使用示范
# 用"_!_"作为分隔符读取CSV文件
file_path = '/home/liyi/gpt/data/toutiao_cat_data.txt'  # 请替换为您的文件路径
feature_name_dataset = FeatureNameDataset(file_path)

# 创建特征和内容对应关系的数据集
feature_content_dataset = FeatureContentDataset(file_path)

# 获取特征名称对应关系数据集的大小
print("Feature Name Dataset size:", len(feature_name_dataset))

# 获取特征和内容对应关系数据集的大小
print("Feature Content Dataset size:", len(feature_content_dataset))

# 获取第一个特征名称对应关系数据集的样本
first_feature_name_sample = feature_name_dataset[0]
print("First Feature Name Sample:", first_feature_name_sample)

# 获取第一个特征和内容对应关系数据集的样本
first_feature_content_sample = feature_content_dataset[0]
print("First Feature Content Sample:", first_feature_content_sample)

Feature Name Dataset size: 15
Feature Content Dataset size: 382688
First Feature Name Sample: {'feature_id': 101, 'feature_name': 'news_culture'}
First Feature Content Sample: {'feature_id': 101, 'content': '京城最值得你来场文化之旅的博物馆'}


In [2]:
import jieba

def tokenize_chinese(text):
    return list(jieba.cut(text))

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

contents = [item['content'] for item in list(feature_content_dataset)]
vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese,max_features=1000)
tfidf = vectorizer.fit_transform(contents)


Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.454 seconds.
Prefix dict has been built successfully.


In [26]:
print(tfidf)

  (0, 729)	0.1919137926760878
  (0, 583)	0.6153766991735234
  (0, 201)	0.31304810965594043
  (0, 210)	0.5559098773003782
  (0, 618)	0.4212430458247339
  (1, 614)	0.5265383901465521
  (1, 754)	0.494093328346405
  (1, 343)	0.4445560145181995
  (1, 998)	0.25620815290956606
  (1, 345)	0.35055759673965003
  (1, 629)	0.2620609268229087
  (1, 729)	0.15423968660708226
  (2, 101)	0.35335662904173265
  (2, 434)	0.31793793834985284
  (2, 494)	0.29989296330276566
  (2, 63)	0.46513285000784294
  (2, 989)	0.5279296132114248
  (2, 996)	0.21801750084173982
  (2, 98)	0.35194877880244796
  (2, 998)	0.1328945160486272
  (3, 133)	0.39352412509098195
  (3, 936)	0.4477566548963821
  (3, 808)	0.4551460815536073
  (3, 149)	0.23376519947962657
  (3, 552)	0.5301782562850105
  :	:
  (382684, 403)	0.3329202409205295
  (382684, 995)	0.14526184979639367
  (382684, 998)	0.1518670293670059
  (382685, 869)	0.547315786998123
  (382685, 608)	0.511180541894453
  (382685, 297)	0.40192960644288533
  (382685, 60)	0.27814594

In [12]:
import re

# 获取词袋模型中的所有词语  
words_dict = {}
feature_names = vectorizer.get_feature_names()
    
# 定义一个正则表达式，用于匹配中文字符
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')

# 过滤出仅包含中文字符的特征名称
word = [word for word in feature_names if chinese_char_pattern.fullmatch(word)]

    # 将tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重 

    # 打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for遍历某一类文本下的词语权重  
for i in range(1000):
    for j in range(len(word)):  
        if word[j] in words_dict:
            words_dict[word[j]] += tfidf[i,j]
        else:
            words_dict[word[j]] = tfidf[i,j]


In [29]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (520 kB)
[K     |████████████████████████████████| 520 kB 25 kB/s eta 0:00:01
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3


In [15]:
from PIL import Image, ImageSequence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

def showCloud(wordDict, filename):
    # 根据图片创建 graph 为 nd-array 类型

    # 创建 wordcloud 对象，背景图片为 graph，背景色为白色
    
    wc = WordCloud(background_color = 'white', font_path='/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf')
    # 生成词云
    wc.generate_from_frequencies(wordDict)
    # 根据 graph 生成颜色
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color)) #对词云重新着色
    plt.axis('off')
    # 显示词云图，并保存为 jpg 文件
    #plt.show()
    wc.to_file(filename + ".jpg")
    plt.clf()
    
showCloud(words_dict, "xiamen_tfidf")

ValueError: Only supported for TrueType fonts

In [14]:
print(words_dict)

{'一': 22.77738534707426, '一下': 3.697136745872294, '一个': 0.7237948557091609, '一些': 5.86659091111191, '一位': 0.0, '一句': 1.4967367939314848, '一场': 3.3939646194497475, '一天': 5.189383423481331, '一定': 3.6022199523634075, '一年': 0.8012315526992602, '一怒之下': 0.0, '一样': 1.413886750282566, '一次': 0.3456655651711078, '一款': 0.0, '一点': 0.0, '一直': 1.394746379917252, '一种': 1.631725165722849, '一般': 0.7263852183932527, '一起': 2.8351339217337888, '万': 0.88323210065083, '万元': 3.7075820508456685, '三': 2.2831434795826797, '三个': 1.7440394666548051, '上': 1.4261130259318233, '上市': 8.416455292177544, '上榜': 5.719278165702857, '上海': 0.0, '上涨': 3.3111168214105975, '上港': 0.9251587489585834, '上线': 7.481767603356363, '上联': 1.9555418372244477, '上课时': 1.3834258939728021, '下': 0.0, '下联': 4.1548052849413954, '不': 0.9707320469532705, '不会': 4.228976686095899, '不值钱': 1.0148100479323687, '不停': 2.83918115302614, '不再': 0.4997501848117357, '不到': 1.98041922172226, '不同': 0.7793468402777795, '不好': 0.0, '不如': 0.5983469232534919, '不敢': 