In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset


class FeatureNameDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path, sep='_!_', header=None, engine='python', names=['id','feature_id', 'feature_name', 'content'])
        self.data = self.data.drop_duplicates(subset=['feature_id'])
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature_id = self.data.iloc[idx, 0]
        feature_name = self.data.iloc[idx, 1]

        sample = {'feature_id': feature_id, 'feature_name': feature_name}

        if self.transform:
            sample = self.transform(sample)

        return sample

class FeatureContentDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path, sep='_!_', header=None, engine='python', names=['id','feature_id', 'feature_name', 'content'])
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature_id = self.data.iloc[idx, 0]
        content = self.data.iloc[idx, 2]

        sample = {'feature_id': feature_id, 'content': content}

        if self.transform:
            sample = self.transform(sample)

        return sample

# 示例使用示范
# 用"_!_"作为分隔符读取CSV文件
file_path = '/home/liyi/gpt/data/toutiao_cat_data.txt'  # 请替换为您的文件路径
feature_name_dataset = FeatureNameDataset(file_path)

# 创建特征和内容对应关系的数据集
feature_content_dataset = FeatureContentDataset(file_path)

# 获取特征名称对应关系数据集的大小
print("Feature Name Dataset size:", len(feature_name_dataset))

# 获取特征和内容对应关系数据集的大小
print("Feature Content Dataset size:", len(feature_content_dataset))

# 获取第一个特征名称对应关系数据集的样本
first_feature_name_sample = feature_name_dataset[0]
print("First Feature Name Sample:", first_feature_name_sample)

# 获取第一个特征和内容对应关系数据集的样本
first_feature_content_sample = feature_content_dataset[0]
print("First Feature Content Sample:", first_feature_content_sample)

Feature Name Dataset size: 15
Feature Content Dataset size: 382688
First Feature Name Sample: {'feature_id': 101, 'feature_name': 'news_culture'}
First Feature Content Sample: {'feature_id': 101, 'content': '京城最值得你来场文化之旅的博物馆'}


In [2]:
from sklearn.model_selection import train_test_split

train_dataset, test_dataset = train_test_split(feature_content_dataset, test_size=0.2, random_state=42)

# 获取训练集和测试集的大小
print("Train Dataset size:", len(train_dataset))
print("Test Dataset size:", len(test_dataset))

Train Dataset size: 306150
Test Dataset size: 76538


In [3]:
import jieba

def tokenize_chinese(text):
    return list(jieba.cut(text))


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese)
contents = [item['content'] for item in train_dataset]
vectorizer.fit_transform(contents)
vectorizer.transform('京城最值得你来场文化之旅的博物馆')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.306 seconds.
Prefix dict has been built successfully.


ValueError: Iterable over raw text documents expected, string object received.

In [5]:
print(vectorizer.transform(['京城最值得你来场文化之旅的博物馆']))

  (0, 107134)	0.09505972920299476
  (0, 87647)	0.5114845895888899
  (0, 84660)	0.208804151466844
  (0, 79887)	0.30393638164872777
  (0, 39380)	0.37424822892911064
  (0, 27859)	0.2765131215118262
  (0, 26506)	0.15514004706645676
  (0, 22345)	0.4650000889643618
  (0, 19370)	0.3695347136224028


In [22]:
from torch.utils.data import DataLoader
import numpy as np 

# this collate function gets list of batch_size tuples, and needs to 
# return a pair of label-feature tensors for the whole minibatch
def bowify(b):
    contents = [item['content'] for item in b] 
    return (
            torch.LongTensor([t['feature_id']-100 for t in b]),
            torch.tensor(vectorizer.transform(contents).toarray(), dtype=torch.float32)
    )

train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=bowify, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=bowify, shuffle=True)
input_len = len(vectorizer.get_feature_names())

In [21]:
first_15_samples = [train_dataset[i] for i in range(15)]
contents = [item['content'] for item in first_15_samples] 
print(contents)
result = vectorizer.transform(contents)
dense_matrix = result.toarray()
tensor = torch.tensor(dense_matrix, dtype=torch.float32)


['大学宿舍里最怕碰上这种人，简直无法容忍', '伊朗议员议会时烧毁美国国旗“处死美国”', '关西机场落地，如何快速方便的直接去京都？', '农村好多县城买房，什么原因？', '上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？', '请吃饭的孙艺珍姐姐，又在电影里狠狠虐了我一把！', '十五万块钱能入手的六款汽车，众泰SR9在列，你为哪一款疯狂？', '这豪车当初提车加价20万，国产后降10万却遭唾弃，奥迪宝马都笑了', '中国海军所有舰艇在海上遇到它都得鸣笛致敬，航母也不例外！', '烟台的特产有什么，有去过养马岛的吗？', '定了！红河县奕车“姑娘节”就在下周三！浪漫之旅穿越千年不老时光，你约吗？', '鲁HSM757、皖J65649、浙B0C809……敢在高速上做这种事？坚决严查！曝光！', '汇易资讯：国内部分油厂豆粕库存与未执行合同统计（18年第18周）', '为什么说鲁迅曾用薛宝钗影射高士奇？', '这才是亲情！她九岁摔倒致瘫，哥哥和弟弟不离不弃照顾近半个世纪']


In [23]:
net = torch.nn.Sequential(torch.nn.Linear(input_len,17),torch.nn.LogSoftmax(dim=1))
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.NLLLoss(),epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    net.train()
    total_loss,acc,count,i = 0,0,0,0
    for labels,features in dataloader:
        optimizer.zero_grad()
        out = net(features)
        loss = loss_fn(out,labels) #cross_entropy(out,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss
        _,predicted = torch.max(out,1)
        acc+=(predicted==labels).sum()
        count+=len(labels)
        i+=1
        if i%report_freq==0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count>epoch_size:
            break
    return total_loss.item()/count, acc.item()/count

In [25]:
train_epoch(net,train_loader,epoch_size=30000)

3200: acc=0.7975
6400: acc=0.8015625
9600: acc=0.808125
12800: acc=0.80953125
16000: acc=0.810875
19200: acc=0.8115104166666667
22400: acc=0.8127678571428572
25600: acc=0.8148046875
28800: acc=0.8161111111111111


(0.04906961811122609, 0.8168976545842217)

In [26]:
loss_fn = torch.nn.NLLLoss()

def test_model(net, dataloader):
    net.eval()  # 设置模型为评估模式
    total_loss, acc, count = 0, 0, 0
    with torch.no_grad():  # 禁用梯度计算
        for labels, features in dataloader:
            out = net(features)
            loss = loss_fn(out, labels)
            total_loss += loss
            _, predicted = torch.max(out, 1)
            acc += (predicted == labels).sum()
            count += len(labels)

    avg_loss = total_loss.item() / count
    accuracy = acc.item() / count
    print(f"Test Loss: {avg_loss}, Test Accuracy: {accuracy}")
    return avg_loss, accuracy

test_loss, test_accuracy = test_model(net, test_loader)

Test Loss: 0.043203184075026456, Test Accuracy: 0.820468264130236


In [29]:
input_tensor = vectorizer.transform(['没有爱，就没有教育！江门开平一幼儿园收到家长亲笔写的“情书”_!_美丽中国,幼儿园,牛津,江门,Nini,感谢信'])
dense_matrix = input_tensor.toarray()
tensor_2d = torch.tensor(dense_matrix, dtype=torch.float32)

def getFeatureName(feature_id):
    return next((item for item in feature_name_dataset if item['feature_id'] == feature_id), None)
        
with torch.no_grad():  # 禁用梯度计算
    out = net(tensor_2d)
    probabilities = torch.exp(out)
    predicted_class = torch.argmax(probabilities, dim=1)
    print(predicted_class)
    print(getFeatureName(100+predicted_class))

tensor([8])
{'feature_id': 108, 'feature_name': 'news_edu'}


In [30]:
def get_class(text):
    input_tensor = vectorizer.transform([text])
    dense_matrix = input_tensor.toarray()
    tensor_2d = torch.tensor(dense_matrix, dtype=torch.float32)
    with torch.no_grad():
        out = net(tensor_2d)
        probabilities = torch.exp(out)
        predicted_class = torch.argmax(probabilities, dim=1)
        print(predicted_class)
        print(getFeatureName(100+predicted_class))  

In [31]:
get_class('佟丽娅和陈思成离婚了')

tensor([2])
{'feature_id': 102, 'feature_name': 'news_entertainment'}


In [32]:
get_class('佟丽娅又结婚了')

tensor([2])
{'feature_id': 102, 'feature_name': 'news_entertainment'}


In [33]:
get_class('梅西又获得了金球奖')

tensor([3])
{'feature_id': 103, 'feature_name': 'news_sports'}


In [37]:
get_class('2024年的男篮世界杯开始')

tensor([3])
{'feature_id': 103, 'feature_name': 'news_sports'}
