In [2]:
%cd ..

/home/aistudio/work/NewsTitles


In [2]:
import os
import time
import paddle
import pandas as pd
import numpy as np
import paddlenlp
import paddle.nn.functional as F 
from tqdm import tqdm
from collections import defaultdict
from functools import partial
from paddle.io import Dataset, DataLoader
from paddlenlp.transformers import BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split



In [3]:
# 定义一些宏观变量

EPOCHS = 3
LEARNING_RATE = 5e-5
MODEL_NAME = "hfl/rbt4"
SAVE_PATH = './' + MODEL_NAME.split('/')[-1]
BATCH_SIZE = 1024
SAVE_FREQUENCE = 100
LOG_FREQUENCE = 20
TOTAL_SIZE = 83599
TEST_LEN1 = (TOTAL_SIZE//BATCH_SIZE)*BATCH_SIZE
TEST_SIZE = 0.10
RANDOM_STATE = 1024

## 数据统计

In [4]:
data_dict = defaultdict(int)
max_len, max_len_text = -float('INF'), ''
with open('./data/Train.txt', 'r') as f:
    for line in f.readlines():
        label, classify, title = line.strip('\n').strip(' ').split('\t')
        data_dict[classify]+=1
        if len(title) > max_len:
            max_len, max_len_text = len(title), title

print(f'最长的文本长度: {max_len}, 标题: {max_len_text}')

最长的文本长度: 48, 标题: 拉格利・希尔顿度假酒店Hilton MaldivesResort&amp;Spa Rangali


In [5]:
print('各个种类标题的数量:\n')
for key in data_dict.keys():
    print('\t', key, data_dict[key])

各个种类标题的数量:

	 财经 33389
	 彩票 6830
	 房产 18045
	 股票 138959
	 家居 29328
	 教育 37743
	 科技 146637
	 社会 45765
	 时尚 12032
	 时政 56778
	 体育 118444
	 星座 3221
	 游戏 21936
	 娱乐 83369


## 获取数据集

In [6]:
title_labels, classes, titles = [], [], []
with open('./data/Train.txt', 'r') as f:
    count, pre_class = 0, ''
    for line in f.readlines():
        label, classify, title = line.strip('\n').split('\t')
        title_labels.append(int(label))
        classes.append(classify)
        titles.append(title)

data = {'lable': title_labels,
        'class': classes,
        'title': titles}

train_data = pd.DataFrame(data)

In [7]:
train_data

Unnamed: 0,lable,class,title
0,0,财经,上证50ETF净申购突增
1,0,财经,交银施罗德保本基金将发行
2,0,财经,基金公司不裁员反扩军 走访名校揽人才
3,0,财经,基金巨亏30亿 欲打开云天系跌停自救
4,0,财经,基金市场周二缩量走低
...,...,...,...
752471,13,娱乐,胡彦斌为北京个唱彩排 现场传授减肥经(组图)
752472,13,娱乐,方大同薛凯琪拒评陈冠希 称其应尊重女性(组图)
752473,13,娱乐,美国资深记者透露迈克尔-杰克逊复出无望(图)
752474,13,娱乐,组图：小野猫妮可搭上F1总冠军 秀恩爱形影不离


In [8]:
test_title = []
with open('./data/Test.txt', 'r') as f:
    count = 0
    for line in f.readlines():
        test_title.append(line.strip('\n'))
test_data = pd.DataFrame({'title': test_title})
        

In [9]:
test_data

Unnamed: 0,title
0,北京君太百货璀璨秋色 满100省353020元
1,教育部：小学高年级将开始学习性知识
2,专业级单反相机 佳能7D单机售价9280元
3,星展银行起诉内地客户 银行强硬客户无奈
4,脱离中国的实际 强压人民币大幅升值只能是梦想
...,...
83594,Razer杯DotA精英挑战赛8月震撼登场
83595,经济数据好转吹散人民币贬值预期
83596,抵押率抵押物双控政策 刘明康支招房产贷款
83597,8000万像素 利图发布Aptus-II 12数码后背


In [10]:
title_with_labels = [(t, l) for t,l in zip(titles, title_labels)]
train_titles, val_titles = train_test_split(title_with_labels, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# 将test数据分割为两份，方便批次处理
test_data_part_1, test_data_part_2 = test_title[:TEST_LEN1], test_title[TEST_LEN1:]

In [11]:
print(len(val_titles))

75248


## 构造Dataset

In [12]:
class TextDataset(Dataset):
    def __init__(self, data,  tokenizer, max_seq_length=48, isTest=False):
        super(TextDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.isTest = isTest

    def __getitem__(self, index):
        if  not self.isTest:
            text, label = self.data[index][0], self.data[index][1]
            encoded = self.tokenizer.encode(text, max_seq_len=self.max_seq_length, pad_to_max_seq_len=True)
            input_ids, token_type_ids  = encoded['input_ids'], encoded['token_type_ids']
            return tuple([np.array(x, dtype='int64') for x in [input_ids, token_type_ids, [label]]])
        else:
            title = self.data[index]
            encoded = self.tokenizer.encode(title, max_seq_len=self.max_seq_length, pad_to_max_seq_len=True)
            input_ids, token_type_ids  = encoded['input_ids'], encoded['token_type_ids']
            return tuple([np.array(x, dtype='int64') for x in [input_ids, token_type_ids]])

    def __len__(self):
        return len(self.data)

## 数据加载器

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=14)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset, val_dataset = TextDataset(data=train_titles, tokenizer=tokenizer), TextDataset(data=val_titles, tokenizer=tokenizer)
test_dataset_part1 = TextDataset(data=test_data_part_1, tokenizer=tokenizer, isTest=True)
test_dataset_part2 = TextDataset(data=test_data_part_2, tokenizer=tokenizer, isTest=True)

[2023-04-27 13:16:00,180] [    INFO] - We are using <class 'paddlenlp.transformers.roberta.modeling.RobertaForSequenceClassification'> to load 'hfl/rbt4'.
[2023-04-27 13:16:00,187] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/hfl/rbt4/rbt4_chn_large.pdparams
W0427 13:16:00.191088 19519 gpu_context.cc:244] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0427 13:16:00.195669 19519 gpu_context.cc:272] device: 0, cuDNN Version: 8.2.
[2023-04-27 13:16:06,273] [    INFO] - We are using <class 'paddlenlp.transformers.roberta.tokenizer.RobertaChineseTokenizer'> to load 'hfl/rbt4'.
[2023-04-27 13:16:06,276] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/hfl/rbt4/vocab.txt
[2023-04-27 13:16:06,292] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/hfl/rbt4/tokenizer_config.json
[2023-04-27 13:16:06,295] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/model

In [14]:
# 定义采样器
train_batch_sampler = paddle.io.BatchSampler(train_dataset,
                                        shuffle=True,
                                        batch_size=BATCH_SIZE,
                                        )

val_batch_sampler = paddle.io.BatchSampler(val_dataset,
                                        shuffle=True,
                                        batch_size=BATCH_SIZE,
                                        )

test_batch_sampler = paddle.io.BatchSampler(test_dataset_part1,
                                            shuffle=False, 
                                            batch_size=BATCH_SIZE)
# 定义数据加载器
train_data_loader = paddle.io.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_batch_sampler,
                                        return_list=True,
                                        num_workers=4)
val_data_loader = paddle.io.DataLoader(dataset=val_dataset,
                                        batch_sampler=val_batch_sampler,
                                        return_list=True,
                                        num_workers=4)
test_data_loader = paddle.io.DataLoader(dataset=test_dataset_part1,
                                        batch_sampler=test_batch_sampler,
                                        return_list=True,
                                        num_workers=4)


## 定义评估函数

In [15]:
def evaluate(model, criterion, metric, data_loader):
    """
    Given a dataset, it evals model and computes the metric.

    Args:
        model(obj:`paddle.nn.Layer`): A model to classify texts.
        data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
        criterion(obj:`paddle.nn.Layer`): It can compute the loss.
        metric(obj:`paddle.metric.Metric`): The evaluation metric.
    """
    model.eval()
    metric.reset()
    losses = []
    for batch in tqdm(data_loader):
        input_ids, token_type_ids, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
    accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.7f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return accu

## 定义训练过程

In [None]:
# 定义优化器、损失函数和Acc计算器
optimizer = paddle.optimizer.Adam(learning_rate=LEARNING_RATE,
                            parameters=model.parameters(),
                            )
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# 调整至训练模式
model.train() 
best_acc = 0.94

for epoch in range(EPOCHS):
    print(f"epoch: {epoch + 1}, {time.ctime()}")
    start_t = time.time()
    metric.reset()
    for ind, item in enumerate(train_data_loader()):
        if ind and (not ind%SAVE_FREQUENCE):
            accu = evaluate(model, criterion, metric, val_data_loader)
            if accu > best_acc:
                best_acc = accu
                print('\t Best Acc: {:.6f}'.format(accu))
                model.save_pretrained(SAVE_PATH)
                tokenizer.save_pretrained(SAVE_PATH)
        input_ids, token_type_ids, labels = item
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)

        correct = metric.compute(probs, labels)
        batch_acc = metric.update(correct)
        acc = metric.accumulate()
        
        loss.backward()
        ave_t = (time.time() - start_t)/(ind + 1)
        extra_h = ave_t*(len(train_data_loader) - ind + 1)/3600
        if ind and (not ind%LOG_FREQUENCE):
            print(f'\t step:{ind}/{len(train_data_loader)},', 'average time: {:.4f},'.format(ave_t), 'loss: {:.6f}'.format(loss.numpy()[0]), 'Batch Acc:{:.9f}, Acc:{:.9f}'.format(batch_acc, acc))

        optimizer.step()
        optimizer.clear_grad()
        # scheduler.step()

epoch: 1, Thu Apr 27 13:16:06 2023
	 step:20/662, average time: 0.7307, loss: 1.388521 Batch Acc:0.649414062, Acc:0.436058408
	 step:40/662, average time: 0.7155, loss: 0.753680 Batch Acc:0.772460938, Acc:0.586556784
	 step:60/662, average time: 0.7104, loss: 0.573487 Batch Acc:0.829101562, Acc:0.663117956
	 step:80/662, average time: 0.7082, loss: 0.417339 Batch Acc:0.875976562, Acc:0.712456597


100%|██████████| 74/74 [00:19<00:00,  3.75it/s]


eval loss: 0.33622, accu: 0.8998379
	 step:100/662, average time: 0.8986, loss: 0.387817 Batch Acc:0.879882812, Acc:0.879882812
	 step:120/662, average time: 0.8664, loss: 0.376510 Batch Acc:0.894531250, Acc:0.893275670
	 step:140/662, average time: 0.8436, loss: 0.298052 Batch Acc:0.916992188, Acc:0.897651486
	 step:160/662, average time: 0.8263, loss: 0.308648 Batch Acc:0.907226562, Acc:0.900678791
	 step:180/662, average time: 0.8129, loss: 0.268319 Batch Acc:0.918945312, Acc:0.903380594


100%|██████████| 74/74 [00:20<00:00,  3.66it/s]


eval loss: 0.24698, accu: 0.9240246
	 step:200/662, average time: 0.9007, loss: 0.306187 Batch Acc:0.907226562, Acc:0.907226562
	 step:220/662, average time: 0.8830, loss: 0.237749 Batch Acc:0.926757812, Acc:0.917131696
	 step:240/662, average time: 0.8683, loss: 0.294690 Batch Acc:0.914062500, Acc:0.915825076
	 step:260/662, average time: 0.8560, loss: 0.221337 Batch Acc:0.939453125, Acc:0.916864114
	 step:280/662, average time: 0.8453, loss: 0.229739 Batch Acc:0.927734375, Acc:0.918680073


100%|██████████| 74/74 [00:20<00:00,  3.65it/s]


eval loss: 0.21686, accu: 0.9320646
	 step:300/662, average time: 0.9021, loss: 0.282961 Batch Acc:0.920898438, Acc:0.920898438
	 step:320/662, average time: 0.8901, loss: 0.254603 Batch Acc:0.920898438, Acc:0.922665551
	 step:340/662, average time: 0.8793, loss: 0.256955 Batch Acc:0.920898438, Acc:0.924066311
	 step:360/662, average time: 0.8698, loss: 0.232090 Batch Acc:0.926757812, Acc:0.925445056
	 step:380/662, average time: 0.8613, loss: 0.256064 Batch Acc:0.920898438, Acc:0.925431617


100%|██████████| 74/74 [00:20<00:00,  3.68it/s]


eval loss: 0.20382, accu: 0.9363837
	 step:400/662, average time: 0.9026, loss: 0.232496 Batch Acc:0.927734375, Acc:0.927734375
	 step:420/662, average time: 0.8933, loss: 0.221265 Batch Acc:0.931640625, Acc:0.930013021


## 加载和推理

In [None]:
model_dict = paddle.load(os.path.join(SAVE_PATH, 'model_state.pdparams'))
inf_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=14)
inf_model.set_dict(model_dict)

In [None]:
# 使用前先再次评估
ev_acc = evaluate(inf_model, criterion, metric, val_data_loader)
with open('./record.txt', 'w+') as f:
    f.write(MODEL_NAME + '\t' + str(ev_acc) + '\n')

In [None]:
inf_model.eval()
res = []
for input_ids, token_type_ids in tqdm(test_data_loader):
    logits = inf_model(input_ids, token_type_ids)
    curr_ind = paddle.argmax(logits, axis=1)
    res += curr_ind.numpy().tolist()

for input_ids, token_type_ids in tqdm(test_dataset_part2):
    input_ids, token_type_ids = paddle.to_tensor(input_ids.reshape(1, 48) , dtype='int64'), paddle.to_tensor(token_type_ids.reshape(1, 48) , dtype='int64')
    logits = inf_model(input_ids, token_type_ids)
    curr_ind = paddle.argmax(logits, axis=1)
    res += curr_ind.numpy().tolist()

In [None]:
%rm -rf ./result.txt
class_lis = ['财经', '彩票', '房产', '股票', '家居', '教育', '科技', '社会', '时尚', '时政', '体育', '星座', '游戏', '娱乐']
label_dict = {ind: content for ind, content in enumerate(class_lis)}
assert len(res) == 83599, '最终输出的list长度不正确，需要检查test_data是否合理划分'
with open('./result.txt', 'w') as f:
    print('推理样例：')
    for i in range(83599):
        # text = label_dict[res[i]] + '\t' + test_data.iloc[i]['title'] + '\n'
        text = label_dict[res[i]] + '\n'
        if not i%100:
            print('\t', label_dict[res[i]] + '\t' + test_data.iloc[i]['title'])
        f.write(text)

## 最终分数 88.2分