In [1]:
cd ..

/home/aistudio/work/NewsTitles


In [2]:
import os
import time
import paddle
import paddle.nn as nn
import numpy as np
import paddle.nn.functional as F
from paddle.io import BatchSampler, Dataset, DataLoader
from paddlenlp.transformers import AutoTokenizer
from utils.get_data import (train_titles, val_titles)
from utils.constant import *
from utils.dataset import TextDataset, MergeDataset
from tqdm import tqdm
import pickle




In [3]:

class MergeDataset(Dataset):
    def __init__(self, data, model_names, max_seq_len=MAX_SEQ_LEN):
        super(MergeDataset, self).__init__()
        self.data = data
        self.simple_names = [x.split('/')[-1] for x in model_names]
        self.models = {name: paddle.jit.load(os.path.join('./models', name, name)) for name in self.simple_names}
        self.tokenizer = {name.split('/')[-1]: AutoTokenizer.from_pretrained(name) for name in model_names}
        self.max_seq_len = MAX_SEQ_LEN

    def __getitem__(self, index):
        text, label = self.data[index]
        merge_tensor = []
        for name, model in self.models.items():
            encode_text = self.tokenizer[name](text, max_seq_len=self.max_seq_len, pad_to_max_seq_len=True)
            input_ids, token_type_ids = paddle.to_tensor(encode_text['input_ids'], dtype='int64'), paddle.to_tensor(encode_text['token_type_ids'], dtype='int64')
            input_ids, token_type_ids = paddle.reshape(input_ids, (1, self.max_seq_len)), paddle.reshape(token_type_ids, (1, self.max_seq_len))
            logits = self.models[name](input_ids, token_type_ids)
            merge_tensor.append(paddle.flatten(logits))

        return tuple([paddle.concat(merge_tensor), np.array([label], dtype='int64')])
    
    def __len__(self):
        return len(self.data)


In [4]:
model_list = [
                    'hfl/rbt6',
                    'ernie-3.0-mini-zh',
                    'tinybert-4l-312d-zh',
                    'hfl/rbt3',
                ]

In [5]:
def get_tensor(model_name, train_titles, val_titles):
    print(f'Processing {model_name}...')
    train, val = [], []
    simple_name = model_name.split('/')[-1]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    train_dataset, val_dataset = TextDataset(train_titles, tokenizer), TextDataset(val_titles, tokenizer)
    
    train_batch_sampler = paddle.io.BatchSampler(train_dataset,
                                        shuffle=False,
                                        batch_size=BATCH_SIZE,
                                        )

    val_batch_sampler = paddle.io.BatchSampler(val_dataset,
                                            shuffle=False,
                                            batch_size=BATCH_SIZE,
                                            )

    model = paddle.jit.load(os.path.join('./models', simple_name, simple_name))

    # 定义数据加载器
    train_data_loader = paddle.io.DataLoader(dataset=train_dataset,
                                            batch_sampler=train_batch_sampler,
                                            return_list=True,
                                            num_workers=4,
                                            )
    val_data_loader = paddle.io.DataLoader(dataset=val_dataset,
                                            batch_sampler=val_batch_sampler,
                                            return_list=True,
                                            num_workers=4,
                                            )
    start_t = time.time()
    print('Processing train data:')
    for ind, item in enumerate(train_data_loader):
        if ind and (not ind%(len(train_data_loader)//10)):
            print(f'\t{ind}/{len(train_data_loader)} has finished, average time {(time.time()-start_t)/(ind + 1)}')
        input_ids, token_type_ids, labels = item
        logits = model(input_ids, token_type_ids)
        train.append((logits.numpy().tolist(), labels.numpy().tolist()))
        # if ind == 0:
        #     break
    print(f'Finished {len(train_data_loader)} items in {time.time() - start_t} seconds.' )

    print('Processing tval data')
    start_t = time.time()
    for ind, item in enumerate(val_data_loader):
        if ind and (not ind%(len(val_data_loader)//10)):
            print(f'\t{ind}/{len(val_data_loader)} has finished, average time {(time.time()-start_t)/(ind + 1)}')
        input_ids, token_type_ids, labels = item
        logits = model(input_ids, token_type_ids)
        val.append((logits.numpy().tolist(), labels.numpy().tolist()))
        # if ind == 0:
        #     break
    print(f'Finished {len(val_data_loader)} items in {time.time() - start_t} seconds.')

    with open(f"./np_data/{simple_name + '_train.pkl'}", 'wb') as pkl:
        pickle.dump(train, pkl)
    with open(f"./np_data/{simple_name + '_val.pkl'}", 'wb') as pkl:
        pickle.dump(val, pkl)

In [6]:
# for model_name in model_list:
#     get_tensor(model_name, train_titles, val_titles)

## 数据读取

In [7]:
train_pkl, val_pkl = [], []
for model_name in model_list:
    simple_name = model_name.split('/')[-1]
    path = os.path.join('./np_data', simple_name)
    with open(path + '_train.pkl', 'rb') as f:    
        train_pkl.append(pickle.load(f))
    with open(path + '_val.pkl', 'rb') as f:
        val_pkl.append(pickle.load(f))

In [8]:
num_train_batch, num_val_batch, num_models = len(train_pkl[0]), len(val_pkl[0]), len(model_list)
for i in range(num_train_batch):
    for j in range(num_models - 1):
        assert train_pkl[j][i][1] == train_pkl[j + 1][i][1]

for i in range(num_val_batch):
    for j in range(num_models - 1):
        assert train_pkl[j][i][1] == train_pkl[j + 1][i][1]

In [9]:
print(len(train_pkl[0]))
print(len(val_pkl[0]))

662
74


In [10]:
num_models, num_train_batch, num_val_batch = len(model_list), len(train_pkl[0]), len(val_pkl[0])

train_tensors, val_tensors = [], []

for i in range(num_train_batch):
    concat_train_tensors = [paddle.to_tensor(train_pkl[j][i][0], dtype='float32') for j in range(num_models)]
    train_tensors.append((paddle.concat(concat_train_tensors, axis=1), paddle.to_tensor(train_pkl[0][i][1], dtype='int64')) )

for i in range(num_val_batch):
    concat_val_tensors = [paddle.to_tensor(val_pkl[j][i][0], dtype='float32') for j in range(num_models)]
    val_tensors.append((paddle.concat(concat_val_tensors, axis=1), paddle.to_tensor(val_pkl[0][i][1], dtype='int64')))

W0430 12:54:35.151608   693 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0430 12:54:35.156440   693 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


## 检验训练张量和验证张量

In [11]:
for enu in range(5):
    print(f'案例{enu + 1}')
    print('标签:', paddle.flatten(train_tensors[2*enu][1]).numpy().tolist()[:20])
    print('模型预测:')
    for i in range(num_models):
        print(f'{model_list[i]}',  paddle.argmax(train_tensors[2*enu][0][:, 14*i:14*(i + 1)], axis=1).numpy().tolist()[:20])
    print()


案例1
标签: [3, 7, 10, 6, 0, 7, 0, 13, 6, 2, 11, 6, 3, 7, 6, 6, 6, 4, 3, 10]
模型预测:
hfl/rbt6 [6, 7, 10, 6, 0, 7, 0, 13, 6, 2, 11, 6, 3, 7, 12, 6, 6, 4, 3, 10]
ernie-3.0-mini-zh [9, 7, 10, 6, 0, 7, 0, 13, 6, 2, 11, 6, 3, 7, 12, 6, 6, 4, 3, 10]
tinybert-4l-312d-zh [9, 7, 10, 6, 0, 7, 0, 13, 6, 2, 8, 6, 3, 7, 6, 6, 6, 4, 3, 10]
hfl/rbt3 [6, 7, 10, 6, 0, 7, 0, 13, 6, 2, 11, 6, 3, 7, 6, 6, 6, 4, 3, 10]

案例2
标签: [9, 8, 4, 7, 13, 10, 10, 6, 0, 1, 9, 4, 3, 3, 9, 3, 3, 10, 5, 9]
模型预测:
hfl/rbt6 [9, 8, 4, 7, 13, 10, 10, 6, 0, 1, 9, 4, 3, 3, 9, 3, 3, 10, 5, 9]
ernie-3.0-mini-zh [9, 8, 4, 7, 13, 10, 10, 6, 0, 1, 9, 4, 3, 3, 9, 3, 3, 10, 5, 9]
tinybert-4l-312d-zh [9, 8, 4, 7, 13, 10, 10, 6, 0, 10, 9, 4, 3, 3, 9, 3, 3, 10, 5, 9]
hfl/rbt3 [9, 8, 4, 7, 13, 10, 10, 6, 0, 1, 9, 4, 3, 3, 9, 3, 3, 10, 5, 9]

案例3
标签: [6, 7, 3, 6, 7, 10, 3, 3, 13, 6, 10, 10, 7, 13, 13, 12, 6, 10, 2, 5]
模型预测:
hfl/rbt6 [6, 7, 9, 6, 7, 10, 3, 3, 13, 6, 10, 10, 7, 13, 13, 12, 6, 10, 2, 5]
ernie-3.0-mini-zh [6, 7, 

In [12]:
for enu in range(5):
    print(f'案例{enu + 1}')
    print('标签:', paddle.flatten(val_tensors[2*enu][1]).numpy().tolist()[:20])
    print('模型预测:')
    for i in range(num_models):
        print(f'\t{model_list[i]}:', paddle.argmax(val_tensors[2*enu][0][:, 14*i:14*(i + 1)], axis=1).numpy().tolist()[:20])
    print()


案例1
标签: [6, 10, 13, 9, 7, 6, 10, 10, 2, 10, 13, 13, 13, 10, 6, 10, 3, 5, 11, 6]
模型预测:
	hfl/rbt6: [6, 10, 13, 9, 7, 6, 10, 10, 2, 1, 13, 13, 13, 10, 6, 10, 3, 5, 11, 6]
	ernie-3.0-mini-zh: [6, 10, 13, 9, 7, 6, 10, 10, 2, 10, 13, 13, 13, 10, 6, 10, 3, 5, 11, 6]
	tinybert-4l-312d-zh: [6, 10, 13, 9, 7, 6, 10, 10, 2, 10, 13, 13, 13, 10, 6, 10, 3, 5, 8, 6]
	hfl/rbt3: [6, 10, 13, 9, 7, 6, 10, 3, 2, 10, 13, 13, 13, 10, 6, 10, 3, 5, 11, 6]

案例2
标签: [7, 3, 10, 3, 6, 3, 13, 12, 10, 3, 6, 13, 10, 4, 6, 3, 3, 3, 10, 4]
模型预测:
	hfl/rbt6: [9, 3, 10, 3, 6, 3, 13, 12, 10, 3, 6, 13, 10, 8, 6, 3, 3, 3, 10, 4]
	ernie-3.0-mini-zh: [9, 3, 10, 3, 6, 3, 13, 12, 10, 3, 6, 13, 10, 4, 6, 3, 3, 3, 10, 4]
	tinybert-4l-312d-zh: [9, 3, 10, 3, 6, 3, 13, 12, 10, 3, 6, 13, 10, 8, 6, 3, 3, 3, 10, 4]
	hfl/rbt3: [9, 3, 10, 3, 6, 3, 13, 12, 10, 3, 6, 13, 10, 4, 6, 3, 3, 3, 10, 4]

案例3
标签: [6, 13, 6, 13, 3, 6, 7, 12, 9, 6, 7, 6, 13, 0, 10, 7, 13, 6, 6, 10]
模型预测:
	hfl/rbt6: [6, 13, 6, 4, 3, 6, 7, 12, 9, 6, 

## 模型定义

In [13]:
class LinerMergedModel(nn.Layer):
    def __init__(self, num_models, num_classes=14):
        super(LinerMergedModel, self).__init__()
        self.linear_1 = nn.Linear(num_models*num_classes, num_classes, bias_attr=False)
        
    def forward(self, x):
        return self.linear_1(x)

class  ParameterMergedModel(nn.Layer):
    def __init__(self, num_models, num_classes=14):
        super(ParameterMergedModel, self).__init__()
        self.weights = paddle.create_parameter(shape=[num_models],
                                              dtype='float64',
                                              default_initializer=nn.initializer.Constant(1/num_models)
                                                )
        self.weights.stop_gradient = False
        self.num_models = num_models
        self.num_classes = num_classes

    def forward(self, x):
        probs = F.softmax(self.weights)
        res = probs[0]*x[:, :self.num_classes]
        for model_ind in range(1, self.num_models):
            res += probs[model_ind]*x[:, self.num_classes*model_ind: self.num_classes*(model_ind + 1)]

        return res


In [14]:
def evaluate(model, criterion, metric, data_loader):
    """
    Given a dataset, it evals model and computes the metric.

    Args:
        model(obj:`paddle.nn.Layer`): A model to classify texts.
        data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
        criterion(obj:`paddle.nn.Layer`): It can compute the loss.
        metric(obj:`paddle.metric.Metric`): The evaluation metric.
    """
    model.eval()
    metric.reset()
    losses = []
    for input_x, input_label  in tqdm(data_loader):
        logits = model(input_x)
        loss = criterion(logits, input_label)
        losses.append(loss.numpy())
        correct = metric.compute(logits, input_label)
        metric.update(correct)
    accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.7f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return accu

In [15]:
def main_train(merge_type:str):
    if merge_type == 'ParameterMergedModel':
        model = paddle.jit.to_static(ParameterMergedModel(4))
    else:
        model = paddle.jit.to_static(LinerMergedModel(4))

    # 定义优化器、损失函数和Acc计算器
    optimizer = paddle.optimizer.Adam(learning_rate=5e-3,
                                parameters=model.parameters(),
                                )
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    # 调整至训练模式
    model.train() 
    best_acc = 0.00
    print(f'The training process of {merge_type} model is begining!')
    for epoch in range(5):
        print(f"epoch: {epoch + 1}, {time.ctime()}")
        start_t = time.time()
        metric.reset()
        for ind, item in enumerate(train_tensors):
            if ind and (not ind%300):
                accu = evaluate(model, criterion, metric, val_tensors)
                if accu > best_acc:
                    best_acc = accu
                    print('\t Best Acc: {:.6f}'.format(accu))
            input_x, input_label = item
            logits = model(input_x)
            loss = criterion(logits, input_label)
            probs = F.softmax(logits, axis=1)

            correct = metric.compute(probs, input_label)
            batch_acc = metric.update(correct)
            acc = metric.accumulate()
            
            loss.backward()
            ave_t = (time.time() - start_t)/(ind + 1)
            extra_h = ave_t*(len(train_tensors) - ind + 1)/3600
            if ind and (not ind%200):
                print(f'\t step:{ind}/{len(train_tensors)},', 'average time: {:.4f},'.format(ave_t), 'loss: {:.6f}'.format(loss.numpy()[0]), 'Batch Acc:{:.9f}, Acc:{:.9f}'.format(batch_acc, acc))

            optimizer.step()
            optimizer.clear_grad()
    print('Last Eval:')
    evaluate(model, criterion, metric, val_tensors)
    print(f'The training of {merge_type} has Finished!\n')
    return model



In [16]:
liner_merge_model = main_train('liner')

The training process of liner model is begining!
epoch: 1, Sun Apr 30 12:54:38 2023
	 step:200/662, average time: 0.0065, loss: 0.168074 Batch Acc:0.956054688, Acc:0.886495258


100%|██████████| 74/74 [00:00<00:00, 835.15it/s]


eval loss: 0.18679, accu: 0.9424702
	 Best Acc: 0.942470
	 step:400/662, average time: 0.0042, loss: 0.175861 Batch Acc:0.945312500, Acc:0.949663521


100%|██████████| 74/74 [00:00<00:00, 983.55it/s]


eval loss: 0.16993, accu: 0.9470285
	 Best Acc: 0.947028
	 step:600/662, average time: 0.0034, loss: 0.157342 Batch Acc:0.951171875, Acc:0.951171875
epoch: 2, Sun Apr 30 12:54:40 2023
	 step:200/662, average time: 0.0014, loss: 0.131859 Batch Acc:0.960937500, Acc:0.954757463


100%|██████████| 74/74 [00:00<00:00, 973.44it/s]


eval loss: 0.16129, accu: 0.9497927
	 Best Acc: 0.949793
	 step:400/662, average time: 0.0016, loss: 0.150768 Batch Acc:0.952148438, Acc:0.956692837


100%|██████████| 74/74 [00:00<00:00, 976.74it/s]


eval loss: 0.16095, accu: 0.9494073
	 step:600/662, average time: 0.0017, loss: 0.150328 Batch Acc:0.957031250, Acc:0.957031250
epoch: 3, Sun Apr 30 12:54:41 2023
	 step:200/662, average time: 0.0014, loss: 0.128816 Batch Acc:0.961914062, Acc:0.956875777


100%|██████████| 74/74 [00:00<00:00, 981.35it/s]


eval loss: 0.15807, accu: 0.9505502
	 Best Acc: 0.950550
	 step:400/662, average time: 0.0016, loss: 0.145164 Batch Acc:0.953125000, Acc:0.958278543


100%|██████████| 74/74 [00:00<00:00, 954.05it/s]


eval loss: 0.15948, accu: 0.9497129
	 step:600/662, average time: 0.0017, loss: 0.148432 Batch Acc:0.957031250, Acc:0.957031250
epoch: 4, Sun Apr 30 12:54:43 2023
	 step:200/662, average time: 0.0014, loss: 0.129772 Batch Acc:0.958984375, Acc:0.957274176


100%|██████████| 74/74 [00:00<00:00, 961.33it/s]


eval loss: 0.15761, accu: 0.9508425
	 Best Acc: 0.950843
	 step:400/662, average time: 0.0016, loss: 0.143604 Batch Acc:0.952148438, Acc:0.958587949


100%|██████████| 74/74 [00:00<00:00, 959.72it/s]


eval loss: 0.15906, accu: 0.9499787
	 step:600/662, average time: 0.0017, loss: 0.147307 Batch Acc:0.956054688, Acc:0.956054688
epoch: 5, Sun Apr 30 12:54:44 2023
	 step:200/662, average time: 0.0014, loss: 0.130304 Batch Acc:0.959960938, Acc:0.957711443


100%|██████████| 74/74 [00:00<00:00, 951.13it/s]


eval loss: 0.15762, accu: 0.9509223
	 Best Acc: 0.950922
	 step:400/662, average time: 0.0016, loss: 0.143160 Batch Acc:0.954101562, Acc:0.958752321


100%|██████████| 74/74 [00:00<00:00, 949.52it/s]


eval loss: 0.15887, accu: 0.9500585
	 step:600/662, average time: 0.0017, loss: 0.146709 Batch Acc:0.956054688, Acc:0.956054688
Last Eval:


100%|██████████| 74/74 [00:00<00:00, 949.63it/s]


eval loss: 0.15820, accu: 0.9504970
The training of liner has Finished!



In [17]:
parameter_merge_model = main_train('ParameterMergedModel')

The training process of ParameterMergedModel model is begining!
epoch: 1, Sun Apr 30 12:54:45 2023
	 step:200/662, average time: 0.0034, loss: 0.134554 Batch Acc:0.959960938, Acc:0.952984103


100%|██████████| 74/74 [00:00<00:00, 387.62it/s]


eval loss: 0.16161, accu: 0.9494073
	 Best Acc: 0.949407
	 step:400/662, average time: 0.0035, loss: 0.149114 Batch Acc:0.955078125, Acc:0.956257735


100%|██████████| 74/74 [00:00<00:00, 565.76it/s]


eval loss: 0.16128, accu: 0.9492080
	 step:600/662, average time: 0.0034, loss: 0.148273 Batch Acc:0.958007812, Acc:0.958007812
epoch: 2, Sun Apr 30 12:54:47 2023
	 step:200/662, average time: 0.0025, loss: 0.131443 Batch Acc:0.962890625, Acc:0.956078980


100%|██████████| 74/74 [00:00<00:00, 557.34it/s]


eval loss: 0.16119, accu: 0.9490219
	 step:400/662, average time: 0.0029, loss: 0.147784 Batch Acc:0.956054688, Acc:0.956596148


100%|██████████| 74/74 [00:00<00:00, 557.28it/s]


eval loss: 0.16116, accu: 0.9489820
	 step:600/662, average time: 0.0030, loss: 0.148274 Batch Acc:0.959960938, Acc:0.959960938
epoch: 3, Sun Apr 30 12:54:49 2023
	 step:200/662, average time: 0.0025, loss: 0.130893 Batch Acc:0.959960938, Acc:0.956112990


100%|██████████| 74/74 [00:00<00:00, 570.28it/s]


eval loss: 0.16109, accu: 0.9489820
	 step:400/662, average time: 0.0029, loss: 0.147576 Batch Acc:0.956054688, Acc:0.956741182


100%|██████████| 74/74 [00:00<00:00, 562.43it/s]


eval loss: 0.16109, accu: 0.9489820
	 step:600/662, average time: 0.0030, loss: 0.148204 Batch Acc:0.958984375, Acc:0.958984375
epoch: 4, Sun Apr 30 12:54:51 2023
	 step:200/662, average time: 0.0025, loss: 0.130693 Batch Acc:0.960937500, Acc:0.956108131


100%|██████████| 74/74 [00:00<00:00, 568.11it/s]


eval loss: 0.16102, accu: 0.9489820
	 step:400/662, average time: 0.0029, loss: 0.147443 Batch Acc:0.956054688, Acc:0.956779858


100%|██████████| 74/74 [00:00<00:00, 568.30it/s]


eval loss: 0.16103, accu: 0.9489687
	 step:600/662, average time: 0.0030, loss: 0.148059 Batch Acc:0.958984375, Acc:0.958984375
epoch: 5, Sun Apr 30 12:54:53 2023
	 step:200/662, average time: 0.0025, loss: 0.130602 Batch Acc:0.960937500, Acc:0.956181009


100%|██████████| 74/74 [00:00<00:00, 562.72it/s]


eval loss: 0.16096, accu: 0.9490086
	 step:400/662, average time: 0.0029, loss: 0.147347 Batch Acc:0.956054688, Acc:0.956818533


100%|██████████| 74/74 [00:00<00:00, 574.02it/s]


eval loss: 0.16098, accu: 0.9489820
	 step:600/662, average time: 0.0030, loss: 0.147930 Batch Acc:0.958984375, Acc:0.958984375
Last Eval:


100%|██████████| 74/74 [00:00<00:00, 555.54it/s]


eval loss: 0.16098, accu: 0.9489820
The training of ParameterMergedModel has Finished!



In [21]:
F.softmax(parameter_merge_model.weights)

Tensor(shape=[4], dtype=float64, place=Place(gpu:0), stop_gradient=False,
       [0.78959070, 0.05195850, 0.00508714, 0.15336366])

In [22]:
liner_merge_model.linear_1.weight

Parameter containing:
Tensor(shape=[56, 14], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [[ 0.59488767,  0.03648676, -0.06393395, -0.14288300, -0.33817297,
         -0.27167651, -0.18004082, -0.16317739, -0.11895706, -0.14798854,
         -0.35454124,  0.21112874, -0.11768337, -0.36149958],
        [ 0.11847109,  0.49146467, -0.04506579, -0.00288577, -0.37033501,
         -0.07243355, -0.49628955, -0.16152610, -0.12444752, -0.07666799,
          0.03943317,  0.08501796, -0.22640923, -0.53965122],
        [-0.03175763, -0.02949208,  0.64564723, -0.24506234, -0.36744472,
         -0.15144211, -0.22354050, -0.35534158, -0.48441312, -0.32396862,
         -0.44995800, -0.41254640, -0.29896140, -0.30043277],
        [-0.23840371,  0.06942926, -0.39213532,  0.47911954, -0.45395631,
         -0.40394390, -0.63441437, -0.52534240, -0.44868895, -0.43480897,
         -0.43540955, -0.26575932, -0.65916765, -0.49560958],
        [-0.00624943, -0.30467880, -0.04648293, -0.15803492