# 1 数据预处理


In [1]:
import paddle
import paddlenlp as ppnlp
from paddlenlp.metrics import ChunkEvaluator
from functools import partial
from paddlenlp.data import Stack, Pad, Tuple

## 1.1 加载自定义数据集

In [2]:
from paddlenlp.datasets import load_dataset
import pandas as pd
import re

def read(data_path, is_test=False):
    """Yield examples from a GB18030-encoded CSV file for ``load_dataset``.

    Args:
        data_path: Path of the CSV file to read.
        is_test: When False (default), rows whose ``label`` column is one of
            25, 26, 27, 28, 29, 30 or 33 are discarded and every remaining
            row yields ``{'text': ..., 'label': ...}``. The text keeps only
            the last six '。'-separated segments of the second column; the
            label is the value of the third column. When True, a fixed
            window of the file is read and every row yields
            ``{'text': ...}`` holding the second column unchanged.

    Yields:
        dict: One example per CSV row.
    """
    if not is_test:
        data = pd.read_csv(data_path, encoding="gb18030")
        # Keep only the rows whose label is not in the excluded set.
        data = data.loc[~data.label.isin([25, 26, 27, 28, 29, 30, 33])]
        for _, row in data.iterrows():
            # Columns are addressed by position. Plain row[1] relies on the
            # deprecated integer fallback of Series.__getitem__, so use .iloc.
            sentence = '。'.join(row.iloc[1].split('。')[-6:])
            yield {'text': sentence, 'label': row.iloc[2]}
    else:
        # NOTE(review): an integer ``skiprows`` also skips the header line, so
        # the first remaining line is presumably consumed as the header.
        # Confirm this window is the intended one.
        data = pd.read_csv(data_path, skiprows=40000, nrows=10000, encoding="gb18030")
        for _, row in data.iterrows():
            yield {'text': row.iloc[1]}

# Build the training dataset from the custom reader (lazy=False).
train_ds = load_dataset(
    read,
    data_path="./work/BDCI/train.csv",
    is_test=False,
    lazy=False,
)
# test_ds = load_dataset(read, data_path="./work/BDCI/train.csv",is_test = True,lazy=False)

## 1.2 定义数据转换函数，实现文字编码

In [3]:
def convert_example(example, tokenizer, max_seq_length=256,is_test=False):
    """Encode one example into model inputs.

    Args:
        example: Mapping with a "text" entry and, unless ``is_test`` is set,
            a "label" entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_seq_len=...)``;
            its result must expose "input_ids" and "token_type_ids".
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.
        is_test: When True the label is not read and not returned.

    Returns:
        ``(input_ids, token_type_ids)`` when ``is_test`` is True, otherwise
        ``(input_ids, token_type_ids, [label])``.
    """
    encoding = tokenizer(example["text"], max_seq_len=max_seq_length)
    features = (encoding["input_ids"], encoding["token_type_ids"])
    if is_test:
        return features
    # The label is wrapped in a one-element list, as in the training tuple.
    return features + ([example["label"]],)

## 1.3 构造偏函数，将单条数据进行转换
此处需先定义tokenizer，再定义偏函数

In [10]:
# Batching / sequence-length settings and the pretrained ERNIE 1.0 tokenizer.
max_seq_length = 312
batch_size = 56
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained("ernie-1.0")

[2022-10-29 14:06:28,985] [    INFO] - Found /home/aistudio/.paddlenlp/models/ernie-1.0/vocab.txt


In [5]:
# Sanity check: encode a single training example and print the result.
print(convert_example(train_ds[100], tokenizer=tokenizer, max_seq_length=128, is_test=False))
# The matching check on test_ds stays disabled: the statement that builds
# test_ds is commented out in this file, so running it raises NameError.
# print(convert_example(test_ds[100],tokenizer=tokenizer,max_seq_length=128,is_test=True))

([1, 1073, 1169, 68, 2201, 4, 654, 4, 2670, 17, 5010, 136, 5010, 139, 39, 21, 4, 2746, 495, 4, 644, 219, 244, 484, 904, 308, 8, 4, 590, 12, 68, 73, 4, 87, 11, 644, 219, 244, 3376, 181, 488, 1769, 231, 1342, 4, 22, 171, 612, 8, 68, 2201, 1169, 1860, 2785, 1073, 4, 1079, 239, 9, 195, 1510, 1342, 681, 17, 627, 27, 136, 4, 145, 239, 1485, 192, 8, 119, 1387, 4225, 183, 12043, 803, 1079, 49, 4, 171, 612, 8, 16, 231, 4, 160, 39, 28, 1005, 12043, 1751, 181, 99, 663, 1631, 121, 8, 119, 594, 788, 245, 160, 39, 730, 1005, 12043, 644, 219, 244, 1751, 181, 99, 12, 222, 8, 119, 72, 245, 37, 1541, 17, 5010, 136, 5010, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],

## 1.4 构造batchify_fn，在batch数据构造时进行padding

In [11]:
# Per-field collators for training samples: (input_ids, token_type_ids, label).
_train_fields = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    Stack(dtype="int64"),                              # label
)

# Per-field collators for test samples: (input_ids, token_type_ids).
_test_fields = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
)


def batchify_fn_train(samples, fn=_train_fields):
    """Collate training samples into a list of batched fields."""
    return list(fn(samples))


def batchify_fn_test(samples, fn=_test_fields):
    """Collate test samples into a list of batched fields."""
    return list(fn(samples))

## 1.5 构造dataloader

In [12]:
# Pre-bind the tokenizer and length limit so the dataset can apply the
# conversion to one example at a time.
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length, is_test=False)

train_ds = train_ds.map(trans_func)
# test_ds = test_ds.map(trans_func)



In [13]:
# Shuffled training loader. The batch size comes from the ``batch_size``
# setting rather than a duplicated literal, so changing the setting is
# actually reflected here.
train_loader = paddle.io.DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    collate_fn=batchify_fn_train,
    shuffle=True,
    return_list=True,
)

# test_loader = paddle.io.DataLoader(
#         dataset=test_ds,
#         batch_size=48,
#         shuffle=False,
#         collate_fn=batchify_fn_test,
#         return_list=True)

# 2 定义分类模型

In [14]:
# Pretrained ERNIE 1.0 with a sequence-classification head for 25 classes.
model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
    "ernie-1.0",
    num_classes=25,
)

[2022-10-29 14:06:49,106] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams


# 3 训练模型

In [15]:
class MyLSTM_Attention(paddle.nn.Layer):
    """Bidirectional LSTM followed by multi-head self-attention and a linear classifier.

    Args:
        input_size: Feature size of each time step fed to the LSTM.
        hidden_size: Hidden size of each LSTM direction. The attention and
            linear layers operate on ``2 * hidden_size`` features.
        num_classes: Number of output classes. The default of 25 mirrors the
            ``num_classes`` used for the ERNIE classifier in this file.
            NOTE(review): confirm it matches the label set.
    """

    def __init__(self, input_size=64, hidden_size=25, num_classes=25):
        super(MyLSTM_Attention, self).__init__()
        # Layers are referenced through ``paddle.nn`` because this file
        # imports ``paddle`` but never binds a bare ``nn`` name.
        self.lstm = paddle.nn.LSTM(
            input_size, hidden_size, num_layers=2, direction='bidirect', dropout=0.5)
        # embed_dim must be divisible by num_heads; 2 * hidden_size always is.
        self.attention = paddle.nn.MultiHeadAttention(
            embed_dim=hidden_size * 2, num_heads=2, dropout=0.2)
        self.linear = paddle.nn.Linear(
            in_features=hidden_size * 2, out_features=num_classes)
        self.dropout = paddle.nn.Dropout(0.5)

    def forward(self, inputs):
        """Return class scores for ``inputs``.

        Assumes ``inputs`` is [batch_size, time_steps, input_size] -- TODO
        confirm against the caller.
        """
        emb = self.dropout(inputs)
        # output: [batch_size, time_steps, num_directions * hidden_size];
        # the final hidden and cell states are not used.
        output, _ = self.lstm(emb)
        # Self-attention: query, key and value are all the LSTM output, and
        # the result has the same shape as its input.
        att = self.attention(output, output, output)
        # Average over the time axis -> [batch_size, num_directions * hidden_size].
        x = paddle.mean(att, axis=1)
        x = self.dropout(x)
        return self.linear(x)

In [16]:
# Training configuration: epoch count, loss, metric and optimizer.
epochs = 20

criterion = paddle.nn.loss.CrossEntropyLoss()
# criterion = paddle.nn.loss.BCELoss()
metric = paddle.metric.Accuracy()

optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-5,
    parameters=model.parameters(),
)


In [19]:
import paddle.nn.functional as F
import gc
import numpy as np

def train(model, train_loader, head=None):
    """Fine-tune ``model`` on ``train_loader``, then save it and the tokenizer.

    Uses the module-level ``epochs``, ``criterion``, ``optimizer`` and
    ``tokenizer``.

    Args:
        model: Network called as ``model(input_ids, segment_ids)``; it must
            also provide ``train()`` and ``save_pretrained(path)``.
        train_loader: Iterable yielding ``(input_ids, segment_ids, labels)``
            batches.
        head: Optional callable applied to the model output before the loss,
            e.g. an *instance* of ``MyLSTM_Attention``. The caller must make
            sure the head accepts the shape of the model output and that its
            parameters are registered with the optimizer. When None (the
            default) the model output is used as the logits directly.
    """
    # Enable training-mode layers even if the model was put in eval mode earlier.
    model.train()
    global_step = 0
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_loader, start=1):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            if head is not None:
                # Call an already-constructed head. Passing the logits to the
                # class constructor raises TypeError and never runs the layer.
                logits = head(logits)
            loss = criterion(logits, labels)
            acc = paddle.metric.accuracy(logits, labels)
            global_step += 1

            if global_step % 200 == 0:
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f"
                      % (global_step, epoch, step, float(loss), float(acc)))
            # Back-propagate and update the parameters.
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
    model.save_pretrained('/home/aistudio/checkpoint3')
    tokenizer.save_pretrained('/home/aistudio/checkpoint3')
    # ``tokenizer`` must not be ``del``-ed here: that makes the name local to
    # this function, so the save call above raises UnboundLocalError.
    gc.collect()

In [20]:
# Run fine-tuning with the classifier and the training loader built earlier.
train(model, train_loader)

SystemError: (Fatal) Operator elementwise_add raises an paddle::memory::allocation::BadAlloc exception.
The exception content is
:ResourceExhaustedError: 

Out of memory error on GPU 0. Cannot allocate 204.750244MB memory on GPU 0, 31.728271GB memory has been allocated and available memory is only 20.750000MB.

Please check whether there is any other process using GPU 0.
1. If yes, please stop them, or start PaddlePaddle on another GPU.
2. If no, please decrease the batch size of your model. 

 (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:79)
. (at /paddle/paddle/fluid/imperative/tracer.cc:221)


# 4 模型预测

## 4.1 加载模型

In [16]:
# Rebuild the 25-class ERNIE classifier, then replace its weights with the
# state stored in the checkpoint file.
model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
    "ernie-1.0",
    num_classes=25,
)
model_dict = paddle.load("/home/aistudio/checkpoint2/model_state.pdparams")
model.set_dict(model_dict)

[2022-10-27 11:39:48,924] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams


## 4.2 在测试集上预测

In [28]:
# label_map = dict(zip(label_idx.values(),label_idx.keys()))
# model.eval()
# predictions = []
# for input_ids, segment_ids in test_loader:
#     logits = model(input_ids, segment_ids)
#     probs = F.softmax(logits, axis=1)  
#     idx = paddle.argmax(probs, axis=1).numpy()
#     idx = idx.tolist()
#     labels = [label_map[i] for i in idx]
#     predictions.extend(labels)

In [32]:
# predictions[:10]

['房产', '时政', '科技', '股票', '股票', '房产', '时政', '体育', '科技', '家居']

In [17]:
def pre_read(data_path):
    """Yield ``{'text': ...}`` examples from a UTF-8 CSV file.

    The text is taken from the second column of each row, with every run of
    the characters 'x', '月' and '日' removed.

    Args:
        data_path: Path of the CSV file to read.

    Yields:
        dict: One ``{'text': cleaned_text}`` example per CSV row.
    """
    data = pd.read_csv(data_path, encoding="utf-8")
    for _, row in data.iterrows():
        # Columns are addressed by position. Plain row[1] relies on the
        # deprecated integer fallback of Series.__getitem__, so use .iloc.
        # NOTE(review): confirm the second column really is the text column.
        string = re.sub(r'[xx月日]+', '', row.iloc[1])
        yield {'text': string}

# Conversion for unlabeled data: only input ids and segment ids are produced.
pre_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    is_test=True)

# Load the prediction file and encode every example.
pre_ds = load_dataset(pre_read, data_path="./work/BDCI/testA.csv", lazy=False).map(pre_func)
# shuffle=False keeps the predictions in file order.
pre_loader = paddle.io.DataLoader(
    dataset=pre_ds,
    batch_size=48,
    shuffle=False,
    collate_fn=batchify_fn_test,
    return_list=True)

model.eval()
predictions = []
# Inference needs no gradients; disabling autograd avoids holding the graph
# and its activations in GPU memory for every batch.
with paddle.no_grad():
    for input_ids, segment_ids in pre_loader:
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        # The predicted class indices are stored as plain Python ints.
        predictions.extend(idx.tolist())

In [23]:
# Read the ids from the same file the predictions were generated from, so the
# "id" and "label" columns line up row by row. Without this statement ``data``
# is undefined when the file is run top to bottom.
# NOTE(review): assumes testA.csv has an "id" column -- confirm.
data = pd.read_csv("./work/BDCI/testA.csv", usecols=["id"], encoding="utf-8")
df = pd.DataFrame(data={"id": data["id"], "label": predictions})
df.to_csv("submission.csv", index=False)

In [22]:
data

Unnamed: 0,id
0,id_50000
1,id_50001
2,id_50002
3,id_50003
4,id_50004
...,...
24996,id_74996
24997,id_74997
24998,id_74998
24999,id_74999
