此处是一个二分类的任务，可拓展为 n 分类

### 1.加载数据集

In [33]:
import torch
from datasets import load_dataset,load_from_disk
from d2l import torch as d2l
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over one split of the corpus saved at ../data/ChnSentiCorp.

    Each item is a ``(text, label)`` pair taken from the ``'text'`` and
    ``'label'`` fields of the underlying record.
    """

    def __init__(self, split):
        # `split` is the key used to pick one split out of the saved dataset
        # (this file uses 'train' and 'validation').
        corpus = load_from_disk("../data/ChnSentiCorp")
        self.dataset = corpus[split]

    def __len__(self):
        """Return the number of records in the selected split."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair stored at index ``i``."""
        record = self.dataset[i]
        return record['text'], record['label']


# Build the training split.
dataset = Dataset('train')


# Notebook display: the split size and its first (text, label) sample.
len(dataset), dataset[0]

(9600,
 ('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  1))

### 2.加载分词器
通常，每种预训练模型都有与之配套的专用 Tokenizer。

In [34]:
from transformers import BertTokenizer

# Load the vocabulary and tokenizer published with the 'bert-base-chinese'
# checkpoint. NOTE: despite its name, `token` holds the tokenizer object.
token = BertTokenizer.from_pretrained(
    'bert-base-chinese', cache_dir=None, force_download=False)

# Notebook display of the tokenizer's repr.
token

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8766caed-dc32-4035-973a-87ced6d4976b)')' thrown while requesting HEAD https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt


BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

### 3.定义 DataLoader 与 batch 处理函数

> DataLoader 实例对象本身并不需要移动到 GPU 上，实际需要移动的是 loader 产生的数据张量。
>
> 1/3 既然定义了 collate_fn，`.to(device)` 就应当写在 collate_fn 内。

In [66]:
def collate_fn(data):
    """Collate ``(text, label)`` samples into fixed-length tensors on ``device``.

    Args:
        data: a sequence of samples; element 0 of each sample is the text and
            element 1 is the integer label.

    Returns:
        A 4-tuple ``(input_ids, attention_mask, token_type_ids, labels)``,
        each already moved to ``device``.
    """
    texts = [sample[0] for sample in data]
    targets = torch.LongTensor([sample[1] for sample in data])

    # Encode the whole batch at once; every text is truncated or padded to
    # exactly 500 tokens.
    encoded = token.batch_encode_plus(batch_text_or_text_pairs=texts,
                                      truncation=True,
                                      padding='max_length',
                                      max_length=500,
                                      return_tensors='pt',
                                      return_length=True)

    # input_ids: the token ids produced by the encoding.
    # attention_mask: 0 at padded positions, 1 everywhere else.
    fields = ('input_ids', 'attention_mask', 'token_type_ids')
    batch = [encoded[name].to(device) for name in fields]
    batch.append(targets.to(device))
    return tuple(batch)


# Data loader: shuffled mini-batches of 16; a trailing incomplete batch is dropped.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

# Pull a single batch to sanity-check the tensor shapes. next(iter(...)) states
# the intent directly and does not leave a stray loop index behind.
input_ids, attention_mask, token_type_ids, labels = next(iter(loader))

print(len(loader))
# Notebook display: shapes of the three input tensors and the label tensor.
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

600


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1], device='cuda:0'))

### 加载 BERT 模型

2/3 嵌入的子模型创建了实例，也要搬到GPU

In [68]:
from transformers import BertModel

# Load the pretrained BERT encoder and move it to the target device; the
# batches are produced on `device`, so the weights must live there as well.
pretrained = BertModel.from_pretrained('bert-base-chinese').to(device)
# The encoder is not trained: freeze every parameter by turning off its
# gradient tracking.
for weight in pretrained.parameters():
    weight.requires_grad = False


# #模型试算
# output = pretrained(input_ids=input_ids,
#            attention_mask=attention_mask,
#            token_type_ids=token_type_ids)
#
# output.last_hidden_state.shape

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
# Ask PyTorch to release cached GPU memory that it is no longer using.
torch.cuda.empty_cache()

### 定义下游任务模型
3/3 总体的 model 搬到GPU

In [72]:
class Model(torch.nn.Module):
    """Two-class classifier: one linear layer on top of the frozen BERT encoder.

    The encoder is the module-level ``pretrained`` object; it is not
    registered as a sub-module, so only ``fc`` shows up in ``parameters()``.
    """

    def __init__(self):
        super().__init__()
        # The only new layer: maps a 768-dim hidden vector to 2 class scores.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw class scores (logits), one row of 2 values per sample.

        The scores are deliberately NOT passed through softmax:
        ``torch.nn.CrossEntropyLoss`` applies log-softmax itself, and feeding
        it probabilities would apply softmax twice and flatten the gradients.
        ``argmax`` over the logits gives the same prediction as over the
        softmax output; call ``.softmax(dim=1)`` on the result if
        probabilities are needed.
        """
        # The encoder is frozen, so no autograd graph is needed for it.
        with torch.no_grad():
            bert = pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
        # Classify from the hidden state at position 0 (the [CLS] token).
        cls_state = bert.last_hidden_state[:, 0]
        return self.fc(cls_state)


# Instantiate the classifier and move its parameters to the target device.
model = Model().to(device)
# Print every parameter together with its device to confirm the move worked.
for param in model.parameters():
    print(param, param.device)

# # 数据搬运到 GPU 上
# model(input_ids=input_ids.to(device),
#       attention_mask=attention_mask.to(device),
#       token_type_ids=token_type_ids.to(device)).shape

Parameter containing:
tensor([[-0.0026, -0.0097,  0.0304,  ..., -0.0118, -0.0121, -0.0264],
        [-0.0080, -0.0025, -0.0097,  ..., -0.0214,  0.0331,  0.0350]],
       device='cuda:0', requires_grad=True) cuda:0
Parameter containing:
tensor([0.0340, 0.0155], device='cuda:0', requires_grad=True) cuda:0


### 训练

In [73]:
# Training.
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. eps and weight_decay are passed explicitly to mirror the
# defaults of the optimizer being replaced (torch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=5e-4,
                              eps=1e-6,
                              weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()  # applies log-softmax internally

model.train()

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    # collate_fn already returns tensors on `device`, so no extra copies here.
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Report loss and batch accuracy every 5 steps.
    if i % 5 == 0:
        predictions = out.argmax(dim=1)
        # .item() turns the 0-dim count tensor into a plain Python number.
        accuracy = (predictions == labels).sum().item() / len(labels)

        print(i, loss.item(), accuracy)

    # Stop after step 300 rather than running the whole epoch.
    if i == 300:
        break

0 0.7425124049186707 0.3125


KeyboardInterrupt: 

### 测试

In [None]:
def test(max_batches=5):
    """Print the model's accuracy on a sample of the validation split.

    Args:
        max_batches: number of shuffled batches (32 samples each) to
            evaluate; ``None`` evaluates every full batch. Defaults to 5.

    Prints the index of each evaluated batch, then the overall accuracy.
    If no batch could be evaluated (for example the split holds fewer than
    32 samples, since incomplete batches are dropped), a notice is printed
    instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # Evaluation never needs gradients, so disable them for the whole loop.
    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids, labels) \
                in enumerate(loader_test):

            if max_batches is not None and i >= max_batches:
                break

            print(i)

            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

            predictions = out.argmax(dim=1)
            correct += (predictions == labels).sum().item()
            total += len(labels)

    if total == 0:
        print("no validation batches were evaluated")
        return

    print(correct / total)


test()