In [10]:
import torch
from datasets import load_dataset
# Define the dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over one split of 'seamew/ChnSentiCorp'.

    Each item is a ``(text, label)`` tuple taken from the ``'text'`` and
    ``'label'`` fields of the underlying row.
    """

    def __init__(self, split):
        """Load the requested split.

        Args:
            split: Split name forwarded unchanged to ``load_dataset``.
        """
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair stored at index ``i``."""
        # Fetch the row once; indexing the underlying dataset separately for
        # each field would look up the same row twice.
        row = self.dataset[i]
        return row['text'], row['label']
# Build the dataset from the 'train' split.
dataset = Dataset('train')
# Display the number of samples together with the first (text, label) pair.
len(dataset),dataset[0]

Using custom data configuration default
Reusing dataset chn_senti_corp (/home/qilb/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


(9600,
 ('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  1))

In [12]:
from transformers import BertTokenizer
# Load the vocabulary and tokenizer for the 'bert-base-chinese' checkpoint.
token = BertTokenizer.from_pretrained('bert-base-chinese')
# Bare expression so the notebook displays the tokenizer's repr.
token

PreTrainedTokenizer(name_or_path='./mybert-base-chinese', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [13]:
# Batch collation function
def collate_fn(data):
    """Convert a list of ``(text, label)`` samples into batched model inputs.

    Args:
        data: Sequence of ``(text, label)`` pairs; element 0 of each pair is
            the sentence and element 1 is its label.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids, labels)``. The
        first three entries are the tensors produced by the module-level
        ``token`` tokenizer (``return_tensors='pt'``), truncated and padded
        to a fixed length of 500. ``labels`` is a ``torch.LongTensor`` built
        from the sample labels.
    """
    sents = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]

    # Encode the whole batch in one call. The tokenizer's __call__ is the
    # documented replacement for the deprecated batch_encode_plus, and the
    # result is kept under its own name instead of rebinding `data`.
    # `return_length` is not requested because the length was never used.
    encoded = token(sents,
                    truncation=True,
                    padding='max_length',
                    max_length=500,
                    return_tensors='pt')

    return (encoded['input_ids'],
            encoded['attention_mask'],
            encoded['token_type_ids'],
            torch.LongTensor(labels))

# Define the data loader: shuffled batches of 16, collated by collate_fn,
# with the final incomplete batch dropped.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                    batch_size = 16,
                                    collate_fn = collate_fn,
                                    shuffle = True,
                                    drop_last = True)
# Take only the first batch from the loader and stop; its tensors stay bound
# to input_ids / attention_mask / token_type_ids / labels after the loop.
for i,(input_ids,attention_mask,token_type_ids,labels) in enumerate(loader):
    break
# Number of batches the loader yields.
print (len(loader))
# Display the shapes of the sample batch and its labels.
input_ids.shape,attention_mask.shape,token_type_ids.shape,labels

600


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]))

In [20]:
from transformers import BertModel
# Load the pretrained model
pretrained = BertModel.from_pretrained('bert-base-chinese')

# The encoder is not trained, so gradients are disabled on all of its parameters
for param  in pretrained.parameters():
    param.requires_grad_(False)
# Trial forward pass on the sample batch
out = pretrained(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
out.last_hidden_state.shape # torch.Size([16, 500, 768])
# Features at sequence position 0 for every sample in the batch
out.last_hidden_state[:,0].shape # torch.Size([16, 768])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 768])

In [7]:
# Define the downstream-task model
class Model(torch.nn.Module):
    """Two-class classification head on top of the module-level ``pretrained`` encoder.

    The encoder is referenced as a global rather than registered as a
    submodule, so ``Model.parameters()`` yields only the linear layer.
    """

    def __init__(self):
        super().__init__()
        # Single fully connected layer: 768 input features -> 2 classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores (logits) for each sample.

        Args:
            input_ids: Token id tensor forwarded to ``pretrained``.
            attention_mask: Attention mask tensor forwarded to ``pretrained``.
            token_type_ids: Segment id tensor forwarded to ``pretrained``.

        Returns:
            Tensor with one row per sample and two columns of raw logits.
            Apply ``.softmax(dim=1)`` to obtain probabilities.
        """
        encoded = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        # Classify from the features at sequence position 0.
        # No softmax here: torch.nn.CrossEntropyLoss applies log-softmax to
        # its input itself, so normalising first would apply softmax twice
        # and flatten the loss and gradients. argmax over logits gives the
        # same predicted class as argmax over probabilities.
        return self.fc(encoded.last_hidden_state[:, 0])
# Instantiate the classifier and run it once on the sample batch to check the output shape.
model = Model()
model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids).shape        

torch.Size([16, 2])

In [None]:
from transformers import AdamW
# Training
# Only model.parameters() are handed to the optimizer.
# NOTE(review): transformers' AdamW appears to be deprecated upstream in favour
# of torch.optim.AdamW — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(),lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
model.train()
for i, (input_ids,attention_mask,token_type_ids,labels) in enumerate(loader):
    out  = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
    loss = criterion(out,labels)
    # Backpropagate, apply the update, then clear gradients for the next step.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Every 5th step, print the step index, the loss and the batch accuracy.
    if i%5  == 0:
        # `out` is rebound to the predicted class indices from here on.
        out = out.argmax(dim = 1)
        accuracy = (out == labels).sum().item() / len(labels)
        print (i, loss.item(),accuracy)
    # Stop early once the batch with index 300 has been processed.
    if i == 300:
        break