In [2]:
import torch
from datasets import load_dataset,load_from_disk
from d2l import torch as d2l
import re
import jieba
import time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

导入本地 csv 文件,得到一个只含 train Dataset 的数据集

In [3]:
wb_data = load_dataset("csv",data_files='../data/weibo_senti_100k.csv')

Found cached dataset csv (C:/Users/sjj/.cache/huggingface/datasets/csv/default-a6fb0b0e0bdd94cc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
wb_data['train']

Dataset({
    features: ['label', 'review'],
    num_rows: 119988
})

### 0. 清洗数据
1. DataDict 对象调用 map() 时,不区分 train/test/validation 的 DataSet
2. 试图开批量处理（设 batched=True）时调用 `re` 模块会报错，因为此时不允许回调函数 f 接收并返回`Dict[str, Any]`,而是`Dict[str, List]`,而 `re` 模块只能处理 `str` 而非 `list`。而在`Python`代码中直接写 for Loop 处理 `list` 效率不高
>见（https://huggingface.co/docs/datasets/v2.13.1/en/package_reference/main_classes#datasets.DatasetDict.map.function）

In [4]:
def f(d):
    review = d['review']

    # 清洗数据
    # 删除‘//@用户名:’
    pattern_1 = re.compile(r'//@.*?:')
    s1 = re.sub(pattern_1, '', review)

    # 删除‘@用户名 ’
    pattern_2 = re.compile('@.*?\s')
    s2 = re.sub(pattern_2,'', s1)

    # 删除‘@用户名:’
    pattern_3 = re.compile('@.*?:')
    s3 = re.sub(pattern_3,'', s2)

    # 删除‘@用户名,’
    pattern_3 = re.compile('@.*?,')
    s4 = re.sub(pattern_3,'', s3)

    # 删除‘@用户名。’
    pattern_3 = re.compile('@.*?。')
    s5 = re.sub(pattern_3,'', s4)


    d['review'] = s5
    return d

In [5]:
wb_data_map = wb_data.map(function=f)

wb_data_map,wb_data_map['train']['review'][:20]

Loading cached processed dataset at C:\Users\sjj\.cache\huggingface\datasets\csv\default-a6fb0b0e0bdd94cc\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-7ead21f1d71bab64.arrow


(DatasetDict({
     train: Dataset({
         features: ['label', 'review'],
         num_rows: 119988
     })
 }),
 ['\ufeff更博了，爆照了，帅的呀，就是越来越爱你！生快傻缺[爱你][爱你][爱你]',
  '土耳其的事要认真对待[哈哈]，否则直接开除。很是细心，酒店都全部OK啦。',
  '姑娘都羡慕你呢…还有招财猫高兴……[哈哈]小学徒一枚，等着明天见您呢大佬范儿[书呆子]',
  '美~~~~~[爱你]',
  '梦想有多大，舞台就有多大![鼓掌]',
  '[花心][鼓掌] [春暖花开]',
  '某问答社区上收到一大学生发给我的私信：“偶喜欢阿姨！偶是阿姨控！”我回他：“阿姨稀饭小盆友！偶是小盆友控！” [哈哈]',
  '吃货们无不啧啧称奇，好不喜欢！PS:写错一个字！[哈哈]@陈小kitty猫@游子的歌@solo在厦门',
  '#Sweet Morning#From now on,love yourself,enjoy living then smile.从现在开始，爱自己，享受生活并且微笑。[呵呵] [嘻嘻] [哈哈] [挤眼] [太开心] 早安、甜心们',
  '【霍思燕剖腹产下“小江江” 老公落泪】今晨9时霍思燕产下一名男婴，宝宝重8斤3两，母子平安。杜江的脸上洋溢着做爸爸的欣喜：宝宝小名叫“小江江”，眼睛像他，鼻子和嘴巴则像霍思燕，看到宝贝就忍不住落泪！恭喜，祝福“小江江”在爱里健康地成长[爱你]...http://t.cn/z8EwSPU',
  '[鼓掌] 一流的经纪公司是超模的摇篮！[鼓掌] 东方宾利强大的名模军团！',
  '真好//[害羞]',
  '第一次见到有花瓣的面膜，一片抵普通面膜好几片 [哈哈]！补水神器啊，一帖见效！ 睡前一片，15分钟超神奇膜法，第二天起来你会发现你脸又白又嫩还有光泽，持续几天皮肤好像剥了壳的鸡蛋一样白白嫩嫩的[太开心]！ 明星推荐，美妆老师私藏的神奇"膜"法！8片礼盒装抢购地址>>>（去评论中找链接哦）',
  '好感动[亲亲]大家都陆陆续续收到超极本尼泊尔的奖品了，没想到你还带着去看瓷房子~祝蜜月快乐哦',
  '[雪]大象感觉好冷喔。。。  大象放冰箱分三步，绑定手机也分三步

原数据集前半是积极label,后半是消极label,需要打乱

In [6]:
mapped_data = wb_data_map.shuffle(seed=42)

Loading cached shuffled indices for dataset at C:\Users\sjj\.cache\huggingface\datasets\csv\default-a6fb0b0e0bdd94cc\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-1d339c3751a312bc.arrow


在 train Dataset 内部再分割出 train set / test set

In [7]:
data = mapped_data['train'].train_test_split(test_size=0.25)
data

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 89991
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 29997
    })
})

### 1.加载数据集

In [8]:
# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self,split):
        # 导入本地 csv 文件,得到一个只含 train Dataset 的数据集
        wb_data = load_dataset("csv",data_files='../data/weibo_senti_100k.csv')
        #
        total_data =  wb_data['train'].train_test_split(test_size=0.25)
        # 选取训练集或测试集，并打乱顺序
        self.dataset = data[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        review = self.dataset[i]['review']
        label = self.dataset[i]['label']

        return label,review


train_dataset = Dataset('train')
test_dataset = Dataset('test')


len(train_dataset), train_dataset[0],len(test_dataset), test_dataset[0]

Found cached dataset csv (C:/Users/sjj/.cache/huggingface/datasets/csv/default-a6fb0b0e0bdd94cc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (C:/Users/sjj/.cache/huggingface/datasets/csv/default-a6fb0b0e0bdd94cc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

(89991,
 (1,
  '卖萌才是终极归宿~~~ 桌面都是各种你，这个封面用了很久 回复[哈哈]恭喜哟~~请吃猫粮哟~~~ 请热烈、紧张、肉紧滴围观！感谢国家，感谢CCAV，感谢围脖儿，感谢俺滴所有粉丝，感谢所有同事，俺，上封面鸟！'),
 29997,
 (1, '[嘻嘻][嘻嘻]迷倒了'))

### 2.加载分词器
通常，一种模型对应一种特殊的 Tokenizer

In [10]:
from transformers import BertTokenizer

#加载字典和分词工具
token  = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

token

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

### 3.更新词典
词典要添加所有语句的 jieba 分词结果数组

#### （1）打印原有的 vocab，只有单字

In [14]:
dic = token.get_vocab()
type(dic), len(dic), '撒' in dic

(dict, 21128, True)

#### （2）函数，传入一个数据集及指定的列，返回对应去重的词典

In [15]:
def get_vocab(dataset):
    # 检查数据类型
    print('dataset.shape:\n',dataset.shape)
    reviews_list = dataset['train']['review']
    print(type(reviews_list))

    # 分词
    total_vocab=[]
    for i in range(len(reviews_list)):
        # total_vocab 保存每条句子的分词结果
        total_vocab += list(jieba.cut(reviews_list[i],cut_all=False))

    #total_vocab 去重
    return set(total_vocab)

In [22]:
start_time = time.time()  # 记录开始时间
vocab = get_vocab(mapped_data) # 传入划分数据集/测试集前的总集
end_time = time.time()  # 记录结束时间

run_time = end_time - start_time  # 计算运行时间
print('get_vocab：', run_time, '秒')

# 减去 预处理 Bert-Chinese 的tokenizer 的词表中已有的
vocab = vocab - set(token.vocab.keys())
print("set 相减操作 over")

# 去掉长度为 1 的所有字符（要么预处理Bert-Chinese 已经有了，要么就是一些emoji ）
vocab = list(vocab)
vocab =[token for token in vocab if len(token)>1]
print("去掉长度=1的单字操作 over")

dataset.shape:
 {'train': (119988, 2)}
<class 'list'>
get_vocab： 62.011430978775024 秒
set 相减操作 over
去掉长度=1的单字操作 over


将构造的新词典导入分词器
这篇文章可读：（https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/）

In [24]:
# 添加新词
token.add_tokens(new_tokens=vocab)
# 添加新符号
token.add_special_tokens({'eos_token':'[EOS]'})

d = token.get_vocab()

token.save_vocabulary('../data/weibo100k')

type(d), len(d), '出发' in d

(dict, 163958, True)

### 3.定义 DataLoader 与 batch 处理函数

> DataLoader 实例对象并不需要移动到 GPU 上，实际上需要移动的是 loader 产生的数据张量
> 1/3 定义了 collate_fn ,`.to(device)`应当写在 collate_fn 内

In [25]:
def collate_fn(data):
    labels = [i[0] for i in data]
    reviews = [i[1] for i in data]

    #编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=reviews,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=200,
                                   return_tensors='pt',
                                   return_length=True)

    #input_ids:编码之后的数字
    #attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())

    return input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)


#数据加载器
loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

5624


(torch.Size([16, 200]),
 torch.Size([16, 200]),
 torch.Size([16, 200]),
 tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0'))

### 加载 BERT 模型

2/3 嵌入的子模型创建了实例，也要搬到GPU

In [27]:
from transformers import BertModel

#加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

# add new, random embeddings for the new tokens（因为词表更新了，模型的embedding 也要更新）
# 如果不加这一行，会出现 CUDA 报错
pretrained.resize_token_embeddings(len(token))

# 并转移到 GPU ,否则报错
pretrained.to(device)
#不训练,不需要计算梯度（requires_grad_(False) 用来冻结参数）
for param in pretrained.parameters():
    param.requires_grad_(False)



# #模型试算
# output = pretrained(input_ids=input_ids,
#            attention_mask=attention_mask,
#            token_type_ids=token_type_ids)
#
# output.last_hidden_state.shape

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
torch.cuda.empty_cache()

### 定义下游任务模型
3/3 总体的 model 搬到GPU

In [29]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # 仅新增一个全连接层
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            bert = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)
        # 仅对 <cls> 一个词元做全连接层输出
        out = self.fc(bert.last_hidden_state[:, 0])

        out = out.softmax(dim=1)

        return out


# 模型搬运到 GPU 上
model = Model()
model.to(device)
for p in model.parameters():
    print(p,p.device)

# # 数据搬运到 GPU 上
# model(input_ids=input_ids.to(device),
#       attention_mask=attention_mask.to(device),
#       token_type_ids=token_type_ids.to(device)).shape

Parameter containing:
tensor([[-0.0204,  0.0273, -0.0168,  ..., -0.0343, -0.0299,  0.0052],
        [ 0.0112, -0.0046,  0.0196,  ...,  0.0290,  0.0152,  0.0069]],
       device='cuda:0', requires_grad=True) cuda:0
Parameter containing:
tensor([ 0.0271, -0.0096], device='cuda:0', requires_grad=True) cuda:0


### 训练

In [30]:
from transformers import AdamW
# from torch.optim import AdamW
#训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss() # 已经自带 softmax

model.train()

# 检测是否正确移动到 GPU
# for name, param in model.named_parameters():
#     print(name, param.shape, param.device)


for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    input_ids_gpu = input_ids.to(device)
    attention_mask_gpu = attention_mask.to(device)
    token_type_ids_gpu = token_type_ids.to(device)
    labels_gpu = labels.to(device)


    out = model(input_ids=input_ids_gpu,
                attention_mask=attention_mask_gpu,
                token_type_ids=token_type_ids_gpu)

    # print(out.device)

    loss = criterion(out, labels_gpu)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels_gpu).sum().item() / len(labels_gpu)   # 用 item() 取值的精度更高

        print(i, loss.item(), accuracy)

    if i == 300:
        break



0 0.6870027780532837 0.5
5 0.6656901240348816 0.625
10 0.7013100981712341 0.4375
15 0.6791286468505859 0.75
20 0.6740584969520569 0.625
25 0.6870272159576416 0.625
30 0.6768355369567871 0.625
35 0.6688064336776733 0.5625
40 0.6764165759086609 0.5625
45 0.63028484582901 0.6875
50 0.6485089063644409 0.625
55 0.65523761510849 0.6875
60 0.6024040579795837 0.875
65 0.6281912922859192 0.6875
70 0.6185539960861206 0.75
75 0.6220785975456238 0.6875
80 0.5600926876068115 0.8125
85 0.6008599996566772 0.6875
90 0.5925366878509521 0.8125
95 0.6616859436035156 0.625
100 0.6690319180488586 0.5625
105 0.5845451354980469 0.875
110 0.6158508062362671 0.8125
115 0.5583750605583191 0.875
120 0.6233053803443909 0.75
125 0.5831074714660645 0.875
130 0.6018896698951721 0.75
135 0.607182502746582 0.6875
140 0.5506905913352966 0.875
145 0.5384426116943359 0.8125
150 0.6007279753684998 0.75
155 0.5899836421012878 0.6875
160 0.5548563003540039 0.75
165 0.638267457485199 0.6875
170 0.6301925778388977 0.625
175 0

### 测试

In [31]:
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,labels) \
            in enumerate(loader_test):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

Found cached dataset csv (C:/Users/sjj/.cache/huggingface/datasets/csv/default-a6fb0b0e0bdd94cc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

0
1
2
3
4
0.88125
