# 1.导入需要的工具包

In [13]:
import numpy as np
import pandas as pd
import random
import json, time 
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertConfig, AdamW, get_cosine_schedule_with_warmup
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator used in this notebook so that shuffling,
# sampling and weight initialisation are repeatable from run to run.
seed = 666
random.seed(seed)
np.random.seed(seed)
# manual_seed_all seeds every visible GPU rather than only the current one;
# it is silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fef99194f50>

# 2.载入预训练模型

In [2]:
# DistilBERT checkpoint name plus the matching model / tokenizer classes.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Build the tokenizer from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

# 3.数据预处理

**数据示例**    
may be more genial than ingenious , but it gets the job done—>1  
用制表符 '\t' 分割文本和标签 
 
**利用分词器进行编码**  
encode仅返回input_ids
```py
print(tokenizer.encode('我不喜欢你'))                    #[101, 2769, 679, 1599, 3614, 872, 102]
```
encode_plus返回所有编码信息（下面的示例输出来自中文BERT分词器；本文实际使用的DistilBERT分词器默认不返回token_type_ids）
- input_ids：是单词在词典中的编码
- token_type_ids：区分两个句子的编码（上句全为0，下句全为1）
- attention_mask：指定对哪些词进行self-Attention操作

```py
sen_code = tokenizer.encode_plus('我不喜欢这世界','我只喜欢你')
print(sen_code)
# {
#   'input_ids': [101, 2769, 679, 1599, 3614, 6821, 686, 4518, 102, 2769, 1372, 1599, 3614, 872, 102], 
#   'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 
#   'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# }
```


In [5]:
maxlen = 66      # fixed sequence length: shorter inputs are padded, longer ones truncated

# train.tsv has no header row; column 0 holds the sentence and column 1 the label.
df = pd.read_csv('./train.tsv', delimiter='\t', header=None)

# Encode all sentences with one batched tokenizer call instead of a per-row
# encode_plus loop. The result maps 'input_ids' / 'attention_mask' to one
# list per sentence, each padded or truncated to exactly `maxlen` entries.
encoded = tokenizer(df[0].tolist(),
                    max_length=maxlen,
                    padding='max_length',
                    truncation=True)

input_ids = np.array(encoded['input_ids'])          # token ids
input_masks = np.array(encoded['attention_mask'])   # attention mask
labels = np.array(df[1])                            # labels

# Sanity checks: one fixed-length row per sentence and one label per row.
assert input_ids.shape == input_masks.shape == (len(df), maxlen)
assert labels.shape == (len(df),)
print(input_ids.shape, input_masks.shape, labels.shape)

(6920, 66) (6920, 66) (6920,)


# 4. 切分训练集，验证集，测试集

In [6]:
# Shuffle the row indices once; every array below is split with the same permutation.
idxes = np.arange(input_ids.shape[0])
np.random.shuffle(idxes)
print(idxes.shape, idxes[:10])


# Roughly 8:1:1 train / validation / test, cut at fixed positions 5536 and 6226.
train_idx, valid_idx, test_idx = np.split(idxes, [5536, 6226])

input_ids_train, input_ids_valid, input_ids_test = (
    input_ids[part] for part in (train_idx, valid_idx, test_idx))
input_masks_train, input_masks_valid, input_masks_test = (
    input_masks[part] for part in (train_idx, valid_idx, test_idx))
y_train, y_valid, y_test = (
    labels[part] for part in (train_idx, valid_idx, test_idx))

print(input_ids_train.shape, y_train.shape, input_ids_valid.shape, y_valid.shape, 
      input_ids_test.shape, y_test.shape)

(6920,) [1166 3851 2611 4407 6763 1024 4097 5390 3909 3041]
(5536, 66) (5536,) (690, 66) (690,) (694, 66) (694,)


# 5. 加载到PyTorch的DataLoader

In [33]:
BATCH_SIZE = 64  # lower this if you run into out-of-memory errors


def _as_long_dataset(*arrays):
    """Convert each array to a LongTensor and bundle them into one TensorDataset."""
    return TensorDataset(*(torch.LongTensor(arr) for arr in arrays))


# Training set: (ids, mask, label) triples, drawn in random order.
train_data = _as_long_dataset(input_ids_train, input_masks_train, y_train)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=train_sampler)

# Validation set: same triples, iterated in their stored order.
valid_data = _as_long_dataset(input_ids_valid, input_masks_valid, y_valid)
valid_sampler = SequentialSampler(valid_data)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, sampler=valid_sampler)

# Test set: only (ids, mask) pairs are loaded; y_test is kept outside the loader.
test_data = _as_long_dataset(input_ids_test, input_masks_test)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, sampler=test_sampler)

# 6. 定义BERT模型

In [49]:
# 定义model
class Bert_Model(nn.Module):
    """Pretrained encoder followed by a single linear classification layer."""

    def __init__(self, bert_path, classes=2):
        """
        Args:
            bert_path: checkpoint name or path handed to ``from_pretrained``.
            classes: number of output classes (width of the logit vector).
        """
        super().__init__()
        self.config = DistilBertConfig.from_pretrained(bert_path)
        self.bert = model_class.from_pretrained(bert_path)   # pretrained encoder weights
        self.fc = nn.Linear(self.config.hidden_size, classes)

    def forward(self, input_ids, attention_mask=None):
        """Return unnormalised class scores of shape [batch, classes]."""
        hidden_states = self.bert(input_ids, attention_mask=attention_mask)[0]   # [batch, seq_len, hidden]
        # Use the hidden state at sequence position 0 as the sentence representation.
        first_token = hidden_states[:, 0]   # [batch, hidden]
        return self.fc(first_token)

# 7. 实例化BERT模型

In [50]:
def get_parameter_number(model):
    """Return a one-line summary of the model's total and trainable parameter counts."""
    total_num = 0
    trainable_num = 0
    for param in model.parameters():
        count = param.numel()
        total_num += count
        if param.requires_grad:
            trainable_num += count
    return 'Total parameters: {}, Trainable parameters: {}'.format(total_num, trainable_num)


# Device selection: prefer the GPU with index `gpu`; a negative index or a
# machine without CUDA selects the CPU.
gpu = 1
use_cuda = gpu >= 0 and torch.cuda.is_available()
if use_cuda:
    # The preferred index may not exist (e.g. on a single-GPU machine); fall
    # back to GPU 0 instead of letting set_device fail on an invalid ordinal.
    if gpu >= torch.cuda.device_count():
        gpu = 0
    torch.cuda.set_device(gpu)
    DEVICE = torch.device("cuda", gpu)
else:
    DEVICE = torch.device("cpu")
EPOCHS = 5
model = Bert_Model(pretrained_weights).to(DEVICE)
print(get_parameter_number(model))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total parameters: 66364418, Trainable parameters: 66364418


# 8. 定义优化器

In [51]:
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream.
# eps=1e-6 matches the default of the transformers implementation (torch's own
# default is 1e-8), keeping the update rule as close as possible to before.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4, eps=1e-6)

# The scheduler is stepped once per batch: linear warmup over the first epoch,
# then cosine decay over the remaining steps.
steps_per_epoch = len(train_loader)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=steps_per_epoch,
                                            num_training_steps=EPOCHS*steps_per_epoch)
# Author's tip: keep the warmup (learning rate ramps up from 0); in the author's
# experience training basically fails to converge without it.

# 9. 定义训练函数和测试验证函数

In [54]:
# Measure accuracy on a labelled data loader (used for the validation set).
def evaluate(model, data_loader, device):
    """Run the model over a labelled loader and return its accuracy.

    Args:
        model: callable as ``model(ids, att)`` returning per-class scores
            with the class dimension at index 1.
        data_loader: iterable yielding ``(ids, att, y)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        Whatever ``accuracy_score(true_labels, predicted_labels)`` returns.
    """
    model.eval()
    val_true, val_pred = [], []
    with torch.no_grad():
        for ids, att, y in data_loader:
            logits = model(ids.to(device), att.to(device))
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # reshape(-1) stays 1-D even for a final batch of size 1, whereas
            # squeeze() collapses it to a scalar that extend() cannot iterate.
            val_true.extend(y.reshape(-1).cpu().tolist())

    return accuracy_score(val_true, val_pred)


# Inference only: the loader carries no labels, so just the predictions are returned.
def predict(model, data_loader, device):
    """Return the predicted class index for every sample in ``data_loader``.

    Args:
        model: callable as ``model(ids, att)`` returning per-class scores
            with the class dimension at index 1.
        data_loader: iterable yielding ``(ids, att)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        A flat list of predicted class indices, in loader order.
    """
    model.eval()
    val_pred = []
    with torch.no_grad():
        # Wrap the loader itself rather than enumerate(loader): tqdm can then
        # read its length and display a total and an ETA.
        for ids, att in tqdm(data_loader):
            logits = model(ids.to(device), att.to(device))
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return val_pred


def train_and_eval(model, train_loader, valid_loader, 
                   optimizer, scheduler, device, epoch,
                   save_path="best_bert_model.pth"):
    """Train for ``epoch`` epochs, validating and checkpointing after each one.

    Args:
        model: network called as ``model(ids, att)``; trained in place.
        train_loader: loader yielding ``(ids, att, y)`` training batches.
        valid_loader: loader passed to ``evaluate`` after every epoch.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device every batch is moved to.
        epoch: number of epochs to run.
        save_path: file that receives the state dict whenever the validation
            accuracy improves (defaults to the original hard-coded path).
    """
    # Start below any possible accuracy so the first epoch always writes a checkpoint.
    best_acc = float("-inf")
    criterion = nn.CrossEntropyLoss()
    # Log about five times per epoch; max(1, ...) avoids a modulo-by-zero
    # when the loader has fewer than five batches.
    log_every = max(1, len(train_loader) // 5)
    for i in range(epoch):
        # ---- training ----
        start = time.time()
        model.train()
        print("***** Running training epoch {} *****".format(i+1))
        train_loss_sum = 0.0
        for idx, (ids, att, y) in enumerate(train_loader):
            ids, att, y = ids.to(device), att.to(device), y.to(device)
            y_pred = model(ids, att)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()   # update the learning rate after every batch

            train_loss_sum += loss.item()
            if (idx + 1) % log_every == 0:
                # The reported loss is the running mean over the epoch so far.
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f}".format(
                          i+1, idx+1, len(train_loader), train_loss_sum/(idx+1), time.time() - start))

        # ---- validation ----
        model.eval()
        acc = evaluate(model, valid_loader, device)
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), save_path)

        print("current acc is {:.4f}, best acc is {:.4f}".format(acc, best_acc))
        print("time costed = {}s \n".format(round(time.time() - start, 5)))

# 10. 开始训练和验证模型

In [55]:
# Fine-tune for EPOCHS epochs, validating after each one.
train_and_eval(model=model, train_loader=train_loader, valid_loader=valid_loader,
               optimizer=optimizer, scheduler=scheduler, device=DEVICE, epoch=EPOCHS)

***** Running training epoch 1 *****
Epoch 0001 | Step 0017/0087 | Loss 0.5764 | Time 3.0348
Epoch 0001 | Step 0034/0087 | Loss 0.4900 | Time 6.0399
Epoch 0001 | Step 0051/0087 | Loss 0.4443 | Time 9.0379
Epoch 0001 | Step 0068/0087 | Loss 0.4073 | Time 12.0426
Epoch 0001 | Step 0085/0087 | Loss 0.3860 | Time 15.0543
current acc is 0.8594, best acc is 0.8594
time costed = 17.07684s 

***** Running training epoch 2 *****
Epoch 0002 | Step 0017/0087 | Loss 0.2220 | Time 3.0130
Epoch 0002 | Step 0034/0087 | Loss 0.2146 | Time 6.0488
Epoch 0002 | Step 0051/0087 | Loss 0.2033 | Time 9.0883
Epoch 0002 | Step 0068/0087 | Loss 0.1980 | Time 12.1371
Epoch 0002 | Step 0085/0087 | Loss 0.1969 | Time 15.1804
current acc is 0.8812, best acc is 0.8812
time costed = 17.175s 

***** Running training epoch 3 *****
Epoch 0003 | Step 0017/0087 | Loss 0.1079 | Time 3.0570
Epoch 0003 | Step 0034/0087 | Loss 0.1083 | Time 6.1206
Epoch 0003 | Step 0051/0087 | Loss 0.1043 | Time 9.1950
Epoch 0003 | Step 0068/

# 11. 加载最优模型测试

In [56]:
# Restore the best checkpoint and score it on the held-out test split.
# map_location remaps the saved tensors onto DEVICE, so loading also works
# when the checkpoint was written from a device that is not available now.
model.load_state_dict(torch.load("best_bert_model.pth", map_location=DEVICE))
pred_test = predict(model, test_loader, DEVICE)
print("\n Test Accuracy = {} \n".format(accuracy_score(y_test, pred_test)))
print(classification_report(y_test, pred_test, digits=4))

11it [00:00, 16.65it/s]


 Test Accuracy = 0.8818443804034583 

              precision    recall  f1-score   support

           0     0.8728    0.8882    0.8805       340
           1     0.8908    0.8757    0.8832       354

    accuracy                         0.8818       694
   macro avg     0.8818    0.8820    0.8818       694
weighted avg     0.8820    0.8818    0.8819       694




