## 1. 准备工作

### 1.1 环境准备


In [1]:
#关闭安装的输出
# %%capture
# !pip install transformers
# !pip install datasets
# !pip install evaluate

### 1.2 函数库与gpu使用

In [2]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from transformers.trainer_utils import EvalPrediction
import torchtext
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# import evaluate

# 使用gpu进行训练
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
# gpu型号
!nvidia-smi


Using cuda device
Wed Mar 15 16:46:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------

## 2. 数据预处理

### 2.1 数据集导入

In [3]:

class Dataset(torch.utils.data.Dataset):
  def __init__(self, filepath):
    news_df = pd.read_csv(filepath, encoding='utf-8')
    texts = news_df.loc[:,'text'].values
    labels = news_df.loc[:,'class'].values
    
    le = LabelEncoder()
    
    self.X = texts
    self.y = le.fit_transform(labels)
  def __len__(self):
    return len(self.y)
  
  def __getitem__(self, i):
    text = self.X[i]
    label = self.y[i]
    return text, label


# train_df = pd.read_csv('/kaggle/input/thucnews-subset-dataset/pr_train.csv',encoding='utf-8')
# val_df = pd.read_csv('/kaggle/input/thucnews-subset-dataset/pr_val.csv',encoding='utf-8')
# test_df = pd.read_csv('/kaggle/input/thucnews-subset-dataset/pr_test.csv',encoding='utf-8')
# train_df.head()

train_data = Dataset('/kaggle/input/thucnews-subset-dataset/pr_train.csv')
val_data = Dataset('/kaggle/input/thucnews-subset-dataset/pr_val.csv')
test_data = Dataset('/kaggle/input/thucnews-subset-dataset/pr_test.csv')
train_data.__getitem__(0)

('马晓旭 意外 受伤 国奥 警惕 无奈 大雨 格外 青睐 殷家 军 记者 傅亚雨 沈阳 报道 来到 沈阳 国奥队 依然 摆脱 雨水 困扰 月 日 下午 国奥队 日常 训练 再度 大雨 干扰 无奈 之下 队员 慢跑 分钟 草草收场 日 上午 国奥队 奥体中心 外场 训练 阴沉沉 气象预报 显示 当天 下午 沈阳 大雨 幸好 队伍 上午 训练 干扰 下午 点当 球队 抵达 训练场 大雨 几个 小时 丝毫 停下来 抱 试一试 态度 球队 当天 下午 例行 训练 分钟 天气 转好 迹象 保护 球员 国奥队 中止 当天 训练 全队 返回 酒店 雨 训练 足球队 稀罕 奥运会 即将 全队 变得 娇贵 沈阳 一周 训练 国奥队 保证 现有 球员 不再 出现意外 伤病 情况 影响 正式 比赛 这一 阶段 控制 训练 受伤 控制 感冒 疾病 队伍 放在 位置 抵达 沈阳 后卫 冯萧霆 训练 冯萧霆 月 日 长春 患上 感冒 参加 日 塞尔维亚 热身赛 队伍 介绍 冯萧霆 发烧 症状 两天 静养 休息 感冒 恢复 训练 冯萧霆 例子 国奥队 对雨中 训练 显得 谨慎 担心 球员 受凉 引发 感冒 非战斗 减员 女足 队员 马晓旭 热身赛 受伤 导致 无缘 奥运 前科 沈阳 国奥队 格外 警惕 训练 嘱咐 队员 动作 再出 事情 一位 工作人员 长春 沈阳 雨水 一路 伴随 国奥队 邪 走 雨 长春 几次 训练 都 大雨 搅和 没想到 沈阳 碰到 事情 一位 国奥 球员 雨水 青睐 不解',
 0)

### 2.2 分词和编码

In [4]:
mengzi_token = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base-fin")

bert_bc_token = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False
)

#获取字典
#token_dict = tokenizer.get_vocab()
#type(token_dict), len(token_dict), '月光' in token_dict
#添加新词
#tokenizer.add_token(new_tokens=['月光','希望'])
#添加新符号
#tokenizer.add_special_tokens({'eos_token':'[EOS]'})

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

### 2.3 定义批处理函数

In [5]:
def bc_collate_fn(data):
  sents = [i[0] for i in data]
  labels = [i[1] for i in data]
  #编码
  data = bert_bc_token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=512,
                                 return_tensors='pt',
                                 return_length=True)
  #input_ids:编码之后的数字
  #attention_mask:补零的位置三0，其他位置是1
  input_ids = data['input_ids'].to(device)
  attention_mask = data['attention_mask'].to(device)
  token_type_ids = data['token_type_ids'].to(device)
  labels = torch.LongTensor(labels).to(device)

  return input_ids, attention_mask, token_type_ids, labels


def mengzi_collate_fn(data):
  sents = [i[0] for i in data]
  labels = [i[1] for i in data]
  #编码
  data = mengzi_token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=512,
                                 return_tensors='pt',
                                 return_length=True)
  
  #input_ids:编码之后的数字
  #attention_mask:补零的位置三0，其他位置是1
  input_ids = data['input_ids'].to(device)
  attention_mask = data['attention_mask'].to(device)
  token_type_ids = data['token_type_ids'].to(device)
  labels = torch.LongTensor(labels).to(device)

  return input_ids, attention_mask, token_type_ids, labels

#导入数据，这一步合并到trainner中了
train_loader_bertcn = DataLoader(dataset=train_data,
                    batch_size=128,
                    collate_fn=bc_collate_fn,
                    shuffle=True,
                    drop_last=True)

train_loader_mengzi = DataLoader(dataset=train_data,
                    batch_size=128,
                    collate_fn=mengzi_collate_fn,
                    shuffle=True,
                    drop_last=True)

val_loader = DataLoader(dataset=val_data,
                           batch_size=128,
                           collate_fn=mengzi_collate_fn,
                           shuffle=True,
                           drop_last=True)

test_loader = DataLoader(dataset=test_data,
                           batch_size=128,
                           collate_fn=mengzi_collate_fn,
                           shuffle=True,
                           drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader_mengzi):
  break
print(len(train_loader_mengzi))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape


# for batch in train_dataloader:
#   batch = {k: v.to(device) for k, v in batch.items()}
#   outputs = model(**batch)

390


(torch.Size([128, 512]),
 torch.Size([128, 512]),
 torch.Size([128, 512]),
 torch.Size([128]))

## 3. 模型


### 3.1 加载预训练模型

In [6]:
#加载预训练模型
bert_bc_pretrained = BertModel.from_pretrained("bert-base-chinese").to(device)
mengzi_pretrained = BertModel.from_pretrained("Langboat/mengzi-bert-base-fin").to(device)

#不使用finetuning，直接冻结预训练模型的参数
for param in mengzi_pretrained.parameters():
  param.requires_grad_(False)

#模型试算
#out = bert_bc_pretrained(input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 token_type_ids=token_type_ids)

#out.last_hidden_state.shape

print('param_num: ' + str(sum([i.nelement() for i in mengzi_pretrained.parameters()]) / 10000))

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/414M [00:00<?, ?B/s]

Some weights of the model checkpoint at Langboat/mengzi-bert-base-fin were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'pos_transform.LayerNorm.weight', 'pos_transform.dense.bias', 'cls.predictions.decoder.weight', 'sop.cls.weight', 'cls.predictions.transform.dense.weight', 'pos_head.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'pos_head.bias', 'pos_transform.dense.weight', 'cls.predictions.bias', 'sop.cls.bias', 'cls.predictions.transform.LayerNorm.weight', 'pos_transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

param_num: 10226.7648


### 3.2 定义下游任务模型

In [7]:
class Downstream_Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc = nn.Linear(768,10)
  
  def forward(self, input_ids, attention_mask, token_type_ids):
    with torch.no_grad():
      out = mengzi_pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
      
    out = self.fc(out.last_hidden_state[:, 0])

    out = out.softmax(dim=1)

    return out

model = Downstream_Model().to(device)

## 4. 训练下游任务模型

In [8]:
#定义优化器、损失函数、评价指标
#optimizer = AdamW(model.parameters(), lr=5e-4)
#loss_fn = nn.CrossEntropyLoss()
#metric = evaluate.load('accuracy')

# #初始化训练参数
# args = TrainingArguments(output_dir='./output_dir',
#                          overwrite_output_dir = False,
#                          evaluation_strategy='epoch',
#                          num_train_epochs = 10,
#                          learning_rate = 1e-4, #优化器默认为AdamW
#                          adam_beta1 = 0.9,
#                          adam_beta2 = 0.999,
#                          adam_epsilon = 1e-8,
#                          weight_decay = 1e-2, #各层的权重衰减
#                          max_grad_norm = 1.0, #梯度裁剪
#                          per_device_eval_batch_size = 64,
#                          per_device_train_batch_size = 64,
#                          lr_scheduler_type = 'linear',
#                          save_strategy = 'epoch',
#                          no_cuda = False,
#                          seed = 1024,
#                          data_seed = 1024,
#                          load_best_model_at_end = False,
#                          metric_for_best_model = 'loss',
#                          greater_is_better = False
#                          )

# #初始化训练器                         
# trainer = Trainer(
#     model = model,
#     args = args,
#     data_collator = mengzi_collate_fn, #构建batch
#     train_dataset = train_data,
#     eval_dataset = val_data,
#     compute_metrics = metric,
#     tokenizer = mengzi_token
#     #callbacks = 
#     #optimizers = 
# )

# trainer.train()

def train_func(model, loss_fn, optimizer, n_epochs, train_loader, var_loader=None):
    for t in range(n_epochs):
        print(f"Epoch {t+1}\n-------------------------------")

        model.train()
        size = len(train_loader.dataset)
        for batch, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
            out = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)

            loss = loss_fn(out, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 50 == 0:
                loss, current = loss.item(), (batch+1) * len(input_ids)
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        if var_loader != None:
            test_func(dataloader=val_loader, model=model, loss_fn=loss_fn, tp = 'validation')
    print("\n training_finished")
    

def test_func(dataloader, model, loss_fn, tp='validation'):
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  with torch.no_grad(): #停止梯度计算
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(dataloader):
      pred = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)
      test_loss += loss_fn(pred, labels).item()
      correct += (pred.argmax(1) == labels).type(torch.float).sum().item()
  
  test_loss /= num_batches
  correct /= size

  if tp == 'test':
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
  elif tp == 'validation':
    print(f"Val Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [9]:
# for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader_mengzi):
#   pred = model(input_ids=input_ids,
#                   attention_mask=attention_mask,
#                   token_type_ids=token_type_ids)
#   loss = nn.CrossEntropyLoss(pred, labels)
#   break

In [10]:
train_func(train_loader = train_loader_mengzi,
           var_loader= val_loader, 
           model = model, 
           loss_fn = nn.CrossEntropyLoss(), 
           optimizer = AdamW(model.parameters(), lr=5e-4), 
           n_epochs=20)

Epoch 1
-------------------------------
loss: 2.299615  [  128/50000]
loss: 2.212399  [ 6528/50000]
loss: 2.162353  [12928/50000]
loss: 2.169923  [19328/50000]
loss: 2.088394  [25728/50000]
loss: 2.063717  [32128/50000]
loss: 2.087774  [38528/50000]
loss: 1.997293  [44928/50000]
Val Error: 
 Accuracy: 60.7%, Avg loss: 1.984406 

Epoch 2
-------------------------------
loss: 2.025533  [  128/50000]
loss: 1.942805  [ 6528/50000]
loss: 1.881027  [12928/50000]
loss: 1.927590  [19328/50000]
loss: 1.914324  [25728/50000]
loss: 1.850683  [32128/50000]
loss: 1.861994  [38528/50000]
loss: 1.872411  [44928/50000]
Val Error: 
 Accuracy: 67.1%, Avg loss: 1.885094 

Epoch 3
-------------------------------
loss: 1.838398  [  128/50000]
loss: 1.862510  [ 6528/50000]
loss: 1.776364  [12928/50000]
loss: 1.821004  [19328/50000]
loss: 1.785921  [25728/50000]
loss: 1.736787  [32128/50000]
loss: 1.800347  [38528/50000]
loss: 1.777494  [44928/50000]
Val Error: 
 Accuracy: 70.8%, Avg loss: 1.839214 

Epoch 4

## 5. 测试模型效果

In [11]:
test_func(dataloader=test_loader, 
          model=model, 
          loss_fn = nn.CrossEntropyLoss(), 
          tp='test')

Test Error: 
 Accuracy: 86.7%, Avg loss: 1.620288 

