<a href="https://colab.research.google.com/github/Virgil-L/llm-finetune-practice/blob/main/Finetune%E5%AE%9E%E6%88%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. 准备工作

### 1.1 环境准备


In [1]:
#关闭安装的输出
%%capture 
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install torchinfo


### 1.2 函数库与gpu使用

In [2]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from transformers.trainer_utils import EvalPrediction
import evaluate

# 使用gpu进行训练
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
# gpu型号
!nvidia-smi


Using cpu device
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



### 1.3 Google Drive 连接

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. 数据预处理

### 2.1 数据集导入

In [6]:

class Dataset(torch.utils.data.Dataset):
  """Map-style dataset that yields ``(text, label)`` pairs from one split.

  Args:
    split: Name of the split, forwarded to ``load_dataset``.
    filepath: Dataset path or identifier, forwarded to ``load_dataset`` as ``path``.
  """

  def __init__(self, split, filepath):
    self.dataset = load_dataset(path=filepath, split=split)

  def __len__(self):
    """Return the number of rows in the loaded split."""
    return len(self.dataset)

  def __getitem__(self, i):
    """Return the ``(text, label)`` pair stored at row ``i``."""
    # Fetch the row once; indexing the underlying dataset separately for each
    # field would look up the same record twice.
    row = self.dataset[i]
    return row['text'], row['label']


# Load the train / validation / test splits of ChnSentiCorp, in that order.
train_data, val_data, test_data = (
    Dataset(split=split_name, filepath='seamew/ChnSentiCorp')
    for split_name in ('train', 'validation', 'test')
)



### 2.2 分词和编码

In [4]:
# Tokenizer loaded from the Langboat/mengzi-bert-base-fin checkpoint.
mengzi_token = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base-fin")

# Tokenizer loaded from the bert-base-chinese checkpoint.
bert_bc_token = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False
)

# Inspect the vocabulary:
#token_dict = tokenizer.get_vocab()
#type(token_dict), len(token_dict), '月光' in token_dict
# Add new words:
#tokenizer.add_tokens(new_tokens=['月光','希望'])
# Add a new special token:
#tokenizer.add_special_tokens({'eos_token':'[EOS]'})

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

### 2.3 定义批处理函数

In [36]:
def _encode_batch(tokenizer, batch):
  """Tokenize a batch of ``(text, label)`` samples and move the tensors to ``device``.

  Shared implementation behind ``bc_collate_fn`` and ``mengzi_collate_fn``;
  the two differ only in which tokenizer they use.

  Args:
    tokenizer: Tokenizer object exposing ``batch_encode_plus``.
    batch: Sequence of ``(text, label)`` pairs as passed to a collate function.

  Returns:
    Tuple ``(input_ids, attention_mask, token_type_ids, labels)`` of tensors
    placed on the module-level ``device``.
  """
  sents = [sample[0] for sample in batch]
  labels = [sample[1] for sample in batch]
  # Truncate / pad every sentence to exactly 512 tokens so all batches share one shape.
  encoded = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                        truncation=True,
                                        padding='max_length',
                                        max_length=512,
                                        return_tensors='pt')
  # input_ids: the token ids produced by the tokenizer
  # attention_mask: 0 at padded positions, 1 everywhere else
  input_ids = encoded['input_ids'].to(device)
  attention_mask = encoded['attention_mask'].to(device)
  token_type_ids = encoded['token_type_ids'].to(device)
  labels = torch.tensor(labels, dtype=torch.long).to(device)

  return input_ids, attention_mask, token_type_ids, labels


def bc_collate_fn(data):
  """Collate ``(text, label)`` samples using the bert-base-chinese tokenizer."""
  return _encode_batch(bert_bc_token, data)


def mengzi_collate_fn(data):
  """Collate ``(text, label)`` samples using the Mengzi tokenizer."""
  return _encode_batch(mengzi_token, data)

# Build the data loaders (this step was originally meant to be folded into the Trainer).
# Training loaders shuffle and drop the last partial batch so every step sees 64 samples.
train_loader_bertcn = DataLoader(dataset=train_data,
                    batch_size=64,
                    collate_fn=bc_collate_fn,
                    shuffle=True,
                    drop_last=True)

train_loader_mengzi = DataLoader(dataset=train_data,
                    batch_size=64,
                    collate_fn=mengzi_collate_fn,
                    shuffle=True,
                    drop_last=True)

# Evaluation loaders must score every sample exactly once: no shuffling is
# needed, and dropping the last partial batch would leave samples unscored.
val_loader = DataLoader(dataset=val_data,
                           batch_size=64,
                           collate_fn=mengzi_collate_fn,
                           shuffle=False,
                           drop_last=False)

test_loader = DataLoader(dataset=test_data,
                           batch_size=64,
                           collate_fn=mengzi_collate_fn,
                           shuffle=False,
                           drop_last=False)

# Pull a single batch to sanity-check the tensor shapes.
input_ids, attention_mask, token_type_ids, labels = next(iter(train_loader_mengzi))
print(len(train_loader_mengzi))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape


# for batch in train_dataloader:
#   batch = {k: v.to(device) for k, v in batch.items()}
#   outputs = model(**batch)

150


(torch.Size([64, 512]),
 torch.Size([64, 512]),
 torch.Size([64, 512]),
 torch.Size([64]))

## 3. 模型


### 3.1 加载预训练模型

In [37]:
# Load both pretrained encoders and place them on the selected device.
bert_bc_pretrained = BertModel.from_pretrained("bert-base-chinese").to(device)
mengzi_pretrained = BertModel.from_pretrained("Langboat/mengzi-bert-base-fin").to(device)

# No fine-tuning of the encoder: freeze every bert-base-chinese parameter.
bert_bc_pretrained.requires_grad_(False)

# Trial forward pass:
#out = bert_bc_pretrained(input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 token_type_ids=token_type_ids)

#out.last_hidden_state.shape

# Parameter count of the Mengzi encoder, reported in units of 10,000.
total_params = sum(p.nelement() for p in mengzi_pretrained.parameters())
print(f'param_num: {total_params / 10000}')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at Langboat/mengzi-bert-base-fin were not used when initializing BertModel: ['sop.cls.weight', 'sop.cls.bias', 'pos_tr

param_num: 10226.7648


### 3.2 定义下游任务模型

In [38]:
class Downstream_Model(nn.Module):
  """Linear classification head on top of the module-level ``bert_bc_pretrained`` encoder.

  ``forward`` returns raw, unnormalized class scores (logits) with one row per
  input sequence and two columns. They are intentionally NOT passed through
  softmax: ``nn.CrossEntropyLoss`` expects unnormalized logits and applies
  log-softmax internally, so a softmax here would be applied twice and
  flatten the gradients. ``argmax`` over the logits gives the same predicted
  class as ``argmax`` over the probabilities; call ``.softmax(dim=1)`` on the
  output if probabilities are needed.
  """

  def __init__(self):
    super().__init__()
    # 768 input features per sequence -> 2 output classes.
    self.fc = nn.Linear(768,2)

  def forward(self, input_ids, attention_mask, token_type_ids):
    """Encode the batch without gradients and score the first-token representation."""
    # The encoder is only a feature extractor here, so no gradients are tracked for it.
    with torch.no_grad():
      out = bert_bc_pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

    # Classify from the hidden state at position 0 of every sequence
    # (the [CLS] position for BERT-style inputs).
    return self.fc(out.last_hidden_state[:, 0])

# Instantiate the classification head and place it on the selected device.
model = Downstream_Model().to(device)

## 4. 训练下游任务模型

In [39]:
#定义优化器、损失函数、评价指标
#optimizer = AdamW(model.parameters(), lr=5e-4)
#loss_fn = nn.CrossEntropyLoss()
#metric = evaluate.load('accuracy')

# #初始化训练参数
# args = TrainingArguments(output_dir='./output_dir',
#                          overwrite_output_dir = False,
#                          evaluation_strategy='epoch',
#                          num_train_epochs = 10,
#                          learning_rate = 1e-4, #优化器默认为AdamW
#                          adam_beta1 = 0.9,
#                          adam_beta2 = 0.999,
#                          adam_epsilon = 1e-8,
#                          weight_decay = 1e-2, #各层的权重衰减
#                          max_grad_norm = 1.0, #梯度裁剪
#                          per_device_eval_batch_size = 64,
#                          per_device_train_batch_size = 64,
#                          lr_scheduler_type = 'linear',
#                          save_strategy = 'epoch',
#                          no_cuda = False,
#                          seed = 1024,
#                          data_seed = 1024,
#                          load_best_model_at_end = False,
#                          metric_for_best_model = 'loss',
#                          greater_is_better = False
#                          )

# #初始化训练器                         
# trainer = Trainer(
#     model = model,
#     args = args,
#     data_collator = mengzi_collate_fn, #构建batch
#     train_dataset = train_data,
#     eval_dataset = val_data,
#     compute_metrics = metric,
#     tokenizer = mengzi_token
#     #callbacks = 
#     #optimizers = 
# )

# trainer.train()

def train_func(model, loss_fn, optimizer, n_epochs, train_loader, var_loader=None):
  """Train ``model`` for ``n_epochs`` epochs, optionally validating afterwards.

  Args:
    model: Callable taking ``input_ids``, ``attention_mask`` and
      ``token_type_ids`` keyword arguments and returning class scores.
    loss_fn: Callable ``loss_fn(scores, labels)`` returning a scalar loss tensor.
    optimizer: Optimizer updating the trainable parameters of ``model``.
    n_epochs: Number of passes over ``train_loader``.
    train_loader: Loader yielding ``(input_ids, attention_mask,
      token_type_ids, labels)`` batches.
    var_loader: Optional validation loader. When given, ``test_func`` is run
      on it once, after the last epoch.
  """
  size = len(train_loader.dataset)
  for t in range(n_epochs):
    print(f"Epoch {t+1}\n-------------------------------")

    model.train()
    for batch, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
      out = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)

      loss = loss_fn(out, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Report progress every 10 batches.
      if batch % 10 == 0:
        loss_value, current = loss.item(), (batch+1) * len(input_ids)
        print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
  # Validate on the loader that was passed in, not on a same-named global.
  if var_loader is not None:
    test_func(dataloader=var_loader, model=model, loss_fn=loss_fn, tp = 'validation')
  print("\n training_finished")
    

def test_func(dataloader, model, loss_fn, tp='validation'):
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  with torch.no_grad(): #停止梯度计算
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(dataloader):
      pred = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)
      test_loss += loss_fn(pred, labels).item()
      correct += (pred.argmax(1) == labels).type(torch.float).sum().item()
  
  test_loss /= num_batches
  correct /= size

  if tp == 'test':
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
  elif tp == 'validation':
    print(f"Val Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [None]:
# for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader_mengzi):
#   pred = model(input_ids=input_ids,
#                   attention_mask=attention_mask,
#                   token_type_ids=token_type_ids)
#   loss = nn.CrossEntropyLoss(pred, labels)
#   break

In [None]:
# Train for 10 epochs with AdamW over model.parameters(); no validation loader is passed.
# NOTE(review): the batches come from train_loader_mengzi (Mengzi tokenizer) while
# Downstream_Model runs the bert-base-chinese encoder — confirm the two vocabularies
# match, or switch to train_loader_bertcn.
train_func(train_loader = train_loader_mengzi,
           var_loader=None, 
           model = model, 
           loss_fn = nn.CrossEntropyLoss(), 
           optimizer = AdamW(model.parameters(), lr=5e-4), 
           n_epochs=10)

Epoch 1
-------------------------------
loss: 0.705030  [    0/ 9600]
loss: 0.640783  [  640/ 9600]


## 5. 测试模型效果

In [None]:
# Report accuracy and average cross-entropy loss on the test loader.
test_func(dataloader=test_loader, 
          model=model, 
          loss_fn = nn.CrossEntropyLoss(), 
          tp='test')