<a href="https://colab.research.google.com/github/Virgil-L/llm-finetune-practice/blob/main/BBT_FinT5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. 准备工作

### 1.1 环境准备


In [None]:
#关闭安装的输出
%%capture 
!pip install transformers
# !pip install evaluate
# !pip install torchinfo


### 1.2 函数库与gpu使用

In [None]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer
from transformers.trainer_utils import EvalPrediction
# import evaluate

# 使用gpu进行训练
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
# gpu型号
!nvidia-smi


Using cpu device
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



### 1.3 goole drive连接

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os 
path="/content/drive/My Drive"
os.chdir(path)
os.listdir(path)

['Colab Notebooks', 'data', 'model']

## 2. 数据预处理

### 2.1 数据集导入

In [13]:
import pandas as pd
finzhidao_df = pd.read_csv('./data/百度知道问答数据/financezhidao_filter.csv',encoding='utf-8')
# 问题为空时用标题作为问题
finzhidao_df.loc[(finzhidao_df['question'].isnull()),'question']=finzhidao_df.loc[(finzhidao_df['question'].isnull()),'title']
# 删去标题
finzhidao_df = finzhidao_df.drop(columns='title')
print(finzhidao_df['is_best'].describe())
finzhidao_df.head()

count    768427.000000
mean          0.457889
std           0.498224
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: is_best, dtype: float64


Unnamed: 0,question,reply,is_best
0,壹方水榭可以用深*的公积金贷款吗，壹方水榭二套房,由于各城市相关政策有所不同，具体您所在当地是否有开展公积金贷款业务，以及相关业务规定，您可以...,0
1,了多久能放款，为什么迟迟不放款,我行贷款如您提交申请资料齐全审批大约15个工作日左右，因为中间涉及评估、办理抵押登记等环节，...,0
2,快速借款逾期有多严重,若是我行贷款，如有消费，请您按时、足额在到期还款日之前还款。首先逾期会生成不良信用记录，影响...,0
3,转入多久到账，能查询进度吗,若是招行转出，网上银行/手机银行转账汇款到账时间:同行转账:无论同城或异地，转入个人账户实时...,0
4,悦借钱骗人的,涉及利益都不太好,0


In [None]:


# class Dataset(torch.utils.data.Dataset):
#   def __init__(self, df):
#     question = df['question'].values
#     reply = df['reply'].values

#   def __len__(self):
#     return len(self.dataset)
  
#   def __getitem__(self, i):
#     text = 
#     label = 

#     return text, label


# # train_data = Dataset(filepath = 'seamew/ChnSentiCorp',split = 'train')
# # val_data = Dataset(filepath = 'seamew/ChnSentiCorp',split = 'validation')
# # test_data = Dataset(filepath = 'seamew/ChnSentiCorp',split = 'test')

Downloading builder script:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading and preparing dataset chn_senti_corp/default to /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85...


Downloading data:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/371k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset chn_senti_corp downloaded and prepared to /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85. Subsequent calls will reuse this data.




### 2.2 分词和编码

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='./model/FinMT5_base',
    cache_dir=None,
    force_download=False
)

#获取字典
#token_dict = tokenizer.get_vocab()
#type(token_dict), len(token_dict), '月光' in token_dict
#添加新词
#tokenizer.add_token(new_tokens=['月光','希望'])
#添加新符号
#tokenizer.add_special_tokens({'eos_token':'[EOS]'})

In [None]:
sents = '三月份招行信用卡还款逾期了，会产生多少费用？对以后开户有影响吗?'
token = tokenizer.encode(sents,
                         return_tensors='pt',
                         return_length=True)
token

tensor([[ 101,  676, 3299,  819, 2875, 6121,  928, 4500, 1305, 6820, 3621, 6874,
         3309,  749, 8024,  833,  772, 4495, 1914, 2208, 6589, 4500, 8043, 2190,
          809, 1400, 2458, 2787, 3300, 2512, 1510, 1408,  136,  102]])

### 2.3 定义批处理函数

In [None]:
def collate_fn(data):
  sents = [i[0] for i in data]
  labels = [i[1] for i in data]
  #编码
  data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=512,
                                 return_tensors='pt',
                                 return_length=True)
  #input_ids:编码之后的数字
  #attention_mask:补零的位置三0，其他位置是1
  input_ids = data['input_ids'].to(device)
  attention_mask = data['attention_mask'].to(device)
  token_type_ids = data['token_type_ids'].to(device)
  labels = torch.LongTensor(labels).to(device)

  return input_ids, attention_mask, token_type_ids, labels


#导入数据，这一步合并到trainner中了


train_loader = DataLoader(dataset=train_data,
                    batch_size=64,
                    collate_fn=collate_fn,
                    shuffle=True,
                    drop_last=True)

val_loader = DataLoader(dataset=val_data,
                           batch_size=64,
                           collate_fn=collate_fn,
                           shuffle=True,
                           drop_last=True)

test_loader = DataLoader(dataset=test_data,
                           batch_size=64,
                           collate_fn=collate_fn,
                           shuffle=True,
                           drop_last=True)
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
  break
print(len(train_loader_mengzi))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape


# for batch in train_dataloader:
#   batch = {k: v.to(device) for k, v in batch.items()}
#   outputs = model(**batch)

150


(torch.Size([64, 512]),
 torch.Size([64, 512]),
 torch.Size([64, 512]),
 torch.Size([64]))

## 3. 模型


### 3.1 加载预训练模型

In [16]:
#加载预训练模型
pretrained_model = AutoModel.from_pretrained("./model/FinMT5_base").to(device)


#不使用finetuning，直接冻结预训练模型的参数
for param in pretrained_model.parameters():
  param.requires_grad_(False)

#模型试算
#out = bert_bc_pretrained(input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 token_type_ids=token_type_ids)

#out.last_hidden_state.shape

print('param_num: ' + str(sum([i.nelement() for i in pretrained_model.parameters()]) / 10000))

Some weights of the model checkpoint at ./model/FinMT5_base were not used when initializing MT5Model: ['lm_head.weight']
- This IS expected if you are initializing MT5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MT5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


param_num: 21453.2352


In [18]:
input_text = "怎样看待市场、政府调控对经济运行的影响？"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = pretrained_model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

TypeError: ignored

### 3.2 定义下游任务模型

In [None]:
class Downstream_Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc = nn.Linear(768,2) #
  
  def forward(self, input_ids, attention_mask, token_type_ids):
    with torch.no_grad():
      out = pretrained_model(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
      
    out = self.fc(out.last_hidden_state[:, 0])

    out = out.softmax(dim=1)

    return out

model = Downstream_Model().to(device)

## 4. 训练下游任务模型

In [None]:
#定义优化器、损失函数、评价指标
#optimizer = AdamW(model.parameters(), lr=5e-4)
#loss_fn = nn.CrossEntropyLoss()
#metric = evaluate.load('accuracy')

# #初始化训练参数
# args = TrainingArguments(output_dir='./output_dir',
#                          overwrite_output_dir = False,
#                          evaluation_strategy='epoch',
#                          num_train_epochs = 10,
#                          learning_rate = 1e-4, #优化器默认为AdamW
#                          adam_beta1 = 0.9,
#                          adam_beta2 = 0.999,
#                          adam_epsilon = 1e-8,
#                          weight_decay = 1e-2, #各层的权重衰减
#                          max_grad_norm = 1.0, #梯度裁剪
#                          per_device_eval_batch_size = 64,
#                          per_device_train_batch_size = 64,
#                          lr_scheduler_type = 'linear',
#                          save_strategy = 'epoch',
#                          no_cuda = False,
#                          seed = 1024,
#                          data_seed = 1024,
#                          load_best_model_at_end = False,
#                          metric_for_best_model = 'loss',
#                          greater_is_better = False
#                          )

# #初始化训练器                         
# trainer = Trainer(
#     model = model,
#     args = args,
#     data_collator = mengzi_collate_fn, #构建batch
#     train_dataset = train_data,
#     eval_dataset = val_data,
#     compute_metrics = metric,
#     tokenizer = mengzi_token
#     #callbacks = 
#     #optimizers = 
# )

# trainer.train()

def train_func(model, loss_fn, optimizer, n_epochs, train_loader, var_loader=None):
  for t in range(n_epochs):
    print(f"Epoch {t+1}\n-------------------------------")


    model.train()
    size = len(train_loader.dataset)
    for batch, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
      out = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)

      loss = loss_fn(out, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if batch % 10 == 0:
        loss, current = loss.item(), (batch) * len(input_ids)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
  if var_loader != None:
    test_func(dataloader=val_loader, model=model, loss_fn=loss_fn, tp = 'validation')
  print("\n training_finished")
    

def test_func(dataloader, model, loss_fn, tp='validation'):
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  with torch.no_grad(): #停止梯度计算
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(dataloader):
      pred = model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)
      test_loss += loss_fn(pred, labels).item()
      correct += (pred.argmax(1) == labels).type(torch.float).sum().item()
  
  test_loss /= num_batches
  correct /= size

  if tp == 'test':
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
  elif tp == 'validation':
    print(f"Val Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [None]:
# for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader_mengzi):
#   pred = model(input_ids=input_ids,
#                   attention_mask=attention_mask,
#                   token_type_ids=token_type_ids)
#   loss = nn.CrossEntropyLoss(pred, labels)
#   break

In [None]:
train_func(train_loader = train_loader_mengzi,
           var_loader=None, 
           model = model, 
           loss_fn = nn.CrossEntropyLoss(), 
           optimizer = AdamW(model.parameters(), lr=5e-4), 
           n_epochs=10)

Epoch 1
-------------------------------
loss: 0.705030  [    0/ 9600]


## 5. 测试模型效果

In [None]:
test_func(dataloader=test_loader, 
          model=model, 
          loss_fn = nn.CrossEntropyLoss(), 
          tp='test')