In [None]:
import sys
import torch
sys.version

# 0. 基本常用Snippets

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Check if MPS (Metal Performance Shaders) is available and use it if possible
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [29]:
from torch.utils.data import DataLoader

def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

train_dataset = dataset["train"].map(preprocess_function, batched=True)
test_dataset = dataset["validation"].map(preprocess_function, batched=True)

def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)


# 1. LoRA(CLS任务) - 基本BERT模型测试

In [None]:
from datasets import load_dataset

dataset = load_dataset("imdb")

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
# 创建较小数据集进行微调
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [4]:
from peft import LoraConfig, TaskType

# task_Type: 指定模型将进行微调的任务类型
# r: 表示 A 和 B 的尺寸
# lora_alpha: 比例因子，确定“A”和“B”中的权重相对于模型原始参数的相对显著性
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)

In [7]:
from transformers import BertForSequenceClassification

# 检查是否可以使用 MPS 设备
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS device")
else:
    device = torch.device('cpu')
    print("Using CPU")
    
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', 
    num_labels=2
).to(device)

# 计算可训练参数数量
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'原始大模型的可训练参数数量: {trainable_params}')

Using MPS device


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


原始大模型的可训练参数数量: 108311810


In [8]:
# 将A、B矩阵插入大模型
from peft import get_peft_model
model = get_peft_model(model, lora_config).to(device)

# 计算可训练参数数量
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'引入LoRA后大模型的可训练参数数量为: {trainable_params}')

引入LoRA后大模型的可训练参数数量为: 38402


In [46]:
# mac用不了Accelerate(至少我没找到解决方法)
# 频繁报错ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

# import numpy as np
# import evaluate

# metric = evaluate.load("accuracy")

# # 将预测转换成对数
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# from transformers import TrainingArguments, Trainer

# # TrainingArguments可用于自定义的综合超参数，以及用于激活不同训练配置的切换选项
# training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",
#                                  num_train_epochs=25,)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset,
#     compute_metrics=compute_metrics,
# )

Using the latest cached version of the module from /Users/euan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Fri Apr 12 10:17:38 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [10]:
import numpy as np
import evaluate
from tqdm import tqdm
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch], dim=0)
    attention_mask = torch.stack([item["attention_mask"] for item in batch], dim=0)
    label = torch.tensor([item["label"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": label}
metric = evaluate.load("accuracy")

# 将预测转换成对数
def compute_metrics(predictions, labels):
    return metric.compute(predictions=predictions, references=labels)

from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

# 检查模型是否在正确的设备上
print("Model device:", next(model.parameters()).device)

# 定义优化器
optimizer = AdamW(model.parameters(), lr=2e-5)

# 设置训练超参数
num_epochs = 25
batch_size = 8

# 创建 TensorDataset
train_input_ids = torch.stack([torch.tensor(t) for t in small_train_dataset["input_ids"]], dim=0).to(device)
train_attention_mask = torch.stack([torch.tensor(t) for t in small_train_dataset["attention_mask"]], dim=0).to(device)
train_labels = torch.tensor(small_train_dataset["label"], dtype=torch.long).to(device)

eval_input_ids = torch.stack([torch.tensor(t) for t in small_eval_dataset["input_ids"]], dim=0).to(device)
eval_attention_mask = torch.stack([torch.tensor(t) for t in small_eval_dataset["attention_mask"]], dim=0).to(device)
eval_labels = torch.tensor(small_eval_dataset["label"], dtype=torch.long).to(device)

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
eval_dataset = TensorDataset(eval_input_ids, eval_attention_mask, eval_labels)

# 创建 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

# 训练模型
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_samples = 0
    
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    for batch in train_progress_bar:
        optimizer.zero_grad()
        
        input_ids, attention_mask, label = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)
        
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = output.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_samples += input_ids.size(0)
        
        train_progress_bar.set_postfix({"Loss": loss.item()})
    
    avg_train_loss = total_loss / total_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}")
    
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch+1}/{num_epochs}", unit="batch"):
            input_ids, attention_mask, label = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)
            
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(output.logits, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
        
        eval_accuracy = correct / total
        print(f"Epoch [{epoch+1}/{num_epochs}], Eval Accuracy: {eval_accuracy:.4f}")


Epoch 11/25:  95%|█████████████████████▉ | 119/125 [01:54<00:05,  1.06batch/s, Loss=0.544][A
Epoch 11/25:  96%|██████████████████████ | 120/125 [01:54<00:04,  1.07batch/s, Loss=0.544][A
Epoch 11/25:  96%|██████████████████████ | 120/125 [01:55<00:04,  1.07batch/s, Loss=0.581][A
Epoch 11/25:  97%|██████████████████████▎| 121/125 [01:55<00:03,  1.07batch/s, Loss=0.581][A
Epoch 11/25:  97%|███████████████████████▏| 121/125 [01:56<00:03,  1.07batch/s, Loss=0.73][A
Epoch 11/25:  98%|███████████████████████▍| 122/125 [01:56<00:02,  1.06batch/s, Loss=0.73][A
Epoch 11/25:  98%|██████████████████████▍| 122/125 [01:57<00:02,  1.06batch/s, Loss=0.619][A
Epoch 11/25:  98%|██████████████████████▋| 123/125 [01:57<00:01,  1.07batch/s, Loss=0.619][A
Epoch 11/25:  98%|███████████████████████▌| 123/125 [01:58<00:01,  1.07batch/s, Loss=0.66][A
Epoch 11/25:  99%|███████████████████████▊| 124/125 [01:58<00:00,  1.07batch/s, Loss=0.66][A
Epoch 11/25:  99%|██████████████████████▊| 124/125 [01:59<0

Epoch [11/25], Train Loss: 0.0781



Evaluating Epoch 11/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 11/25:   1%|▏                         | 1/125 [00:00<00:43,  2.83batch/s][A
Evaluating Epoch 11/25:   2%|▍                         | 2/125 [00:00<00:43,  2.85batch/s][A
Evaluating Epoch 11/25:   2%|▌                         | 3/125 [00:01<00:42,  2.86batch/s][A
Evaluating Epoch 11/25:   3%|▊                         | 4/125 [00:01<00:42,  2.86batch/s][A
Evaluating Epoch 11/25:   4%|█                         | 5/125 [00:01<00:41,  2.87batch/s][A
Evaluating Epoch 11/25:   5%|█▏                        | 6/125 [00:02<00:41,  2.88batch/s][A
Evaluating Epoch 11/25:   6%|█▍                        | 7/125 [00:02<00:41,  2.88batch/s][A
Evaluating Epoch 11/25:   6%|█▋                        | 8/125 [00:02<00:40,  2.88batch/s][A
Evaluating Epoch 11/25:   7%|█▊                        | 9/125 [00:03<00:40,  2.88batch/s][A
Evaluating Epoch 11/25:   8%|██                       | 10/

Epoch [11/25], Eval Accuracy: 0.6920



Epoch 12/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 12/25:   0%|                                 | 0/125 [00:00<?, ?batch/s, Loss=0.707][A
Epoch 12/25:   1%|▏                        | 1/125 [00:00<01:52,  1.10batch/s, Loss=0.707][A
Epoch 12/25:   1%|▏                        | 1/125 [00:01<01:52,  1.10batch/s, Loss=0.754][A
Epoch 12/25:   2%|▍                        | 2/125 [00:01<01:50,  1.11batch/s, Loss=0.754][A
Epoch 12/25:   2%|▍                        | 2/125 [00:02<01:50,  1.11batch/s, Loss=0.638][A
Epoch 12/25:   2%|▌                        | 3/125 [00:02<01:49,  1.11batch/s, Loss=0.638][A
Epoch 12/25:   2%|▌                         | 3/125 [00:03<01:49,  1.11batch/s, Loss=0.59][A
Epoch 12/25:   3%|▊                         | 4/125 [00:03<01:48,  1.11batch/s, Loss=0.59][A
Epoch 12/25:   3%|▊                        | 4/125 [00:04<01:48,  1.11batch/s, Loss=0.604][A
Epoch 12/25:   4%|█                        | 5/125 [00:04<0

Epoch [12/25], Train Loss: 0.0778



Evaluating Epoch 12/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 12/25:   1%|▏                         | 1/125 [00:00<00:43,  2.88batch/s][A
Evaluating Epoch 12/25:   2%|▍                         | 2/125 [00:00<00:42,  2.88batch/s][A
Evaluating Epoch 12/25:   2%|▌                         | 3/125 [00:01<00:42,  2.88batch/s][A
Evaluating Epoch 12/25:   3%|▊                         | 4/125 [00:01<00:42,  2.88batch/s][A
Evaluating Epoch 12/25:   4%|█                         | 5/125 [00:01<00:41,  2.88batch/s][A
Evaluating Epoch 12/25:   5%|█▏                        | 6/125 [00:02<00:41,  2.88batch/s][A
Evaluating Epoch 12/25:   6%|█▍                        | 7/125 [00:02<00:40,  2.88batch/s][A
Evaluating Epoch 12/25:   6%|█▋                        | 8/125 [00:02<00:40,  2.88batch/s][A
Evaluating Epoch 12/25:   7%|█▊                        | 9/125 [00:03<00:40,  2.88batch/s][A
Evaluating Epoch 12/25:   8%|██                       | 10/

Epoch [12/25], Eval Accuracy: 0.7110



Epoch 13/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 13/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.765][A
Epoch 13/25:   1%|▏                        | 1/125 [00:01<02:50,  1.37s/batch, Loss=0.765][A
Epoch 13/25:   1%|▏                        | 1/125 [00:02<02:50,  1.37s/batch, Loss=0.894][A
Epoch 13/25:   2%|▍                        | 2/125 [00:02<02:16,  1.11s/batch, Loss=0.894][A
Epoch 13/25:   2%|▍                         | 2/125 [00:03<02:16,  1.11s/batch, Loss=0.76][A
Epoch 13/25:   2%|▌                         | 3/125 [00:03<02:04,  1.02s/batch, Loss=0.76][A
Epoch 13/25:   2%|▌                        | 3/125 [00:04<02:04,  1.02s/batch, Loss=0.402][A
Epoch 13/25:   3%|▊                        | 4/125 [00:04<01:58,  1.02batch/s, Loss=0.402][A
Epoch 13/25:   3%|▊                        | 4/125 [00:05<01:58,  1.02batch/s, Loss=0.563][A
Epoch 13/25:   4%|█                        | 5/125 [00:05<0

Epoch [13/25], Train Loss: 0.0758



Evaluating Epoch 13/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 13/25:   1%|▏                         | 1/125 [00:00<00:45,  2.75batch/s][A
Evaluating Epoch 13/25:   2%|▍                         | 2/125 [00:00<00:44,  2.75batch/s][A
Evaluating Epoch 13/25:   2%|▌                         | 3/125 [00:01<00:44,  2.75batch/s][A
Evaluating Epoch 13/25:   3%|▊                         | 4/125 [00:01<00:44,  2.75batch/s][A
Evaluating Epoch 13/25:   4%|█                         | 5/125 [00:01<00:43,  2.75batch/s][A
Evaluating Epoch 13/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.75batch/s][A
Evaluating Epoch 13/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 13/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 13/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.74batch/s][A
Evaluating Epoch 13/25:   8%|██                       | 10/

Epoch [13/25], Eval Accuracy: 0.7280



Epoch 14/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 14/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.527][A
Epoch 14/25:   1%|▏                        | 1/125 [00:01<03:12,  1.55s/batch, Loss=0.527][A
Epoch 14/25:   1%|▏                         | 1/125 [00:02<03:12,  1.55s/batch, Loss=0.52][A
Epoch 14/25:   2%|▍                         | 2/125 [00:02<02:27,  1.20s/batch, Loss=0.52][A
Epoch 14/25:   2%|▍                         | 2/125 [00:03<02:27,  1.20s/batch, Loss=0.63][A
Epoch 14/25:   2%|▌                         | 3/125 [00:03<02:12,  1.09s/batch, Loss=0.63][A
Epoch 14/25:   2%|▌                        | 3/125 [00:04<02:12,  1.09s/batch, Loss=0.613][A
Epoch 14/25:   3%|▊                        | 4/125 [00:04<02:05,  1.03s/batch, Loss=0.613][A
Epoch 14/25:   3%|▊                         | 4/125 [00:05<02:05,  1.03s/batch, Loss=0.45][A
Epoch 14/25:   4%|█                         | 5/125 [00:05<

Epoch [14/25], Train Loss: 0.0720



Evaluating Epoch 14/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 14/25:   1%|▏                         | 1/125 [00:00<00:45,  2.70batch/s][A
Evaluating Epoch 14/25:   2%|▍                         | 2/125 [00:00<00:45,  2.73batch/s][A
Evaluating Epoch 14/25:   2%|▌                         | 3/125 [00:01<00:44,  2.73batch/s][A
Evaluating Epoch 14/25:   3%|▊                         | 4/125 [00:01<00:44,  2.74batch/s][A
Evaluating Epoch 14/25:   4%|█                         | 5/125 [00:01<00:43,  2.74batch/s][A
Evaluating Epoch 14/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.74batch/s][A
Evaluating Epoch 14/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 14/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 14/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.75batch/s][A
Evaluating Epoch 14/25:   8%|██                       | 10/

Epoch [14/25], Eval Accuracy: 0.7380



Epoch 15/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 15/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.576][A
Epoch 15/25:   1%|▏                        | 1/125 [00:01<03:22,  1.63s/batch, Loss=0.576][A
Epoch 15/25:   1%|▏                        | 1/125 [00:02<03:22,  1.63s/batch, Loss=0.635][A
Epoch 15/25:   2%|▍                        | 2/125 [00:02<02:32,  1.24s/batch, Loss=0.635][A
Epoch 15/25:   2%|▍                        | 2/125 [00:03<02:32,  1.24s/batch, Loss=0.515][A
Epoch 15/25:   2%|▌                        | 3/125 [00:03<02:14,  1.10s/batch, Loss=0.515][A
Epoch 15/25:   2%|▌                        | 3/125 [00:04<02:14,  1.10s/batch, Loss=0.517][A
Epoch 15/25:   3%|▊                        | 4/125 [00:04<02:06,  1.04s/batch, Loss=0.517][A
Epoch 15/25:   3%|▊                        | 4/125 [00:05<02:06,  1.04s/batch, Loss=0.607][A
Epoch 15/25:   4%|█                        | 5/125 [00:05<0

Epoch [15/25], Train Loss: 0.0708



Evaluating Epoch 15/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 15/25:   1%|▏                         | 1/125 [00:00<00:45,  2.74batch/s][A
Evaluating Epoch 15/25:   2%|▍                         | 2/125 [00:00<00:44,  2.74batch/s][A
Evaluating Epoch 15/25:   2%|▌                         | 3/125 [00:01<00:44,  2.74batch/s][A
Evaluating Epoch 15/25:   3%|▊                         | 4/125 [00:01<00:44,  2.74batch/s][A
Evaluating Epoch 15/25:   4%|█                         | 5/125 [00:01<00:43,  2.74batch/s][A
Evaluating Epoch 15/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.74batch/s][A
Evaluating Epoch 15/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.74batch/s][A
Evaluating Epoch 15/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.74batch/s][A
Evaluating Epoch 15/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.75batch/s][A
Evaluating Epoch 15/25:   8%|██                       | 10/

Epoch [15/25], Eval Accuracy: 0.7380



Epoch 16/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 16/25:   0%|                                  | 0/125 [00:01<?, ?batch/s, Loss=0.49][A
Epoch 16/25:   1%|▏                         | 1/125 [00:01<02:09,  1.04s/batch, Loss=0.49][A
Epoch 16/25:   1%|▏                        | 1/125 [00:01<02:09,  1.04s/batch, Loss=0.579][A
Epoch 16/25:   2%|▍                        | 2/125 [00:01<02:01,  1.01batch/s, Loss=0.579][A
Epoch 16/25:   2%|▍                        | 2/125 [00:02<02:01,  1.01batch/s, Loss=0.334][A
Epoch 16/25:   2%|▌                        | 3/125 [00:02<01:58,  1.03batch/s, Loss=0.334][A
Epoch 16/25:   2%|▌                         | 3/125 [00:03<01:58,  1.03batch/s, Loss=0.61][A
Epoch 16/25:   3%|▊                         | 4/125 [00:03<01:56,  1.04batch/s, Loss=0.61][A
Epoch 16/25:   3%|▊                        | 4/125 [00:04<01:56,  1.04batch/s, Loss=0.437][A
Epoch 16/25:   4%|█                        | 5/125 [00:04<0

Epoch [16/25], Train Loss: 0.0678



Evaluating Epoch 16/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 16/25:   1%|▏                         | 1/125 [00:00<00:45,  2.75batch/s][A
Evaluating Epoch 16/25:   2%|▍                         | 2/125 [00:00<00:44,  2.74batch/s][A
Evaluating Epoch 16/25:   2%|▌                         | 3/125 [00:01<00:44,  2.74batch/s][A
Evaluating Epoch 16/25:   3%|▊                         | 4/125 [00:01<00:44,  2.74batch/s][A
Evaluating Epoch 16/25:   4%|█                         | 5/125 [00:01<00:43,  2.75batch/s][A
Evaluating Epoch 16/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.75batch/s][A
Evaluating Epoch 16/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 16/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.75batch/s][A
Evaluating Epoch 16/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.75batch/s][A
Evaluating Epoch 16/25:   8%|██                       | 10/

Epoch [16/25], Eval Accuracy: 0.7520



Epoch 17/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 17/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.536][A
Epoch 17/25:   1%|▏                        | 1/125 [00:01<03:13,  1.56s/batch, Loss=0.536][A
Epoch 17/25:   1%|▏                        | 1/125 [00:02<03:13,  1.56s/batch, Loss=0.328][A
Epoch 17/25:   2%|▍                        | 2/125 [00:02<02:27,  1.20s/batch, Loss=0.328][A
Epoch 17/25:   2%|▍                        | 2/125 [00:03<02:27,  1.20s/batch, Loss=0.481][A
Epoch 17/25:   2%|▌                        | 3/125 [00:03<02:12,  1.09s/batch, Loss=0.481][A
Epoch 17/25:   2%|▌                        | 3/125 [00:04<02:12,  1.09s/batch, Loss=0.525][A
Epoch 17/25:   3%|▊                        | 4/125 [00:04<02:05,  1.03s/batch, Loss=0.525][A
Epoch 17/25:   3%|▊                        | 4/125 [00:05<02:05,  1.03s/batch, Loss=0.433][A
Epoch 17/25:   4%|█                        | 5/125 [00:05<0

Epoch [17/25], Train Loss: 0.0663



Evaluating Epoch 17/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 17/25:   1%|▏                         | 1/125 [00:00<00:46,  2.69batch/s][A
Evaluating Epoch 17/25:   2%|▍                         | 2/125 [00:00<00:45,  2.71batch/s][A
Evaluating Epoch 17/25:   2%|▌                         | 3/125 [00:01<00:44,  2.71batch/s][A
Evaluating Epoch 17/25:   3%|▊                         | 4/125 [00:01<00:44,  2.71batch/s][A
Evaluating Epoch 17/25:   4%|█                         | 5/125 [00:01<00:44,  2.69batch/s][A
Evaluating Epoch 17/25:   5%|█▏                        | 6/125 [00:02<00:44,  2.68batch/s][A
Evaluating Epoch 17/25:   6%|█▍                        | 7/125 [00:02<00:43,  2.70batch/s][A
Evaluating Epoch 17/25:   6%|█▋                        | 8/125 [00:02<00:43,  2.70batch/s][A
Evaluating Epoch 17/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.71batch/s][A
Evaluating Epoch 17/25:   8%|██                       | 10/

Epoch [17/25], Eval Accuracy: 0.7540



Epoch 18/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 18/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.566][A
Epoch 18/25:   1%|▏                        | 1/125 [00:01<03:08,  1.52s/batch, Loss=0.566][A
Epoch 18/25:   1%|▏                        | 1/125 [00:02<03:08,  1.52s/batch, Loss=0.483][A
Epoch 18/25:   2%|▍                        | 2/125 [00:02<02:25,  1.18s/batch, Loss=0.483][A
Epoch 18/25:   2%|▍                        | 2/125 [00:03<02:25,  1.18s/batch, Loss=0.963][A
Epoch 18/25:   2%|▌                        | 3/125 [00:03<02:11,  1.08s/batch, Loss=0.963][A
Epoch 18/25:   2%|▌                        | 3/125 [00:04<02:11,  1.08s/batch, Loss=0.674][A
Epoch 18/25:   3%|▊                        | 4/125 [00:04<02:04,  1.03s/batch, Loss=0.674][A
Epoch 18/25:   3%|▊                        | 4/125 [00:05<02:04,  1.03s/batch, Loss=0.678][A
Epoch 18/25:   4%|█                        | 5/125 [00:05<0

Epoch [18/25], Train Loss: 0.0654



Evaluating Epoch 18/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 18/25:   1%|▏                         | 1/125 [00:00<00:46,  2.64batch/s][A
Evaluating Epoch 18/25:   2%|▍                         | 2/125 [00:00<00:46,  2.67batch/s][A
Evaluating Epoch 18/25:   2%|▌                         | 3/125 [00:01<00:45,  2.68batch/s][A
Evaluating Epoch 18/25:   3%|▊                         | 4/125 [00:01<00:44,  2.69batch/s][A
Evaluating Epoch 18/25:   4%|█                         | 5/125 [00:01<00:44,  2.71batch/s][A
Evaluating Epoch 18/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.72batch/s][A
Evaluating Epoch 18/25:   6%|█▍                        | 7/125 [00:02<00:43,  2.72batch/s][A
Evaluating Epoch 18/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.73batch/s][A
Evaluating Epoch 18/25:   7%|█▊                        | 9/125 [00:03<00:42,  2.74batch/s][A
Evaluating Epoch 18/25:   8%|██                       | 10/

Epoch [18/25], Eval Accuracy: 0.7660



Epoch 19/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 19/25:   0%|                                  | 0/125 [00:00<?, ?batch/s, Loss=0.45][A
Epoch 19/25:   1%|▏                         | 1/125 [00:00<01:58,  1.05batch/s, Loss=0.45][A
Epoch 19/25:   1%|▏                        | 1/125 [00:01<01:58,  1.05batch/s, Loss=0.344][A
Epoch 19/25:   2%|▍                        | 2/125 [00:01<01:55,  1.07batch/s, Loss=0.344][A
Epoch 19/25:   2%|▍                        | 2/125 [00:02<01:55,  1.07batch/s, Loss=0.459][A
Epoch 19/25:   2%|▌                        | 3/125 [00:02<01:53,  1.08batch/s, Loss=0.459][A
Epoch 19/25:   2%|▌                        | 3/125 [00:03<01:53,  1.08batch/s, Loss=0.233][A
Epoch 19/25:   3%|▊                        | 4/125 [00:03<01:51,  1.08batch/s, Loss=0.233][A
Epoch 19/25:   3%|▊                        | 4/125 [00:04<01:51,  1.08batch/s, Loss=0.278][A
Epoch 19/25:   4%|█                        | 5/125 [00:04<0

Epoch [19/25], Train Loss: 0.0627



Evaluating Epoch 19/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 19/25:   1%|▏                         | 1/125 [00:00<00:44,  2.82batch/s][A
Evaluating Epoch 19/25:   2%|▍                         | 2/125 [00:00<00:43,  2.81batch/s][A
Evaluating Epoch 19/25:   2%|▌                         | 3/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 19/25:   3%|▊                         | 4/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 19/25:   4%|█                         | 5/125 [00:01<00:42,  2.81batch/s][A
Evaluating Epoch 19/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 19/25:   6%|█▍                        | 7/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 19/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 19/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.80batch/s][A
Evaluating Epoch 19/25:   8%|██                       | 10/

Epoch [19/25], Eval Accuracy: 0.7710



Epoch 20/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 20/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.522][A
Epoch 20/25:   1%|▏                        | 1/125 [00:01<02:04,  1.01s/batch, Loss=0.522][A
Epoch 20/25:   1%|▏                         | 1/125 [00:01<02:04,  1.01s/batch, Loss=0.54][A
Epoch 20/25:   2%|▍                         | 2/125 [00:01<01:57,  1.04batch/s, Loss=0.54][A
Epoch 20/25:   2%|▍                        | 2/125 [00:02<01:57,  1.04batch/s, Loss=0.541][A
Epoch 20/25:   2%|▌                        | 3/125 [00:02<01:55,  1.06batch/s, Loss=0.541][A
Epoch 20/25:   2%|▌                        | 3/125 [00:03<01:55,  1.06batch/s, Loss=0.234][A
Epoch 20/25:   3%|▊                        | 4/125 [00:03<01:53,  1.07batch/s, Loss=0.234][A
Epoch 20/25:   3%|▊                        | 4/125 [00:04<01:53,  1.07batch/s, Loss=0.654][A
Epoch 20/25:   4%|█                        | 5/125 [00:04<0

Epoch [20/25], Train Loss: 0.0607



Evaluating Epoch 20/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 20/25:   1%|▏                         | 1/125 [00:00<00:44,  2.81batch/s][A
Evaluating Epoch 20/25:   2%|▍                         | 2/125 [00:00<00:43,  2.81batch/s][A
Evaluating Epoch 20/25:   2%|▌                         | 3/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 20/25:   3%|▊                         | 4/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 20/25:   4%|█                         | 5/125 [00:01<00:42,  2.81batch/s][A
Evaluating Epoch 20/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.79batch/s][A
Evaluating Epoch 20/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.80batch/s][A
Evaluating Epoch 20/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.80batch/s][A
Evaluating Epoch 20/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.80batch/s][A
Evaluating Epoch 20/25:   8%|██                       | 10/

Epoch [20/25], Eval Accuracy: 0.7780



Epoch 21/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 21/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.713][A
Epoch 21/25:   1%|▏                        | 1/125 [00:01<02:59,  1.45s/batch, Loss=0.713][A
Epoch 21/25:   1%|▏                        | 1/125 [00:02<02:59,  1.45s/batch, Loss=0.423][A
Epoch 21/25:   2%|▍                        | 2/125 [00:02<02:20,  1.14s/batch, Loss=0.423][A
Epoch 21/25:   2%|▍                        | 2/125 [00:03<02:20,  1.14s/batch, Loss=0.481][A
Epoch 21/25:   2%|▌                        | 3/125 [00:03<02:06,  1.04s/batch, Loss=0.481][A
Epoch 21/25:   2%|▌                        | 3/125 [00:04<02:06,  1.04s/batch, Loss=0.656][A
Epoch 21/25:   3%|▊                        | 4/125 [00:04<02:00,  1.01batch/s, Loss=0.656][A
Epoch 21/25:   3%|▊                        | 4/125 [00:05<02:00,  1.01batch/s, Loss=0.426][A
Epoch 21/25:   4%|█                        | 5/125 [00:05<0

Epoch [21/25], Train Loss: 0.0593



Evaluating Epoch 21/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 21/25:   1%|▏                         | 1/125 [00:00<00:44,  2.76batch/s][A
Evaluating Epoch 21/25:   2%|▍                         | 2/125 [00:00<00:44,  2.78batch/s][A
Evaluating Epoch 21/25:   2%|▌                         | 3/125 [00:01<00:43,  2.79batch/s][A
Evaluating Epoch 21/25:   3%|▊                         | 4/125 [00:01<00:43,  2.80batch/s][A
Evaluating Epoch 21/25:   4%|█                         | 5/125 [00:01<00:42,  2.80batch/s][A
Evaluating Epoch 21/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 21/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 21/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 21/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.81batch/s][A
Evaluating Epoch 21/25:   8%|██                       | 10/

Epoch [21/25], Eval Accuracy: 0.7820



Epoch 22/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 22/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.457][A
Epoch 22/25:   1%|▏                        | 1/125 [00:01<02:28,  1.19s/batch, Loss=0.457][A
Epoch 22/25:   1%|▏                        | 1/125 [00:02<02:28,  1.19s/batch, Loss=0.199][A
Epoch 22/25:   2%|▍                        | 2/125 [00:02<02:06,  1.03s/batch, Loss=0.199][A
Epoch 22/25:   2%|▍                        | 2/125 [00:03<02:06,  1.03s/batch, Loss=0.246][A
Epoch 22/25:   2%|▌                        | 3/125 [00:03<01:59,  1.02batch/s, Loss=0.246][A
Epoch 22/25:   2%|▌                        | 3/125 [00:03<01:59,  1.02batch/s, Loss=0.499][A
Epoch 22/25:   3%|▊                        | 4/125 [00:03<01:55,  1.05batch/s, Loss=0.499][A
Epoch 22/25:   3%|▊                         | 4/125 [00:04<01:55,  1.05batch/s, Loss=0.55][A
Epoch 22/25:   4%|█                         | 5/125 [00:04<

Epoch [22/25], Train Loss: 0.0578



Evaluating Epoch 22/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 22/25:   1%|▏                         | 1/125 [00:00<00:44,  2.80batch/s][A
Evaluating Epoch 22/25:   2%|▍                         | 2/125 [00:00<00:43,  2.80batch/s][A
Evaluating Epoch 22/25:   2%|▌                         | 3/125 [00:01<00:43,  2.80batch/s][A
Evaluating Epoch 22/25:   3%|▊                         | 4/125 [00:01<00:43,  2.80batch/s][A
Evaluating Epoch 22/25:   4%|█                         | 5/125 [00:01<00:42,  2.80batch/s][A
Evaluating Epoch 22/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 22/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 22/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 22/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.80batch/s][A
Evaluating Epoch 22/25:   8%|██                       | 10/

Epoch [22/25], Eval Accuracy: 0.7920



Epoch 23/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 23/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.519][A
Epoch 23/25:   1%|▏                        | 1/125 [00:01<02:33,  1.24s/batch, Loss=0.519][A
Epoch 23/25:   1%|▏                        | 1/125 [00:02<02:33,  1.24s/batch, Loss=0.374][A
Epoch 23/25:   2%|▍                        | 2/125 [00:02<02:10,  1.06s/batch, Loss=0.374][A
Epoch 23/25:   2%|▍                         | 2/125 [00:03<02:10,  1.06s/batch, Loss=0.37][A
Epoch 23/25:   2%|▌                         | 3/125 [00:03<02:03,  1.01s/batch, Loss=0.37][A
Epoch 23/25:   2%|▌                        | 3/125 [00:04<02:03,  1.01s/batch, Loss=0.834][A
Epoch 23/25:   3%|▊                        | 4/125 [00:04<01:58,  1.02batch/s, Loss=0.834][A
Epoch 23/25:   3%|▊                        | 4/125 [00:04<01:58,  1.02batch/s, Loss=0.198][A
Epoch 23/25:   4%|█                        | 5/125 [00:04<0

Epoch [23/25], Train Loss: 0.0553



Evaluating Epoch 23/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 23/25:   1%|▏                         | 1/125 [00:00<00:44,  2.80batch/s][A
Evaluating Epoch 23/25:   2%|▍                         | 2/125 [00:00<00:43,  2.80batch/s][A
Evaluating Epoch 23/25:   2%|▌                         | 3/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 23/25:   3%|▊                         | 4/125 [00:01<00:43,  2.81batch/s][A
Evaluating Epoch 23/25:   4%|█                         | 5/125 [00:01<00:42,  2.80batch/s][A
Evaluating Epoch 23/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.81batch/s][A
Evaluating Epoch 23/25:   6%|█▍                        | 7/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 23/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.81batch/s][A
Evaluating Epoch 23/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.81batch/s][A
Evaluating Epoch 23/25:   8%|██                       | 10/

Epoch [23/25], Eval Accuracy: 0.8020



Epoch 24/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 24/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.448][A
Epoch 24/25:   1%|▏                        | 1/125 [00:01<03:13,  1.56s/batch, Loss=0.448][A
Epoch 24/25:   1%|▏                        | 1/125 [00:02<03:13,  1.56s/batch, Loss=0.422][A
Epoch 24/25:   2%|▍                        | 2/125 [00:02<02:25,  1.18s/batch, Loss=0.422][A
Epoch 24/25:   2%|▍                        | 2/125 [00:03<02:25,  1.18s/batch, Loss=0.574][A
Epoch 24/25:   2%|▌                        | 3/125 [00:03<02:09,  1.07s/batch, Loss=0.574][A
Epoch 24/25:   2%|▌                        | 3/125 [00:04<02:09,  1.07s/batch, Loss=0.355][A
Epoch 24/25:   3%|▊                        | 4/125 [00:04<02:02,  1.01s/batch, Loss=0.355][A
Epoch 24/25:   3%|▊                        | 4/125 [00:05<02:02,  1.01s/batch, Loss=0.751][A
Epoch 24/25:   4%|█                        | 5/125 [00:05<0

Epoch [24/25], Train Loss: 0.0546



Evaluating Epoch 24/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 24/25:   1%|▏                         | 1/125 [00:00<00:44,  2.79batch/s][A
Evaluating Epoch 24/25:   2%|▍                         | 2/125 [00:00<00:43,  2.80batch/s][A
Evaluating Epoch 24/25:   2%|▌                         | 3/125 [00:01<00:44,  2.75batch/s][A
Evaluating Epoch 24/25:   3%|▊                         | 4/125 [00:01<00:43,  2.77batch/s][A
Evaluating Epoch 24/25:   4%|█                         | 5/125 [00:01<00:43,  2.79batch/s][A
Evaluating Epoch 24/25:   5%|█▏                        | 6/125 [00:02<00:42,  2.79batch/s][A
Evaluating Epoch 24/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.80batch/s][A
Evaluating Epoch 24/25:   6%|█▋                        | 8/125 [00:02<00:41,  2.80batch/s][A
Evaluating Epoch 24/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.80batch/s][A
Evaluating Epoch 24/25:   8%|██                       | 10/

Epoch [24/25], Eval Accuracy: 0.8060



Epoch 25/25:   0%|                                             | 0/125 [00:00<?, ?batch/s][A
Epoch 25/25:   0%|                                 | 0/125 [00:01<?, ?batch/s, Loss=0.389][A
Epoch 25/25:   1%|▏                        | 1/125 [00:01<02:34,  1.24s/batch, Loss=0.389][A
Epoch 25/25:   1%|▏                         | 1/125 [00:02<02:34,  1.24s/batch, Loss=0.33][A
Epoch 25/25:   2%|▍                         | 2/125 [00:02<02:10,  1.06s/batch, Loss=0.33][A
Epoch 25/25:   2%|▍                        | 2/125 [00:03<02:10,  1.06s/batch, Loss=0.268][A
Epoch 25/25:   2%|▌                        | 3/125 [00:03<02:01,  1.00batch/s, Loss=0.268][A
Epoch 25/25:   2%|▌                        | 3/125 [00:04<02:01,  1.00batch/s, Loss=0.492][A
Epoch 25/25:   3%|▊                        | 4/125 [00:04<01:57,  1.03batch/s, Loss=0.492][A
Epoch 25/25:   3%|▊                        | 4/125 [00:04<01:57,  1.03batch/s, Loss=0.292][A
Epoch 25/25:   4%|█                        | 5/125 [00:04<0

Epoch [25/25], Train Loss: 0.0525



Evaluating Epoch 25/25:   0%|                                  | 0/125 [00:00<?, ?batch/s][A
Evaluating Epoch 25/25:   1%|▏                         | 1/125 [00:00<00:44,  2.76batch/s][A
Evaluating Epoch 25/25:   2%|▍                         | 2/125 [00:00<00:45,  2.70batch/s][A
Evaluating Epoch 25/25:   2%|▌                         | 3/125 [00:01<00:44,  2.72batch/s][A
Evaluating Epoch 25/25:   3%|▊                         | 4/125 [00:01<00:43,  2.75batch/s][A
Evaluating Epoch 25/25:   4%|█                         | 5/125 [00:01<00:43,  2.76batch/s][A
Evaluating Epoch 25/25:   5%|█▏                        | 6/125 [00:02<00:43,  2.77batch/s][A
Evaluating Epoch 25/25:   6%|█▍                        | 7/125 [00:02<00:42,  2.77batch/s][A
Evaluating Epoch 25/25:   6%|█▋                        | 8/125 [00:02<00:42,  2.78batch/s][A
Evaluating Epoch 25/25:   7%|█▊                        | 9/125 [00:03<00:41,  2.79batch/s][A
Evaluating Epoch 25/25:   8%|██                       | 10/

Epoch [25/25], Eval Accuracy: 0.8100





In [None]:
# 检查训练数据
for batch in train_dataloader:
    print("Input IDs shape:", batch["input_ids"]))
    print("Input IDs example:", batch["input_ids"][0])
    print("Attention Mask shape:", batch["attention_mask"].shape)
    print("Attention Mask example:", batch["attention_mask"][0])
    print("Labels shape:", batch["label"].shape)
    print("Labels example:", batch["label"][0])
    break

# 检查验证数据
for batch in eval_dataloader:
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Input IDs example:", batch["input_ids"][0])
    print("Attention Mask shape:", batch["attention_mask"].shape)
    print("Attention Mask example:", batch["attention_mask"][0])
    print("Labels shape:", batch["label"].shape)
    print("Labels example:", batch["label"][0])
    break

# 2. 成分相似性任务
## 2.1. 准备BioBert微调需要的的标注数据集

In [None]:
# -!-!- BACKUP -!-!- 去掉宝拉珍选网站没有评级成分的functions的统一结尾，之后考虑回来进行对比分析！
import pandas as pd

# 读取CSV文件
df = pd.read_csv('../Desktop/Paula_s_Choice/Paula_SUM_LIST.csv')

# 处理'functions'列
df['functions'] = df['functions'].str.replace('We have not yet rated this ingredient because we have not had a chance to review the research on it.', '')

# 保存新的CSV文件
df.to_csv('../Desktop/Paula_s_Choice/Paula_SUM_LIST_NEW.csv', index=False)

In [17]:
import pandas as pd

# 读取CSV文件
df_sum = pd.read_csv('../Desktop/Paula_s_Choice/Paula_SUM_LIST.csv')
df_final = pd.read_csv('../Desktop/Paula_s_Choice/Paula_detail_final_422_prepare.csv')

# 合并两个 DataFrame,保留所有df_sum的行
merged_df = pd.merge(df_sum, df_final, on='ingredient_name', how='left')

# 填充空值
merged_df['description'] = merged_df['description'].fillna('')
merged_df['functions'] = merged_df['functions'].fillna('')
merged_df['glance'] = merged_df['glance'].fillna('')

# 比较三个列,将其合并为一个新的列
merged_df['combined_text'] = merged_df.apply(lambda row: row['description'] if row['functions'] in row['description'] else 
                                            row['description'] + ' ' + row['functions'] if row['functions'] not in row['description'] else
                                            row['description'], axis=1)

# 保存合并后的 DataFrame 到 CSV 文件
merged_df.to_csv('../Desktop/Paula_s_Choice/Paula_embedding_SUMLIST_before_422.csv', index=False)

In [11]:
# 准备最原始的替换性成分组
import pandas as pd

# 读取CSV文件
df = pd.read_csv('../Desktop/DBCosmetic/6903_Ingredients_INFO_After.csv')

# 创建新的DataFrame
pre_similarity = pd.DataFrame(columns=['component1', 'component2', 'similarity'])

# 遍历每一行数据
for index, row in df.iterrows():
    ingredient_name = row['ingredient_name']
    alternatives = row['alternatives']
    
    # 如果alternatives不为空
    if alternatives and isinstance(alternatives, str):
        # 将alternatives字符串按回车符分割成列表
        alt_list = [alt.strip() for alt in alternatives.split('\n') if alt.strip()]
        
        # 为每个替代成分创建一行数据
        for alt in alt_list:
            # 再次检查是否存在逗号,如果存在则按逗号分割
            sub_alts = [sub_alt.strip() for sub_alt in alt.split(',') if sub_alt.strip()]
            for sub_alt in sub_alts:
                pre_similarity = pd.concat([pre_similarity, pd.DataFrame({'component1': [ingredient_name], 'component2': [sub_alt], 'similarity': [1]})], ignore_index=True)

pre_similarity.to_csv('../Desktop/BioBERT/pre_alternatives.csv', index=False)


In [18]:
# 更改相似性
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm
import re
import multiprocess as mp

# 读取6903_Ingredients_INFO_After.csv文件
pre_similarity_df = pd.read_csv('../Desktop/T5/pre_alternatives.csv')

# 读取Paula_SUM_LIST.csv文件
paula_sum_list_df = pd.read_csv('../Desktop/Paula_s_Choice/Paula_embedding_SUMLIST_before_422.csv')

# 定义一个函数来处理单个dataframe
def process_dataframe(df):
    standard_ingredients = {name.lower(): name for name in paula_sum_list_df['ingredient_name']}
    df['component1'] = df['component1'].str.lower()
    df['component2'] = df['component2'].str.lower()
    
    for index, row in tqdm(df.iterrows(), total=len(df), desc=f'Processing {mp.current_process().name}'):
        component1 = row['component1']
        component2 = row['component2']
        
        # 先尝试精确匹配
        if component1 in standard_ingredients:
            df.at[index, 'component1'] = standard_ingredients[component1]
        else:
            # 如果没有精确匹配,再尝试模糊匹配
            best_match1 = max(standard_ingredients.items(), key=lambda x: fuzz.token_set_ratio(x[0], component1))
            if best_match1[1] != component1:
                df.at[index, 'component1'] = best_match1[1]
        
        if component2 in standard_ingredients:
            df.at[index, 'component2'] = standard_ingredients[component2]
        else:
            best_match2 = max(standard_ingredients.items(), key=lambda x: fuzz.token_set_ratio(x[0], component2))
            if best_match2[1] != component2:
                df.at[index, 'component2'] = best_match2[1]
    
    return df

# 创建进程池并处理数据
num_processes = mp.cpu_count() * 2
pool = mp.Pool(processes=num_processes)
pre_similarity_df = pool.apply_async(process_dataframe, args=(pre_similarity_df.copy(),)).get()
pool.close()
pool.join()

pre_similarity_df.to_csv('../Desktop/BioBERT/pre_alternatives2_422.csv', index=False)

python(9505) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(9506) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(9507) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(9508) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before th

In [19]:
import pandas as pd

# 读取两个CSV文件
pre_similarity_df = pd.read_csv('../Desktop/BioBERT/pre_alternatives2_422.csv')
paula_detail_df = pd.read_csv('../Desktop/Paula_s_Choice/Paula_embedding_SUMLIST_before_422.csv')

# 创建一个空列表来存储合并后的数据行
merged_rows = []

for index, row in pre_similarity_df.iterrows():
    component1 = row['component1']
    component2 = row['component2']
    
    # 在paula_detail_df中查找匹配的ingredient_name
    row1 = paula_detail_df[paula_detail_df['ingredient_name'] == component1].iloc[0].to_dict()
    row2 = paula_detail_df[paula_detail_df['ingredient_name'] == component2].iloc[0].to_dict()
    
    # 创建一个新的字典,包含所需的列
    new_row = {
        'ingredient_name1': row1['ingredient_name'],
        'rating1': row1['rating'],
        'functions1': row1['functions'],
        'link1': row1['link'],
        'benefits1': row1['benefits'],
        'categories1': row1['categories'],
        'glance1': row1['glance'],
        'description1': row1['description'],
        'references1': row1['references'],
        'combined_text1': row1['combined_text'],
        'ingredient_name2': row2['ingredient_name'],
        'rating2': row2['rating'],
        'functions2': row2['functions'],
        'link2': row2['link'],
        'benefits2': row2['benefits'],
        'categories2': row2['categories'],
        'glance2': row2['glance'],
        'description2': row2['description'],
        'references2': row2['references'],
        'combined_text2': row2['combined_text']
    }
    
    # 将新的字典添加到列表
    merged_rows.append(new_row)

# 创建合并后的DataFrame
merged_df = pd.DataFrame(merged_rows)

# 保存合并后的DataFrame到新的CSV文件
merged_df.to_csv('../Desktop/BioBERT/pre_alternatives3_422.csv', index=False)

In [20]:
# 检查有没有空的

# 读取CSV文件
df = pd.read_csv('../Desktop/BioBERT/pre_alternatives3_422.csv')

# 检查ingredient_name_1和ingredient_name_2是否存在空值
null_rows = df[(df['ingredient_name1'].isnull()) | (df['ingredient_name2'].isnull())]

# 统计空值行数
null_count = len(null_rows)
print(f"有 {null_count} 行数据存在component1或component2为空值。")

# 打印这些含有空值的数据行
print(null_rows)

有 0 行数据存在component1或component2为空值。
Empty DataFrame
Columns: [ingredient_name1, rating1, functions1, link1, benefits1, categories1, glance1, description1, references1, combined_text1, ingredient_name2, rating2, functions2, link2, benefits2, categories2, glance2, description2, references2, combined_text2]
Index: []


## 2.2. 正式微调
### 法1. 将一对embedding分别作为x、y进行训练（放弃）

In [1]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# 加载数据集
data = pd.read_csv('../Desktop/BioBERT/pre_alternatives3.csv')
ingredient_text1 = data['combined_text1'].tolist()
ingredient_text2 = data['combined_text2'].tolist()

# 加载预训练模型和分词器
tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
model = AutoModel.from_pretrained("gsarti/biobert-nli")

# 定义LoRA配置
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # bias="none",
    # target_modules=["encoder.layernorm", "decoder.layernorm", "encoder.layer.*.attention.k_proj", "encoder.layer.*.attention.v_proj", "decoder.layer.*.attention.k_proj", "decoder.layer.*.attention.v_proj", "encoder.layer.*.ffn.intermediate.dense", "decoder.layer.*.ffn.intermediate.dense"],
)

# 将LoRA插入模型
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [10]:
from sklearn.model_selection import train_test_split
import torch

# 打包成元组
X = list(zip(ingredient_text1, ingredient_text2))
print(len(X))
train_data, valid_data = train_test_split(X, test_size=0.2, random_state=42)

# 将训练集和验证集分别编码为输入和输出
train_input_ids = [tokenizer(t[0], padding=True, truncation=True, max_length=512, return_tensors="pt")["input_ids"] for t in train_data]
train_output_ids = [tokenizer(t[1], padding=True, truncation=True, max_length=512, return_tensors="pt")["input_ids"] for t in train_data]

valid_input_ids = [tokenizer(t[0], padding=True, truncation=True, max_length=512, return_tensors="pt")["input_ids"] for t in valid_data]
valid_output_ids = [tokenizer(t[1], padding=True, truncation=True, max_length=512, return_tensors="pt")["input_ids"] for t in valid_data]

# 创建 TensorDataset
train_dataset = TensorDataset(torch.stack(train_input_ids), torch.stack(train_output_ids))
valid_dataset = TensorDataset(torch.stack(valid_input_ids), torch.stack(valid_output_ids))

1334
1334


RuntimeError: stack expects each tensor to be equal size, but got [1, 36] at entry 0 and [1, 25] at entry 1

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# 检查是否可以使用 MPS 设备
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS device")
else:
    device = torch.device('cpu')
    print("Using CPU")

# 将模型移到合适的设备上
model = model.to(device)

# 定义优化器和训练超参数
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
batch_size = 8

# 创建 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

# 开始训练
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    total_train_samples = 0
    
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    for batch in train_progress_bar:
        optimizer.zero_grad()
        
        input_ids, output_ids = batch
        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)
        
        output = model(input_ids=input_ids, labels=output_ids)
        loss = output.loss
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        total_train_samples += input_ids.size(0)
        
        train_progress_bar.set_postfix({"Loss": loss.item()})
    
    avg_train_loss = total_train_loss / total_train_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}")
    
    # 进行验证
    model.eval()
    total_valid_loss = 0
    total_valid_samples = 0
    
    for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}/{num_epochs}", unit="batch"):
        input_ids, output_ids = batch
        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)
        
        with torch.no_grad():
            output = model(input_ids=input_ids, labels=output_ids)
            valid_loss = output.loss
        
        total_valid_loss += valid_loss.item()
        total_valid_samples += input_ids.size(0)
    
    avg_valid_loss = total_valid_loss / total_valid_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_valid_loss:.4f}")

# 保存微调后的模型
model.save_pretrained("path/to/finetuned-biobert-nli")

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

### 法2. 相似度 *** 重点 ***

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('../Desktop/BioBERT/pre_alternatives3_422.csv')
texts1 = data['combined_text1'].tolist()
texts2 = data['combined_text2'].tolist()
texts = list(zip(texts1, texts2))

# 将数据集分为训练集和验证集
train_texts, val_texts = train_test_split(texts, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")

Training set size: 1067
Validation set size: 267


In [4]:
import pprint
train_texts[0]

(' Functions: Hair Conditioning, Skin Conditioning. We have not yet rated this ingredient because we have not had a chance to review the research on it.',
 'Disodium laureth sulfosuccinate is a cleansing agent found in products such as face wash, bubble bath, and shampoo, as well as other personal care products. It can help boost the foaming properties of such formulas plus enhance the water solubility of other surfactants. Suppliers of this ingredient note its gentleness on skin.\n\nDisodium laureth sulfosuccinate can be sourced naturally (plant derived) or synthetically (lab created). It is described as a clear, colorless to slightly yellowish liquid in raw material form. Technically speaking, it is the disodium salt of an ethoxylated lauryl alcohol half ester of sulfosuccinic acid.\n\nThe 2015 Cosmetic Ingredient Review Expert Panel surveyed 607 personal care products containing disodium laureth sulfosuccinate in concentrations between 0.06% to 2% for leave-on formulas and 0.4% to 1

In [5]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# 检查是否可以使用 MPS 设备
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS device")
else:
    device = torch.device('cpu')
    print("Using CPU")

tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
model = AutoModel.from_pretrained("gsarti/biobert-nli")
model = model.to(device)
print(model)

Using MPS device
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [6]:
# 将 LoRA 插入模型
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# 设置 LoRA 配置
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=1, 
    lora_alpha=1,
    lora_dropout=0.1
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config).to(device)
model.print_trainable_parameters()

print(model)

trainable params: 36,864 || all params: 108,347,136 || trainable%: 0.0340239727241152
PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Lin

In [None]:
# import torch
# from torch.utils.data import DataLoader, Dataset
# from transformers import AdamW, get_linear_schedule_with_warmup
# from tqdm import tqdm

# class TextPairDataset(Dataset):
#     def __init__(self, text_pairs):
#         self.text_pairs = text_pairs

#     def __len__(self):
#         return len(self.text_pairs)

#     def __getitem__(self, idx):
#         text1, text2 = self.text_pairs[idx]
#         # print(text1)
#         # text1 = tokenizer.encode(text1, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#         # text2 = tokenizer.encode(text2,padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#         # print(text1)
#         return text1, text2

# def fine_tune_model(model, train_texts, val_texts, num_epochs=3, batch_size=8, lr=2e-5):
#     train_dataset = TextPairDataset(train_texts)
#     val_dataset = TextPairDataset(val_texts)

#     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

#     optimizer = AdamW(model.parameters(), lr=lr)
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs
#     )

#     for epoch in range(num_epochs):
#         model.train()
#         train_loss = 0
#         train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} (Training)", unit="batch")
#         for sentences_a, sentences_b in train_progress_bar:
#             encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
#             encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

#             sentence_embeddings_a = model(**encoded_input_a).last_hidden_state
#             sentence_embeddings_b = model(**encoded_input_b).last_hidden_state

#             loss = -torch.mean(torch.cosine_similarity(sentence_embeddings_a, sentence_embeddings_b, dim=1))
#             loss.backward()
#             optimizer.step()
#             scheduler.step()
#             optimizer.zero_grad()
#             train_loss += loss.item()
#             train_progress_bar.set_postfix({"Train Loss": train_loss / (train_progress_bar.n + 1)})

#         model.eval()
#         val_loss = 0
        
#         val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} (Validation)", unit="batch")
#         for sentences_a, sentences_b in val_progress_bar:
#             encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
#             encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

#             with torch.no_grad():
#                 sentence_embeddings_a = model(**encoded_input_a).last_hidden_state
#                 sentence_embeddings_b = model(**encoded_input_b).last_hidden_state
#             loss = -torch.mean(torch.cosine_similarity(sentence_embeddings_a, sentence_embeddings_b, dim=-1))
#             val_loss += loss.item()
#             val_progress_bar.set_postfix({"Val Loss": val_loss / (val_progress_bar.n + 1)})

#         print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_dataloader):.4f}, Val Loss: {val_loss/len(val_dataloader):.4f}")

#     return model

# fine_tune_model(model, train_texts, val_texts)

Epoch 1/3 (Training):   0%|                                    | 0/134 [00:00<?, ?batch/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/3 (Training): 100%|████| 134/134 [5:10:54<00:00, 139.21s/batch, Train Loss=-0.305]
Epoch 1/3 (Validation): 100%|█████████| 34/34 [50:01<00:00, 88.29s/batch, Val Loss=-0.508]


Epoch 1/3, Train Loss: -0.3052, Val Loss: -0.5084


Epoch 2/3 (Training):  13%|▋    | 18/134 [2:14:07<2:47:10, 86.47s/batch, Train Loss=-0.31]

In [None]:
# 修正了一些细节

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

class TextPairDataset(Dataset):
    def __init__(self, text_pairs):
        self.text_pairs = text_pairs

    def __len__(self):
        return len(self.text_pairs)

    def __getitem__(self, idx):
        text1, text2 = self.text_pairs[idx]
        return text1, text2

def fine_tune_model(model, train_texts, val_texts, num_epochs=3, batch_size=8, lr=5e-6, accumulate_grad_batches=1):
    train_dataset = TextPairDataset(train_texts)
    val_dataset = TextPairDataset(val_texts)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # optimizer = AdamW(model.parameters(), lr=lr)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)  # 添加 L2 正则化
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs // accumulate_grad_batches
    )

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} (Training)", unit="batch")
        for i, (sentences_a, sentences_b) in enumerate(train_progress_bar):
            encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
            encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

            sentence_embeddings_a = model(**encoded_input_a).last_hidden_state
            sentence_embeddings_b = model(**encoded_input_b).last_hidden_state

            loss = torch.mean(1 - torch.cosine_similarity(sentence_embeddings_a, sentence_embeddings_b, dim=1))
            loss = loss / accumulate_grad_batches
            loss.backward()
            if (i + 1) % accumulate_grad_batches == 0 or (i + 1) == len(train_dataloader):
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            train_loss += loss.item() * accumulate_grad_batches
            train_progress_bar.set_postfix({"Train Loss": train_loss / ((i + 1) * batch_size)})
            wandb.log({"train_loss": loss.item()})

        model.eval()
        val_loss = 0
        val_cos_sim = 0
        val_acc = 0
        val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} (Validation)", unit="batch")
        for sentences_a, sentences_b in val_progress_bar:
            encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
            encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

            with torch.no_grad():
                sentence_embeddings_a = model(**encoded_input_a).last_hidden_state
                sentence_embeddings_b = model(**encoded_input_b).last_hidden_state
                cos_sim = torch.cosine_similarity(sentence_embeddings_a, sentence_embeddings_b, dim=-1)
                val_cos_sim += cos_sim.sum().item()
                val_acc += (cos_sim >= 0.7).sum().item()
            loss = torch.mean(1 - cos_sim)
            val_loss += loss.item()
            val_progress_bar.set_postfix({"Val Loss": val_loss / (val_progress_bar.n + 1)})
            wandb.log({"val_loss": loss.item()})

        val_cos_sim /= len(val_dataset)
        val_acc /= len(val_dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_dataloader):.4f}, Val Loss: {val_loss/len(val_dataloader):.4f}, Val Cos Sim: {val_cos_sim:.4f}, Val Acc: {val_acc:.4f}")
        wandb.log({"val_cos_sim": val_cos_sim, "val_acc": val_acc})

    wandb.finish()
    return model

fine_tune_model(model, train_texts, val_texts, accumulate_grad_batches=4)

In [6]:
# 保存并上传
model.save_pretrained("./myModel/")
# model.push_to_hub("my_awesome_peft_model")
from huggingface_hub import push_to_hub

# 推送模型到 Hugging Face Hub
push_to_hub(model, "path/to/save/model", "your-username/your-model-repo-name")

In [10]:
# from huggingface_hub import push_to_hub
model.push_to_hub("Autumn/biobert-cosmetic-ingredients-similarity")

README.md:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/154k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Autumn/biobert-cosmetic-ingredients-similarity/commit/b8140e5bedfa42586af5baef80346969d739601c', commit_message='Upload model', commit_description='', oid='b8140e5bedfa42586af5baef80346969d739601c', pr_url=None, pr_revision=None, pr_num=None)

## 2.3 验证集对比原模型

In [15]:
model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=1, bias=False)
                  )
                 

In [8]:
# 导入必要的库
import torch
from tqdm import tqdm

model_origin = AutoModel.from_pretrained("gsarti/biobert-nli")
model_origin = model_origin.to(device)
print(model_origin)

def evaluate_original_model(model_origin, val_texts, tokenizer, device):
    """评估原始模型在验证集上的效果"""
    val_dataset = TextPairDataset(val_texts)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    model_origin.eval()
    val_loss = 0
    val_cos_sim = 0
    val_acc = 0
    with torch.no_grad():
        val_progress_bar = tqdm(val_dataloader, desc="Evaluating Original Model", unit="batch")
        for sentences_a, sentences_b in val_progress_bar:
            encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
            encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

            sentence_embeddings_a = model_origin(**encoded_input_a).last_hidden_state
            sentence_embeddings_b = model_origin(**encoded_input_b).last_hidden_state
            cos_sim = torch.cosine_similarity(sentence_embeddings_a, sentence_embeddings_b, dim=-1)
            val_cos_sim += cos_sim.sum().item()
            val_acc += (cos_sim >= 0.7).sum().item()

            loss = torch.mean(1 - cos_sim)
            val_loss += loss.item()
            val_progress_bar.set_postfix({"Val Loss": val_loss / (val_progress_bar.n + 1)})

    val_cos_sim /= len(val_dataset)
    val_acc /= len(val_dataset)
    print(f"Original Model: Val Loss: {val_loss/len(val_dataloader):.4f}, Val Cos Sim: {val_cos_sim:.4f}, Val Acc: {val_acc:.4f}")

    return val_loss/len(val_dataloader), val_cos_sim, val_acc

# 在新的 Jupyter Notebook 中调用评估函数
evaluate_original_model(model_origin, val_texts, tokenizer, device)

python(32070) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32071) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32072) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32073) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32074) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Evaluating Original Model:   0%|                                | 0/34 [00:00<?, ?batch/s]python(32075) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32077) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32078) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32079) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32080) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32083) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Evaluating Original Model:   3%|▏       | 1/34 [00:16<09:13, 16.76s/batch, Val Loss=0.483]python(32085) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32086) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(32087) MallocStackLogging: ca

Original Model: Val Loss: 0.4834, Val Cos Sim: 263.5462, Val Acc: 87.9700





(0.4834236730547512, 263.5462100139718, 87.97003745318352)

## 2.4. 下游任务

In [None]:
# 任务1. 计算固定每个成分间的相似度，根据相似度给出固定长度的link（之后做Neo4J 固定布局的图可视化，可能是圆形的结构？）

# 任务2. 使用模型讲用户输入的自然语言进行embedding，找到最匹配他的成分语言，给出对应相似成分的预测（）

#### 000 测试 000

In [31]:
print(model)
for name, param in model.named_parameters():
    print(name, param.shape)

# 打印一个样本输入和输出的形状
sample_text1, sample_text2 = train_texts[0]
print(f"Sample text1 shape: {sample_text1.shape}")
print(f"Sample text2 shape: {sample_text2.shape}")

outputs1 = model(sample_text1)
outputs2 = model(sample_text2)
print(f"Model output1 shape: {outputs1.shape}")
print(f"Model output2 shape: {outputs2.shape}")

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=1, bias=False)
                  )
                 

AttributeError: 'str' object has no attribute 'shape'

In [86]:
sentences_a, sentences_b = train_texts[:2]
print(len(sentences_a), len(sentences_b))
encoded_input_a = tokenizer(sentences_a, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
encoded_input_b = tokenizer(sentences_b, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

with torch.no_grad():
    outputs_a = model(**encoded_input_a)
    sentence_embeddings_a = outputs_a.last_hidden_state
    outputs_b = model(**encoded_input_b)
    sentence_embeddings_b = outputs_b.last_hidden_state

print(f"Model output1 shape: {sentence_embeddings_a.shape}")
print(f"Model output2 shape: {sentence_embeddings_b.shape}")
sentence_embeddings_a

2 2
Model output1 shape: torch.Size([2, 512, 768])
Model output2 shape: torch.Size([2, 512, 768])


tensor([[[ 0.8280, -0.9024,  0.1995,  ..., -0.3143, -0.1624,  0.9020],
         [ 0.9236, -0.4079, -0.0779,  ...,  0.4532, -0.5143,  0.3333],
         [ 0.5195, -0.7559,  0.2349,  ...,  0.6782, -0.2863,  0.1701],
         ...,
         [ 0.4462, -0.9198, -0.6070,  ...,  0.3245,  0.4255,  1.0480],
         [ 0.8107, -1.1048, -0.1713,  ...,  0.8964,  0.2540,  0.7170],
         [ 0.0654, -0.7277,  0.3448,  ...,  0.4545, -0.1554,  1.1606]],

        [[-1.3981, -0.0845,  0.1867,  ..., -0.8284,  0.6307,  0.5061],
         [-0.5423,  0.0880,  0.7182,  ..., -0.1797,  0.0468,  1.1006],
         [-0.9348,  0.1268,  0.0450,  ...,  0.1030, -0.1996,  0.3013],
         ...,
         [-0.3146, -0.5538, -0.2006,  ..., -0.1527,  0.1480,  0.1084],
         [-0.7362, -0.2626, -0.0669,  ..., -0.3275,  0.6866,  0.7149],
         [-1.4319,  0.0186, -0.1068,  ..., -0.7160,  0.5469,  0.6057]]],
       device='mps:0')

# 3. 杂项
## 3.1. gsarti/biobert-nli基本测试

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
model = AutoModel.from_pretrained("gsarti/biobert-nli")

sentences_a = [
    "This is the first sentence OH MY GOD!",
    "Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes.",
    "This is the third sentence."
]

sentences_b = [
    "That is the first sentence!",
    "Last year,studies have not shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes. Recent studies have shown that the dysregulation of the PI3K/AKT/mTOR signaling pathway plays a crucial role in the development and progression of various diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases, making it an attractive target for therapeutic interventions. However, the complexity and interconnectedness of this pathway pose challenges for the design of effective targeted therapies. Therefore, further research is needed to unravel the intricate mechanisms underlying the dysregulation of this pathway and to develop novel therapeutic strategies that can selectively modulate its components to achieve optimal clinical outcomes.",
    "This is the fifth sentence, OH MY GOD."
]

encoded_input_a = tokenizer(sentences_a, padding=True, truncation=True, max_length=512, return_tensors="pt")
encoded_input_b = tokenizer(sentences_b, padding=True, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    outputs_a = model(**encoded_input_a)
    sentence_embeddings_a = outputs_a.last_hidden_state
    outputs_b = model(**encoded_input_b)
    sentence_embeddings_b = outputs_b.last_hidden_state

In [2]:
sentence_embeddings_a.shape, sentence_embeddings_b.shape

(torch.Size([3, 512, 768]), torch.Size([3, 512, 768]))

In [12]:
outputs_a

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.8171e-01, -4.0024e-01,  5.6180e-01,  ...,  7.3958e-01,
           8.0139e-01,  1.1355e+00],
         [-9.8806e-02, -2.4155e-01, -1.1370e-01,  ...,  1.2160e+00,
           5.5963e-01,  1.3576e+00],
         [ 3.4458e-01, -3.8337e-01, -9.2877e-02,  ...,  1.1780e+00,
           5.2670e-01,  1.2173e+00],
         ...,
         [ 2.3720e-01, -6.5993e-01,  3.5448e-01,  ...,  1.0120e+00,
           1.2228e+00,  1.4106e+00],
         [ 2.2951e-01, -6.8032e-01,  2.9290e-01,  ...,  1.0224e+00,
           1.3131e+00,  1.4847e+00],
         [-1.8171e-01, -4.0024e-01,  5.6180e-01,  ...,  7.3958e-01,
           8.0139e-01,  1.1355e+00]],

        [[-4.7639e-01,  8.0986e-01, -7.7809e-01,  ..., -1.0216e+00,
           4.5359e-01, -3.7967e-02],
         [-5.1692e-01,  1.0944e+00, -2.5708e-01,  ..., -6.3311e-01,
           3.4273e-01,  3.5301e-01],
         [-7.6363e-01,  1.2261e+00, -1.9415e-01,  ..., -5.1874e-01,
           6.

In [6]:
import torch.nn.functional as F

# 定义相似度指标函数
def sim_metric(output1, output2):
    return F.cosine_similarity(output1, output2, dim=-1).mean()

sim_metric(sentence_embeddings_a, sentence_embeddings_b)

tensor(0.6890)

### 大模型的属性

In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
model = AutoModel.from_pretrained("gsarti/biobert-nli")

# Check model attributes
# print(dir(model))

# 打印更多模型属性
print(f"model.config: {model.config}")
print(f"model.base_model: {model.base_model}")
print(f"model.encoder: {model.encoder}")

model.config: BertConfig {
  "_name_or_path": "gsarti/biobert-nli",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "eos_token_ids": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

model.base_model: BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)

In [6]:
# 加载预训练模型
tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
model = AutoModel.from_pretrained("gsarti/biobert-nli")

# 打印模型可接受的参数
print(model.forward.__code__.co_varnames)

('self', 'input_ids', 'attention_mask', 'token_type_ids', 'position_ids', 'head_mask', 'inputs_embeds', 'encoder_hidden_states', 'encoder_attention_mask', 'past_key_values', 'use_cache', 'output_attentions', 'output_hidden_states', 'return_dict', 'input_shape', 'batch_size', 'seq_length', 'device', 'past_key_values_length', 'buffered_token_type_ids', 'buffered_token_type_ids_expanded', 'extended_attention_mask', 'encoder_batch_size', 'encoder_sequence_length', '_', 'encoder_hidden_shape', 'encoder_extended_attention_mask', 'embedding_output', 'encoder_outputs', 'sequence_output', 'pooled_output')


# 3.2. LoRA参数
### 基本参数

In [33]:
from peft import LoraConfig, TaskType

# 设置LoRA配置
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1
)

In [32]:
for taskType in TaskType: 
    print(taskType)

TaskType.SEQ_CLS
TaskType.SEQ_2_SEQ_LM
TaskType.CAUSAL_LM
TaskType.TOKEN_CLS
TaskType.QUESTION_ANS
TaskType.FEATURE_EXTRACTION


- TaskType.SEQ_CLS: 序列分类任务,即给定一个输入序列,预测整个序列的类别标签。例如情感分析、主题分类等。
- TaskType.***SEQ_2_SEQ_LM***: 序列到序列的语言模型任务,即给定一个输入序列,生成一个输出序列。例如机器翻译、摘要生成等。
- TaskType.CAUSAL_LM: 是因果语言模型任务,即给定前一个词,预测下一个词。这种模型通常用于生成任务,如文本续写、对话生成等。
- TaskType.TOKEN_CLS: token分类任务,即给定一个输入序列,对序列中的每个token进行分类。例如命名实体识别、词性标注等。
- TaskType.QUESTION_ANS: 问答任务,即给定一个问题和相关的背景文本,预测问题的答案。
- TaskType.FEATURE_EXTRACTION: 这不是一个具体的任务类型,而是用于表示语言模型可以作为特征提取器使用的通用类型。

### 对比大模型引入LoRA前后需要训练参数数量

In [34]:
from transformers import AutoModelForSequenceClassification

# 检查是否可以使用 MPS 设备
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS device")
else:
    device = torch.device('cpu')
    print("Using CPU")
    
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# 计算可训练参数数量
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'原始大模型的可训练参数数量: {trainable_params}')

Using MPS device


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gsarti/biobert-nli and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


原始大模型的可训练参数数量: 108311810


In [35]:
# 将A、B矩阵插入大模型
from peft import get_peft_model
model = get_peft_model(model, lora_config).to(device)

# 计算可训练参数数量
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'引入LoRA后大模型的可训练参数数量为: {trainable_params}')

引入LoRA后大模型的可训练参数数量为: 296450


## 3.3 Wandb测试

In [None]:
import wandb
api = wandb.Api()

run = api.run("autumndyer/BioBERT_LoRA/<run_id>")
run.config["key"] = updated_value
run.update()

In [1]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="BioBERT_LoRA_similarity",
    
    # track hyperparameters and run metadata
    config={
    # "learning_rate": 0.02,
    "architecture": "BioBERT",
    # "dataset": "CIFAR-100",
    "epochs": 3,
    }
)

# # simulate training
# epochs = 10
# offset = random.random() / 5
# for epoch in range(2, epochs):
#     acc = 1 - 2 ** -epoch - random.random() / epoch - offset
#     loss = 2 ** -epoch + random.random() / epoch + offset
    
#     # log metrics to wandb
#     wandb.log({"acc": acc, "loss": loss})
    
# # [optional] finish the wandb run, necessary in notebooks
# wandb.finish()


[34m[1mwandb[0m: Currently logged in as: [33mautumndyer[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168125921782728, max=1.0…