<a href="https://colab.research.google.com/github/WilsLoki/test/blob/main/Fine_Tuning_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install torch
!pip install peft

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [2]:
import numpy as np
import evaluate
import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

## Dataset

In [3]:
# 构建数据（训练集+验证集）

imdb_dataset = load_dataset("stanfordnlp/imdb")

# 定义样本大小
N = 1000

# 生成随机索引
rand_idx = np.random.randint(24999, size=N)

# 提取训练数据（标签+文本）
x_train = imdb_dataset['train'][rand_idx]['text']
y_train = imdb_dataset['train'][rand_idx]['label']

# 提取测试数据
x_test = imdb_dataset['test'][rand_idx]['text']
y_test = imdb_dataset['test'][rand_idx]['label']

# 创建新的数据集
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# 验证数据（计算训练数据集中标签为 1 的比例）
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.547

## Model

In [5]:
# 加载初始模型

# 选择预训练模型
model_checkpoint = 'distilbert-base-uncased'

# 定义标签映射（实现数字和文本之间的相互转换）
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# 创建模型
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 显示模型架构
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Preprocessing

In [7]:
# 创建分词器
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# 添加填充标记
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# 定义分词函数

def tokenize_function(examples):
    # 提取文本
    text = examples["text"]
    # 设置截断方向为左侧
    tokenizer.truncation_side = "left"
    # 分词处理
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512,  # 设置序列的最大长度为512，超过这个长度的文本会被截断
        padding='max_length' # 对短文本进行填充，填充到max_length指定的长度，使所有文本长度一致
    )

    # 返回统一长度的numpy数组
    return tokenized_inputs

In [9]:
# 对数据集中的样本执行分词处理 tokenize_function
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [10]:
# 定义准确率函数

metrics = evaluate.load("accuracy")

# 定义计算准确率的函数（评估时计算准确率）
def compute_accuracy(model, dataset, tokenizer):

    # 将模型设置为评估模式
    model.eval()

    # 初始化准确率评估指标 accuracy metric
    accuracy_metric = evaluate.load("accuracy")

    # 从dataset中提取input_ids、attention_mask和labels，并转换为tensor
    input_ids = torch.tensor(dataset['input_ids']).to(model.device)
    attention_mask = torch.tensor(dataset['attention_mask']).to(model.device)
    labels = torch.tensor(dataset['label']).to(model.device)

    # 不计算梯度，提升计算效率
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        # 更新准确率评估指标 accuracy metric
        accuracy_metric.add_batch(predictions=predictions, references=labels)

    # 计算并返回最终准确率
    accuracy = accuracy_metric.compute()["accuracy"]
    return accuracy

# 计算验证集上的准确率（验证函数正常运行&测试未训练模型表现）
subset_size = 100
subset_indices = range(subset_size)
subset_data = tokenized_dataset["validation"].select(subset_indices)
accuracy_result = compute_accuracy(model, subset_data, tokenizer)
print("Validation Accuracy:", accuracy_result)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Validation Accuracy: 0.42


In [11]:
# 重新初始化模型
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# 创建数据整理器
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# 定义评估函数

# 加载准确率评估指标
accuracy = evaluate.load("accuracy")

# 定义评估函数（训练中计算准确率）
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [14]:
# 加载未训练的模型，用于对比
model_untrained = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# 测试几个示例文本
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

# 使用未训练模型进行预测
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model_untrained(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


## Train Model

In [15]:
# 配置LoRA参数
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=16,
                        lora_alpha=32,
                        lora_dropout=0.05,
                        target_modules = ['q_lin', 'v_lin'])

print(f"lora_dropout的值是: {peft_config.lora_dropout}")
print(f"target_modules的值是: {peft_config.target_modules}")

lora_dropout的值是: 0.05
target_modules的值是: {'v_lin', 'q_lin'}


In [16]:
# 打印配置
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules={'v_lin', 'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [17]:
# 将模型与LoRA配置结合
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # 打印可训练的参数数量

trainable params: 887,042 || all params: 67,842,052 || trainable%: 1.3075


In [18]:
# 设置超参数
lr = 1e-3
batch_size = 4
num_epochs = 10

In [19]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    #warmup_ratio = 0,
    #lr_scheduler_type = "cosine",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.03,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

print(f"Learning Rate: {training_args.learning_rate}")
#print(f"Warm up Ratio: {training_args.warmup_ratio}")
#print(f"Learning Rate Scheduler Type: {training_args.lr_scheduler_type}")
print(f"Weight Decay: {training_args.weight_decay}")
print(f"Train Epochs: {training_args.num_train_epochs}")
print(f"Batch Size: {training_args.per_device_train_batch_size}")

Learning Rate: 0.001
Weight Decay: 0.03
Train Epochs: 10
Batch Size: 4




In [20]:
# 创建训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 开始训练
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m2441337048[0m ([33m2441337048-company[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.540147,{'accuracy': 0.853}
2,0.439200,0.472505,{'accuracy': 0.862}
3,0.439200,0.483676,{'accuracy': 0.878}
4,0.230600,0.710353,{'accuracy': 0.876}
5,0.230600,0.941337,{'accuracy': 0.881}
6,0.044800,0.952836,{'accuracy': 0.883}
7,0.044800,0.981899,{'accuracy': 0.885}
8,0.034800,1.002099,{'accuracy': 0.884}
9,0.034800,1.039562,{'accuracy': 0.885}
10,0.005300,1.013558,{'accuracy': 0.884}


Trainer is attempting to log a value of "{'accuracy': 0.853}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.862}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.878}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.876}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.15092533054351806, metrics={'train_runtime': 597.3311, 'train_samples_per_second': 16.741, 'train_steps_per_second': 4.185, 'total_flos': 1351923916800000.0, 'train_loss': 0.15092533054351806, 'epoch': 10.0})

### Generate Prediction

In [21]:
# 对示例文本生成预测（和未训练模型的output对比）

model.to('cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")
    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


## Hugging Face

In [22]:
# 登录Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
# 上传模型至 Hugging Face Hub
hf_name = 'WillLoki'
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification"
model.push_to_hub(model_id) # 保存模型
trainer.push_to_hub(model_id) # 保存训练器

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67c7ba06-0a7e2d1b10a2527b797c9138;d3070495-e204-462f-b76b-750854c0c362)

Invalid username or password.

In [None]:
# 从 Hugging Face Hub 加载模型配置

config = PeftConfig.from_pretrained(model_id)
# 加载模型
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# 加载LoRA微调后的模型
model = PeftModel.from_pretrained(inference_model, model_id)

In [None]:
# 模型推理（验证上传到HF后的模型是否能够正确地进行推理)
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")
    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])