<a href="https://colab.research.google.com/github/SparKgod1/Skills-and-Expertise/blob/main/Lora%EF%BC%9AFintune_bloom_1b1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Downloading Dependencies


In [None]:
# Install the supporting libraries from PyPI, then PEFT and Transformers straight from their GitHub repositories.
# NOTE(review): nothing here is version-pinned, so a later re-run may install different releases.
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git
# Alternative kept for reference: upgrade the released packages instead of installing from source.
# !pip install --upgrade peft
# !pip install --upgrade transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.tom

## Loading Pre-Trained Model


In [None]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Single source of truth for the checkpoint so the model and tokenizer loads cannot drift apart.
# Other sizes that were tried: "bigscience/bloom-3b", "bigscience/bloom-560m".
MODEL_NAME = "bigscience/bloom-1b1"

# Load the base model; downloaded weights are cached under the mounted Drive.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    cache_dir='/content/drive/MyDrive/mdl'
)

# Load the matching tokenizer and register the two utterance delimiters used by the prompt template.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
special_tokens = {'additional_special_tokens':['<|beginofutterance|>', '<|endofutterance|>']}
tokenizer.add_special_tokens(special_tokens)

# The delimiters receive new token ids. Grow the embedding matrix only if it does not
# already have a row for every id, so the model never indexes past its embeddings.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))
# NOTE(review): the rows for the new tokens start untrained; the LoRA config in this notebook
# targets only "query_key_value" — confirm that leaving the embeddings frozen is intended.

# End-of-utterance marker. add_special_tokens above already put it in the vocabulary,
# so a separate add_tokens call would be a no-op and is omitted.
end_token_str = "<|endofutterance|>"

# Look up the marker's token id.
end_token_id = tokenizer.convert_tokens_to_ids(end_token_str)

# Make the marker the model's end-of-sequence token.
model.config.eos_token_id = end_token_id

## Setting up LoRA


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 8, alpha 8, dropout 0.05, bias="none",
# applied to the "query_key_value" modules for a causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded base model with the adapters described by `config`.
peft_model = get_peft_model(model, config)

## Printing Trainable Parameter Difference

In [None]:
# Collect (element count, requires_grad) for every parameter tensor once,
# then derive the overall and trainable totals from that list.
param_stats = [
    (tensor.numel(), tensor.requires_grad)
    for _name, tensor in peft_model.named_parameters()
]
all_param = sum(count for count, _needs_grad in param_stats)
trainable_params = sum(count for count, needs_grad in param_stats if needs_grad)

print(f"trainable params: {trainable_params}")
print(f"all params: {all_param}")
print(f"trainable: {100 * trainable_params / all_param:.2f}%")

trainable params: 1179648
all params: 1066493952
trainable: 0.11%


## Loading Dataset

In [None]:
import json

from datasets import load_dataset

# Location of the JSON dataset file on the mounted Drive.
json_file_path = '/content/drive/MyDrive/HC3_Chinese_ChatGPT.json'

# Load the file through the generic 'json' dataset loader.
qa_dataset = load_dataset('json', data_files=json_file_path)

# Show the loaded splits, column names, and row counts.
print(qa_dataset)

# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 17925
    })
})


## Re-Formatting


In [None]:
def create_prompt(context, question, answer):
  """Render one training example as a three-turn dialogue string.

  The turns are, in order: system (``context``), user (``question``) and
  assistant (``answer``). Each turn opens with ``<|beginofutterance|>`` plus
  the speaker label and closes with ``<|endofutterance|>`` and a newline.
  The system and user texts sit on their own line; the assistant answer is
  followed directly by the end marker.

  Args:
    context: Text placed in the system turn.
    question: Text placed in the user turn.
    answer: Text placed in the assistant turn.

  Returns:
    The assembled prompt string.
  """
  begin, end = "<|beginofutterance|>", "<|endofutterance|>"
  system_turn = f"{begin}系统\n{context}\n{end}\n"
  user_turn = f"{begin}用户\n{question}\n{end}\n"
  assistant_turn = f"{begin}智能助手\n{answer}{end}\n"
  return system_turn + user_turn + assistant_turn

def _tokenize_sample(sample):
  """Build the prompt for one dataset row with create_prompt and tokenize it."""
  prompt_text = create_prompt(sample['instruction'], sample['input'], sample['output'])
  return tokenizer(prompt_text)

# Apply the tokenization to every row of every split.
mapped_qa_dataset = qa_dataset.map(_tokenize_sample)

## Training our LoRA model


In [None]:
import transformers

torch.cuda.empty_cache()

# Run configuration: 1000 optimizer steps at batch size 1 without gradient accumulation,
# 100 warmup steps, learning rate 1e-4, fp16 enabled, a log line every 100 steps,
# and outputs written to Drive.
training_args = transformers.TrainingArguments(
    output_dir='/content/drive/MyDrive/outputs_1',
    max_steps=1000,
    # num_train_epochs=10,
    warmup_steps=100,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=True,
    logging_steps=100,
)

# Language-modeling collator with masked-LM disabled (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=mapped_qa_dataset["train"],
    data_collator=causal_lm_collator,
)
torch.cuda.empty_cache()
# peft_model.config.use_cache = False  # silence the warnings.
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,3.8533
200,2.9246
300,2.7847
400,2.7284
500,2.707
600,2.6232
700,2.5771
800,2.6102
900,2.6141
1000,2.5339


TrainOutput(global_step=1000, training_loss=2.7956449127197267, metrics={'train_runtime': 144.871, 'train_samples_per_second': 6.903, 'train_steps_per_second': 6.903, 'total_flos': 557438661697536.0, 'train_loss': 2.7956449127197267, 'epoch': 0.06})

## Saving Locally


In [None]:
# NOTE(review): `model_id` is not used in this cell, and its "1800steps" suffix does not match
# the max_steps=1000 training run in this notebook — confirm before relying on it.
model_id = "BLOOM-1b1-LoRA_1800steps"

# Write the trained LoRA adapter to Drive.
peft_model.save_pretrained("/content/drive/MyDrive/BLOOM1111111111")

# Also write the tokenizer (including the added utterance delimiters). The test cell reloads
# it from this exact path, and nothing else in the notebook creates that directory first.
tokenizer.save_pretrained("/content/drive/MyDrive/tokenizer")

## Merge model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

torch.cuda.empty_cache()

# Both loads below request half precision.
half_precision = torch.float16

# Fresh copy of the base checkpoint to merge the adapter into.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b1",
    torch_dtype=half_precision,
    device_map="auto",
)

# Attach the saved LoRA adapter, fold it into the base weights, and drop the PEFT wrapper.
model = PeftModel.from_pretrained(
    model,
    "/content/drive/MyDrive/BLOOM1111111111",
    torch_dtype=half_precision,
).merge_and_unload()

# Write the merged model to Drive.
model.save_pretrained('/content/drive/MyDrive/_1')

## Testing the Fine-Tuned Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.cuda.empty_cache()

# Unmodified base checkpoint, used as the comparison baseline.
model_raw = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b1",
    device_map="auto",
    torch_dtype=torch.float16,
)

# Merged fine-tuned model. This is the directory the "Merge model" cell writes to;
# the previous path ("model_finetuened") is not produced anywhere in this notebook.
model_finetuned = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/_1",
    device_map="auto",
    torch_dtype=torch.float16,
)
# No explicit .to("cuda") here: device_map="auto" has already placed the weights,
# and moving a dispatched model by hand is redundant.

# Reload the tokenizer, make sure the utterance delimiters are registered, and save it back.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/tokenizer")
special_tokens = {'additional_special_tokens':['<|beginofutterance|>', '<|endofutterance|>']}
tokenizer.add_special_tokens(special_tokens)
tokenizer.save_pretrained("/content/drive/MyDrive/tokenizer")

# End-of-utterance marker. add_special_tokens above already put it in the vocabulary,
# so a separate add_tokens call would be a no-op and is omitted.
end_token_str = "<|endofutterance|>"

# Look up the marker's token id.
end_token_id = tokenizer.convert_tokens_to_ids(end_token_str)

# Make the marker the end-of-sequence token for both models.
model_finetuned.config.eos_token_id = end_token_id
model_raw.config.eos_token_id = end_token_id


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate_answer(lm, tok, encoded_prompt, stop_token_ids, max_length=100):
    """Generate a continuation of the encoded prompt and decode it to text.

    Args:
        lm: Causal language model exposing ``generate``.
        tok: Tokenizer used to decode the generated ids.
        encoded_prompt: Tokenized prompt, unpacked into ``generate`` as keyword arguments.
        stop_token_ids: Token id (or list of ids) that ends generation.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        A ``(generated_ids, decoded_text)`` tuple; the text is decoded from the
        first sequence with special tokens skipped.
    """
    generated_ids = lm.generate(**encoded_prompt, max_length=max_length, eos_token_id=stop_token_ids)
    return generated_ids, tok.decode(generated_ids[0], skip_special_tokens=True)


context = "请回答下面的问题"
question = "哈工大有多少杰出人才"

# Same layout as the training template: no leading newline, and the prompt stops right
# after the assistant header so the model writes the answer.
prompt = f"<|beginofutterance|>系统\n{context}\n<|endofutterance|>\n<|beginofutterance|>用户\n{question}\n<|endofutterance|>\n<|beginofutterance|>智能助手\n"
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False)
batch = batch.to("cuda")

# Stop on the end-of-utterance marker as well as the tokenizer's own EOS. Passing only
# tokenizer.eos_token_id overrode the marker set on the model configs, so generation
# kept going until max_length.
stop_token_ids = [end_token_id, tokenizer.eos_token_id]

# Answer from the original pre-trained model.
outputs_raw, answer_raw = generate_answer(model_raw, tokenizer, batch, stop_token_ids)
print("原始预训练模型回答:", answer_raw)

# Answer from the fine-tuned model.
outputs_finetuned, answer_finetuned = generate_answer(model_finetuned, tokenizer, batch, stop_token_ids)
print("微调模型回答:", answer_finetuned)

原始预训练模型回答: 
系统
请回答下面的问题

用户
哈工大有多少杰出人才

智能助手
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问一下
问
微调模型回答: 
系统
请回答下面的问题

用户
哈工大有多少杰出人才

智能助手
哈工大是我国著名的高等院校，拥有众多的杰出人才。哈工大在人才培养方面具有独特的优势，在人才培养方面具有丰富的经验。哈工大在人才培养方面具有独特的优势，在人才培养方面具有丰富的经验。在哈工大，学生可以获得系统的教育，包括基础教育、专业教育、职业教育、
