In [None]:
import os

# Redirect the Hugging Face and ModelScope caches to directories under
# /root/autodl-tmp. This cell runs before the transformers import further down.
os.environ.update({
  "HF_HOME": "/root/autodl-tmp/HF_download",
  "MODELSCOPE_CACHE": "/root/autodl-tmp/MODELSCOPE_download",
})
# Optional Hugging Face mirror endpoint, currently disabled:
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [None]:
# Send both HTTP and HTTPS traffic through the same local proxy.
for proxy_var in ("http_proxy", "https_proxy"):
  os.environ[proxy_var] = "http://127.0.0.1:7890"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the local ModelScope copy of the Llama-2-7B checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
  "/root/autodl-tmp/MODELSCOPE_download/models/modelscope/Llama-2-7b-ms",
  trust_remote_code=True,
)

In [None]:
from datasets import Dataset

# Load the instruction dataset from a local directory. process_function reads
# the "instruction", "input" and "output" fields of each record.
# NOTE(review): the variable is named `datasets`, like the library package;
# only `Dataset` is imported in this cell, so nothing is shadowed here.
datasets = Dataset.load_from_disk("/root/autodl-tmp/code/test-transformers/data/alpaca_data_zh")

In [None]:
# Pad batches on the right-hand side of each sequence.
tokenizer.padding_side = "right"

In [None]:
# Use token id 2 as the padding token.
# NOTE(review): 2 is presumably this tokenizer's EOS id (tokenizer.eos_token_id) - confirm.
tokenizer.pad_token_id = 2

In [None]:
def process_function(example):
  """Convert one instruction record into causal-LM training features.

  The prompt is "Human: " + instruction, a newline, then the input (the joined
  text is stripped, so an empty input leaves no trailing newline), followed by
  a blank line and "Assistant: ". The target is the stripped output followed by
  the tokenizer's EOS id. Prompt positions get the label -100 (the value the
  Hugging Face loss conventionally ignores), so only the response and EOS
  carry real labels.

  Samples longer than MAX_LENGTH tokens are truncated rather than dropped, so
  every call returns the same three keys and Dataset.map sees a consistent
  schema. A truncated sample may lose its trailing EOS token.

  Args:
    example: mapping with string fields "instruction", "input" and "output".

  Returns:
    dict with "input_ids", "attention_mask" and "labels": three lists of
    equal length, each at most MAX_LENGTH long.
  """
  MAX_LENGTH = 1024
  prompt_text = "\n".join(["Human: " + example["instruction"].strip(), example["input"].strip()]).strip() + "\n\nAssistant: "
  # Special tokens are disabled for both parts; EOS is appended by hand below.
  instruction = tokenizer(prompt_text, add_special_tokens=False)
  response = tokenizer(example["output"].strip(), add_special_tokens=False)
  input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.eos_token_id]
  attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
  labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.eos_token_id]
  # Slicing is a no-op for short samples and truncates long ones. Returning {}
  # here (the previous behaviour) produced rows without any feature columns.
  return {
    "input_ids": input_ids[:MAX_LENGTH],
    "attention_mask": attention_mask[:MAX_LENGTH],
    "labels": labels[:MAX_LENGTH],
  }

In [None]:
# Run process_function on every record (one example per call) and remove all
# original columns, leaving only the features the function returns.
tokenized_datasets = datasets.map(process_function, remove_columns=datasets.column_names)

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# bitsandbytes settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16,
)

# Options for loading the local Llama-2-7B checkpoint with the quantization
# config above; device placement is left to device_map="auto".
load_kwargs = dict(
  quantization_config=bnb_config,
  dtype=torch.bfloat16,
  device_map="auto",
  low_cpu_mem_usage=True,
  trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
  "/root/autodl-tmp/MODELSCOPE_download/models/modelscope/Llama-2-7b-ms",
  **load_kwargs,
)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA configuration for causal language modelling. Only the task type is set
# explicitly; every other option keeps whatever default peft provides.
config = LoraConfig(task_type=TaskType.CAUSAL_LM)

In [None]:
# Wrap the loaded model with the LoRA configuration from `config`.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the
# cell without reloading would pass the already-wrapped model back in.
model = get_peft_model(model, config)

In [None]:
# Cast every submodule whose qualified name contains "lora_" to bfloat16;
# all other submodules are left untouched.
for name, module in model.named_modules():
  if "lora_" not in name:
    continue
  module.to(torch.bfloat16)

In [None]:
# Report the model's trainable-parameter summary after the LoRA wrapping.
model.print_trainable_parameters()

In [None]:
model.enable_input_require_grads() # this method must be called when gradient checkpointing is enabled

In [None]:
from transformers import TrainingArguments

# Keyword arguments for the 4-bit LoRA fine-tuning run, collected in one place.
training_kwargs = dict(
    output_dir="/root/autodl-tmp/code/test-transformers/test-kbit/4bit-training/llama-chatbot",
    # 4 sequences per device x 4 accumulation steps = 16 sequences per optimizer step per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_steps=10,
    report_to=["tensorboard"],
    optim="paged_adamw_32bit",
    # Original author's note: about 14 GB of GPU memory with this off, about 7 GB with it on.
    gradient_checkpointing=True,
    # adam_epsilon=1e-4,  # original author's note: when training in fp16, raise epsilon to avoid overflow
)

args = TrainingArguments(**training_kwargs)

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq

# Train on at most the first 6000 tokenized samples. Clamping the range to the
# dataset length keeps select() from raising when fewer rows are available.
train_subset = tokenized_datasets.select(range(min(6000, len(tokenized_datasets))))

# The collator pads each batch dynamically (padding=True) using the tokenizer
# configured in the earlier cells.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_subset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [None]:
trainer.train()

In [None]:
from transformers import pipeline

# Build a text-generation pipeline around the LoRA-wrapped model and the
# tokenizer configured earlier in the notebook.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Query the pipeline with a prompt built the same way as in process_function:
# "Human: <instruction>\n<input>" stripped, then a blank line and "Assistant: ".
# The input part is empty here.
pipe("Human: {}\n{}".format("保持健康的三个提示。", "").strip() + "\n\nAssistant: ")

In [None]:
# Same question with the prompt written out literally.
# NOTE(review): this prompt ends with two spaces after "Assistant:", whereas the
# training prompt and the previous cell use one - confirm whether that is intentional.
pipe("Human: 保持健康的三个提示。\n\nAssistant:  ")

In [None]:
# Switch gradient checkpointing back off on the model before the manual
# generate() call in the next cell.
# NOTE(review): presumably done so generation is not affected by the training-time setting - confirm.
model.gradient_checkpointing_disable()

In [None]:
# Put the model in eval mode, then sample one completion for the test prompt.
model.eval()
prompt = "Human: {}\n{}".format("保持健康的三个提示。", "").strip() + "\n\nAssistant: "
# Tokenize as PyTorch tensors and move them to the model's device.
ipt = tokenizer(prompt, return_tensors="pt").to(model.device)
# Sampling is enabled; generation stops at EOS or at 256 total tokens.
generated = model.generate(
  **ipt,
  max_length=256,
  do_sample=True,
  eos_token_id=tokenizer.eos_token_id,
)
# Decode the first (only) returned sequence; this is the cell's displayed output.
tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Decode the first tokenized training sample back to text as a sanity check.
# Indexing the row first reads a single example; indexing the column first
# would load every row's input_ids just to keep one.
tokenizer.decode(tokenized_datasets[0]["input_ids"])