*Step.1 導入相關套件*

In [None]:
!pip install bitsandbytes==0.42.0

In [None]:
!pip install -U bitsandbytes

In [None]:
from huggingface_hub import notebook_login
notebook_login("")

In [None]:
!pip install datasets

In [6]:
from datasets import Dataset , load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

*Step.2 載入數據*

In [None]:
ds = Dataset.load_from_disk("./alpaca_data_zh/")
ds

In [None]:
ds[:3]

*Step.3 數據前處理*

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer

In [12]:
tokenizer.pad_token_id = 2

In [13]:
def process_func(example):
    MAX_LENGTH = 1024 # Llama會將中文切成多個token,因此需要增大長度,確保數據完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer("\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: " , add_special_tokens=False)
    response = tokenizer(example["output"], add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)
tokenized_ds

In [39]:
print(tokenized_ds[0]["input_ids"])

In [None]:
tokenizer.decode(tokenized_ds[1]["input_ids"])

In [38]:
# tokenizer("哈" , add_special_tokens=False) # Llama會將中文切成多個token,因此需要增大長度,確保數據完整性

In [None]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds[1]["labels"])))

*Step.4 模型建立*

In [41]:
import torch
# 多張顯卡可以去掉device_map="auto"，避免模型拆分
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                        low_cpu_mem_usage=True,
                        torch_dtype=torch.bfloat16,
                        device_map="auto",
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.bfloat16,
                        bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
for name, param in model.named_parameters():
    print(name, param.shape, param.dtype)

In [19]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false

# Lora
# PEFT Step1 配置文件

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(task_type=TaskType.CAUSAL_LM,)
config

# PEFT Step2 建立模型

In [30]:
model = get_peft_model(model, config)

In [31]:
model.enable_input_require_grads() # <-- 如果在args要用gradient_checkpoint要執行

In [None]:
# model = model.half() 全模型fp16 , args中adam_epsilon要設定大一點

In [None]:
# 觀察模型正向傳播的收斂情況
# from torch.utils.data import DataLoader
# dl = DataLoader(tokenized_ds , batch_size = 2 ,collate_fn = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True))
# ipt = next(enumerate(dl))[1]
# ipt

In [None]:
config

In [34]:
model.print_trainable_parameters()

*Step.5 訓練參數*

In [None]:
args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    logging_steps=10,
    num_train_epochs=1,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit"
)

*Step.6 訓練器*

In [None]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset=tokenized_ds.select(range(6000)),
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),

)

*Step.7 模型訓練*

In [None]:
trainer.train()

In [None]:
model.eval()
ipt = tokenizer("Human: {}\n{}".format("你好", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**ipt, max_length=512, do_sample=True, eos_token_id=tokenizer.eos_token_id)[0], skip_special_tokens=True)