## 安装依赖

In [None]:
# Install the pinned fine-tuning stack. The explicit %pip magic installs into
# the environment of the running kernel and does not depend on automagic.
%pip install -q -U accelerate==0.23.0 peft==0.5.0 bitsandbytes==0.41.1 transformers==4.34 trl==0.7.2

## 微调

In [1]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    CodeLlamaTokenizer,
    default_data_collator, 
    Trainer, 
    TrainingArguments,
    TrainerCallback,
    BitsAndBytesConfig,
    AutoTokenizer,
)
from contextlib import nullcontext
from tqdm import tqdm
import json
import copy
import datasets
from peft import LoraConfig, PeftConfig
from transformers import default_data_collator, Trainer

  torch.utils._pytree._register_pytree_node(


In [2]:
import subprocess
import os

# Source /etc/network_turbo inside a bash subshell, then mirror every
# "NAME=value" environment line that mentions "proxy" into this process.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Lines without "=" are skipped; the split happens on the first "=" only,
# so values that themselves contain "=" are kept intact.
os.environ.update(
    entry.split('=', 1)
    for entry in output.splitlines()
    if '=' in entry
)

In [3]:
# Checkpoint identifier handed to from_pretrained, and the dataset identifier
# handed to datasets.load_dataset by the preprocessing function.
model_name = "CodeLlama-7b-hf"
dataset_id = "data" # "wangrongsheng/CodeAlpaca_20K"

# 4-bit NF4 quantization with double quantization.
# The keyword must be spelled exactly `bnb_4bit_compute_dtype`:
# BitsAndBytesConfig tolerates unknown keyword arguments, so the previous
# misspelling (`bnb_4bit_compute_dtyp`) was silently ignored and the requested
# bfloat16 compute dtype never took effect.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base checkpoint using the quantization settings in `bnb_config`;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Turn off the generation cache flag on the model config for training.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def get_preprocessed_cmg_history(
    dataset_id,
    tokenizer,
    split,
    max_prompt_tokens=200,
    max_message_tokens=400,
    total_length=601,
):
    """Load a prompt/completion dataset and tokenize it into fixed-length samples.

    Every row of the loaded split must provide ``prompt`` and ``completion``
    fields. Each row is turned into a dict with three equally long lists:

    * ``input_ids``: BOS + prompt tokens, completion tokens + EOS, then padding.
    * ``attention_mask``: 1 for prompt/completion tokens, 0 for padding.
    * ``labels``: the completion token ids; prompt and padding positions are
      set to -100 so they are excluded from the loss.

    Args:
        dataset_id: Identifier passed to ``datasets.load_dataset``.
        tokenizer: Tokenizer exposing ``encode``, ``bos_token``, ``eos_token``,
            ``eos_token_id`` and ``pad_token_id``.
        split: Dataset split name passed to ``datasets.load_dataset``.
        max_prompt_tokens: Truncation limit for the encoded prompt (BOS included).
        max_message_tokens: Truncation limit for the encoded completion. The EOS
            marker is appended before truncation, so a completion that hits the
            limit loses its EOS token.
        total_length: Length every sample is padded to.

    Returns:
        The mapped dataset containing only ``input_ids``, ``attention_mask``
        and ``labels``.

    Raises:
        ValueError: If ``total_length`` cannot hold a maximally long prompt
            plus a maximally long completion.
    """
    if total_length < max_prompt_tokens + max_message_tokens:
        raise ValueError(
            "total_length must be at least max_prompt_tokens + max_message_tokens"
        )

    dataset = datasets.load_dataset(dataset_id, split=split)

    def apply_prompt_template(sample):
        # Keep only the two text fields, renaming "completion" to "message".
        return {
            "prompt": sample["prompt"],
            "message": sample["completion"],
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

    # Fall back to EOS when the tokenizer has no dedicated padding token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    def tokenize_add_label(sample):
        prompt = tokenizer.encode(
            tokenizer.bos_token + sample["prompt"],
            add_special_tokens=False,
            max_length=max_prompt_tokens,
            truncation=True,
        )
        message = tokenizer.encode(
            sample["message"] + tokenizer.eos_token,
            add_special_tokens=False,
            max_length=max_message_tokens,
            truncation=True,
        )

        real_len = len(prompt) + len(message)
        # Non-negative thanks to the check at the top of the function.
        pad_len = total_length - real_len

        return {
            "input_ids": prompt + message + [pad_id] * pad_len,
            # Padding positions must not be attended to.
            "attention_mask": [1] * real_len + [0] * pad_len,
            # Only completion tokens contribute to the loss.
            "labels": [-100] * len(prompt) + message + [-100] * pad_len,
        }

    return dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

In [5]:
# Build the tokenized training split from `dataset_id` with the tokenizer loaded earlier.
train_dataset = get_preprocessed_cmg_history(dataset_id, tokenizer, 'train')

In [6]:
# Sanity check: show which columns the first tokenized sample carries.
train_dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [7]:
def create_peft_config(model):
    """Wrap ``model`` with LoRA adapters.

    Builds a causal-LM ``LoraConfig`` (rank 4, alpha 64, dropout 0.1) that
    targets the ``q_proj`` and ``v_proj`` modules, applies it with
    ``get_peft_model`` and prints the trainable-parameter summary.

    Args:
        model: The base model to wrap.

    Returns:
        A ``(peft_model, peft_config)`` tuple.
    """
    # Import only the names that are used. The deprecated
    # `prepare_model_for_int8_training` helper was imported here but never
    # called, and importing it fails on PEFT releases that no longer ship it.
    from peft import LoraConfig, TaskType, get_peft_model

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,
        lora_alpha=64,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config


# Attach the LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model, lora_config = create_peft_config(model)

training_arguments = TrainingArguments(
    # Output location and reporting
    output_dir="logs",
    report_to="none",
    # Schedule and batching
    num_train_epochs=0.5,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2, # 4
    group_by_length=True,
    # Optimizer and precision
    optim="paged_adamw_32bit",
    fp16=True,
    bf16=False,
    gradient_checkpointing=False,
    # Log every 10 steps; checkpoint saving is disabled
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    save_steps=0,
)

# The samples are already padded to a fixed length, so the default collator
# is used to stack them into batches.
trainer = Trainer(
    args=training_arguments,
    model=model,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
)

trainable params: 2,097,152 || all params: 6,740,643,840 || trainable%: 0.031112042851977773


In [8]:
# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

# Persist the fine-tuned PEFT-wrapped model to the "trained-model" directory,
# the same path the inference section later loads as `lora_path`.
trainer.model.save_pretrained("trained-model")



Step,Training Loss
10,0.8734
20,0.5919
30,0.5532
40,0.5757
50,0.5075
60,0.4414
70,0.4988
80,0.4233
90,0.4578
100,0.427


In [9]:
# Switch the model to evaluation mode before generating.
model.eval()

eval_prompt = """Create a Python class with the following attributes: firstname, lastname and address.
"""
# Move the inputs to the device the model is on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

# Generation needs no gradients.
with torch.no_grad():
    generated = model.generate(
        **model_input,
        max_new_tokens=400,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) returned sequence, dropping special tokens.
output = tokenizer.decode(generated[0], skip_special_tokens=True)

print(output)



Create a Python class with the following attributes: firstname, lastname and address.
 class Person:
    def __init__(self, firstname, lastname, address):
        self.firstname = firstname
        self.lastname = lastname
        self.address = address


## 模型推理

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

model_name = "CodeLlama-7b-hf"
lora_path = 'trained-model' # checkpoint path of the LoRA output

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the base model in bfloat16 and switch it to evaluation mode
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = model.eval()

# Load the LoRA weights on top of the base model
model = PeftModel.from_pretrained(model, model_id=lora_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Create a Python class with the following attributes: firstname, lastname and address. class Person:
    def __init__(self, firstname, lastname, address):
        self.firstname = firstname
        self.lastname = lastname
        self.address = address


In [3]:
prompt = "Given a string, write an algorithm to remove all occurrences of a given letter. my_string = \"Python is a popular programming language\" letter = 'o'"
# The model was loaded with device_map="auto", so send the inputs to the
# device it actually landed on instead of assuming "cuda".
model_input = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generation needs no gradients.
with torch.no_grad():
    generated = model.generate(
        **model_input,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) returned sequence, dropping special tokens.
output = tokenizer.decode(generated[0], skip_special_tokens=True)

print(output)

Given a string, write an algorithm to remove all occurrences of a given letter. my_string = "Python is a popular programming language" letter = 'o' def remove_letter(my_string, letter):
    new_string = ''
    for char in my_string:
        if char != letter:
            new_string += char
    return new_string

my_string = "Python is a popular programming language"
letter = 'o'

print(remove_letter(my_string, letter))


In [4]:
def remove_letter(my_string, letter):
    """Return ``my_string`` with every character equal to ``letter`` dropped."""
    kept = [ch for ch in my_string if ch != letter]
    return ''.join(kept)

# Run the generated function on the example values from the prompt.
my_string = "Python is a popular programming language"
letter = 'o'

print(remove_letter(my_string, letter))

Pythn is a ppular prgramming language
