Python-Pakete installieren sowie Modell und Tokenizer laden

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; a later cell
# copies the merged model into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -qU transformers accelerate bitsandbytes pandas==2.0.3 peft trl

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
import torch
from trl import setup_chat_format
from google.colab import userdata

# Hugging Face access token, read from the Colab secrets store under "HF_TOKEN".
TOKEN = userdata.get("HF_TOKEN")

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Base checkpoint to fine-tune; loaded quantized according to bnb_config.
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B'
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    token=TOKEN)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=TOKEN)
tokenizer.padding_side = "right"

# setup_chat_format returns an updated (model, tokenizer) pair.
# NOTE(review): presumably this installs the ChatML template and its special
# tokens and resizes the embeddings — confirm against the trl documentation.
model, tokenizer = setup_chat_format(model, tokenizer)

In diesem Abschnitt wird der Datensatz in das ChatML-Format gebracht.

In [None]:
def split_chat(string, lst):
    """Split a raw chat transcript into its turns and append them to ``lst``.

    A new turn starts at every occurrence of one of the role markers
    ``"ASSISTANT:"``, ``"USER:"`` or ``"FUNCTION RESPONSE:"``. The search for
    the next marker begins 5 characters into the current turn so that the
    marker the turn itself starts with is not matched again. Trailing
    newlines are removed from every turn.

    The scan is iterative: it needs no recursion frame per turn (so very long
    chats cannot hit the recursion limit) and does not copy the remaining
    string on every step.

    Args:
        string: The raw transcript.
        lst: List that receives the turns, in order. It is mutated in place.

    Returns:
        None.
    """
    markers = ("ASSISTANT:", "USER:", "FUNCTION RESPONSE:")
    start = 0
    while True:
        # Earliest marker after the first 5 characters of the current turn.
        hits = [
            pos
            for pos in (string.find(marker, start + 5) for marker in markers)
            if pos != -1
        ]
        if not hits:
            # No further marker: the rest of the string is the last turn.
            lst.append(string[start:].rstrip("\n"))
            return
        cut = min(hits)
        lst.append(string[start:cut].rstrip("\n"))
        start = cut

def min2(x, y):
    """Return the smaller of two ``str.find`` results, treating -1 as "not found".

    If exactly one argument is -1 the other one is returned; if both are -1
    the result is -1.
    """
    if x != -1 and y != -1:
        return min(x, y)
    # At least one side is the "not found" sentinel: hand back the other side.
    return y if x == -1 else x

def min3(x, y, z):
    """Return the smallest of three ``str.find`` results, ignoring -1 entries.

    -1 marks "not found" and never wins; the result is -1 only when all
    three arguments are -1.
    """
    found = [value for value in (x, y, z) if value != -1]
    return min(found) if found else -1

In [None]:
def _strip_role_prefix(text, prefix):
  """Return ``text`` without the leading ``prefix`` and the single space after it."""
  if text.startswith(prefix):
    text = text[len(prefix):]
  if text.startswith(" "):
    text = text[1:]
  return text


def _strip_end_of_text(text, token="<|endoftext|>"):
  """Return ``text`` without a trailing ``token`` and without trailing whitespace."""
  text = text.rstrip()
  if text.endswith(token):
    text = text[:-len(token)]
  return text.rstrip()


def format_dataset(sample):
  """Convert one raw dataset row into a chat-style ``{"messages": [...]}`` dict.

  The row's ``'system'`` text becomes the first message (role ``system``). The
  ``'chat'`` transcript is split into turns with ``split_chat``; each turn is
  mapped by its marker:

  * ``USER:``              -> role ``user``, marker removed
  * ``ASSISTANT:``         -> role ``assistant``, marker and trailing
                              ``<|endoftext|>`` removed
  * ``FUNCTION RESPONSE:`` -> role ``user``, text kept including the marker
  * anything else          -> skipped

  Markers and the end-of-text token are removed as exact prefixes/suffixes.
  ``str.lstrip``/``str.rstrip`` with a string argument strip a *set of
  characters*, which would eat real content (e.g. a reply ending in "done"
  when no ``<|endoftext|>`` follows).

  Args:
    sample: Mapping with the string fields ``'system'`` and ``'chat'``.

  Returns:
    ``{"messages": [{"role": ..., "content": ...}, ...]}``.
  """
  system = _strip_role_prefix(sample['system'], "SYSTEM:").rstrip("\n")
  turns = []
  split_chat(sample['chat'], turns)

  messages = [{"role": "system", "content": system}]
  for turn in turns:
    if turn.startswith("USER:"):
      messages.append({"role": "user", "content": _strip_role_prefix(turn, "USER:")})
    elif turn.startswith("ASSISTANT:"):
      content = _strip_end_of_text(_strip_role_prefix(turn, "ASSISTANT:"))
      messages.append({"role": "assistant", "content": content})
    elif turn.startswith("FUNCTION RESPONSE:"):
      # Function results are fed back to the model as a user turn, marker included.
      messages.append({"role": "user", "content": turn})
    # Turns without a known marker (e.g. an empty leading fragment) are dropped.
  return {"messages": messages}

In [None]:
from datasets import load_dataset

# Load only the first 1% of the train split of glaiveai/glaive-function-calling-v2.
dataset = load_dataset("glaiveai/glaive-function-calling-v2", split="train[:1%]")

In [None]:
# Convert every row to the {"messages": [...]} structure, then drop the raw text columns.
dataset_formatted = dataset.map(format_dataset)
dataset_formatted = dataset_formatted.remove_columns(['chat', 'system'])

# 70/30 train/test split. The fixed seed makes the split identical on every
# run, so the saved train/test files stay comparable across re-executions.
dataset_formatted = dataset_formatted.train_test_split(test_size=0.3, seed=42)

In [None]:
# Spot-check one formatted training example (index 2).
print(dataset_formatted["train"][2])

In [None]:
# Write both splits to JSON files; the training and evaluation cells reload
# them from these paths.
dataset_formatted["train"].to_json("train_dataset.json", orient="records")
dataset_formatted["test"].to_json("test_dataset.json", orient="records")

In diesem Abschnitt findet das Fine-Tuning statt

In [None]:
from peft import LoraConfig

# LoRA adapter configuration for causal-LM fine-tuning: rank 256, alpha 128,
# 5% dropout, no bias terms, adapters on all linear layers.
peft_config = LoraConfig(**{
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "r": 256,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "bias": "none",
})

In [None]:
import transformers
from transformers import TrainingArguments


# Training hyper-parameters. Effective batch size per device is
# 4 samples x 2 accumulation steps. bf16 / tf32 are not enabled.
args = TrainingArguments(**{
    # where checkpoints and the final adapter are written
    "output_dir": "output",
    "save_strategy": "epoch",
    # schedule
    "num_train_epochs": 3,
    "learning_rate": 2e-4,
    "lr_scheduler_type": "constant",
    "warmup_ratio": 0.03,
    # batching / memory
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 2,
    "gradient_checkpointing": True,
    # optimizer
    "optim": "adamw_torch_fused",
    "max_grad_norm": 0.3,
    # logging
    "logging_steps": 10,
    "report_to": "tensorboard",
})

In [None]:
# Reload the training split from the JSON file written earlier.
# Note: this rebinds `dataset`, which previously held the raw Glaive rows.
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

In [None]:
from trl import SFTTrainer

# Maximum sequence length (in tokens) passed to the trainer.
max_seq_length = 2048

# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA
# config and trains on the "messages" dataset with packing enabled.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    # NOTE(review): presumably the chat template already emits the special
    # tokens, so none are added or appended here — confirm against trl docs.
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Persist the trained model; a later cell reloads it from args.output_dir.
trainer.save_model()

In [None]:
# Drop the references to the training objects and release cached CUDA memory
# before the model is reloaded for merging.
del model
del trainer

torch.cuda.empty_cache()

In [None]:
# Directory name (relative to the working directory) for the merged model.
trained_model = "llama3-8B-function-calling"

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload base model plus LoRA adapter from the training output directory in float16.
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Fold the adapter weights into the base weights and write a standalone checkpoint.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(trained_model, safe_serialization=True, max_shard_size="4GB")

# Store the tokenizer (including its chat template) next to the weights so
# that AutoTokenizer.from_pretrained(trained_model) works in the test section.
tokenizer.save_pretrained(trained_model)

Lokal gespeichertes Modell auf Google Drive kopieren

In [None]:
!cp -r /content/llama3-8B-function-calling /content/drive/MyDrive/llama3-8B-function-calling

Pfad zum gespeicherten Modell

In [None]:
import os

# Point trained_model at the copy on Drive. Only the directory's base name is
# reused, so running this cell more than once does not keep prepending the
# Drive prefix.
trained_model = os.path.join(
    "/content/drive/MyDrive",
    os.path.basename(os.path.normpath(trained_model)),
)

In diesem Abschnitt wird das Modell getestet

In [None]:
import torch
# Turn off the memory-efficient and flash scaled-dot-product-attention backends.
# NOTE(review): the notebook does not state why — presumably a workaround for
# the GPU/runtime in use; confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# trained_model points at the merged checkpoint written by
# merged_model.save_pretrained(...). That directory holds full model weights
# and no adapter_config.json, so it is loaded with AutoModelForCausalLM;
# AutoPeftModelForCausalLM expects a PEFT adapter directory.
model = AutoModelForCausalLM.from_pretrained(
    trained_model,
    device_map='auto',
    torch_dtype=torch.float16
)
# NOTE(review): requires the tokenizer files to be present in trained_model.
tokenizer = AutoTokenizer.from_pretrained(trained_model)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Reload the held-out split from the JSON file written earlier.
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

In [None]:
from random import randint

# randint includes both endpoints, so the upper bound must be len - 1;
# otherwise the draw can land one past the last valid index.
rand_idx = randint(0, len(eval_dataset) - 1)
chat = eval_dataset[rand_idx]["messages"]

# Prompt = system message + first user message, ending with the generation
# prompt for the assistant turn.
prompt = pipe.tokenizer.apply_chat_template(chat[:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

# generated_text is sliced at len(prompt) so only the text after the prompt is shown.
print(f"Eingabe:\n{chat[1]['content']}")
print(f"Erwartet:\n{chat[2]['content']}")
print(f"Ausgabe:\n{outputs[0]['generated_text'][len(prompt):].strip()}")


In diesem Abschnitt kann das Modell mit eigenen Eingaben ausgeführt werden

In [None]:
# Hand-written conversation for a manual test: a system message that declares
# one callable function as a JSON object, followed by a user request. The
# function spec uses plain JSON quoting; doubled quotes ("" ... "") are a
# CSV-escaping artifact and would make the spec invalid JSON.
messages = [
    {"role": "system", "content": """You are a helpful assistant with access to the following functions. Use them if required -
{
    "name": "get_exchange_rate",
    "description": "Get the exchange rate between two currencies",
    "parameters": {
        "type": "object",
        "properties": {
            "base_currency": {
                "type": "string",
                "description": "The currency to convert from"
            },
            "target_currency": {
                "type": "string",
                "description": "The currency to convert to"
            }
        },
        "required": [
            "base_currency",
            "target_currency"
        ]
    }
}"""},
    {"role": "user", "content": "Can you convert EURO to US Dollars?"},
]

In [None]:
# Render the hand-written conversation with the tokenizer's chat template,
# ending with the generation prompt for the assistant turn.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

# generated_text is sliced at len(prompt) so only the text after the prompt is shown.
print(f"User: {messages[1]['content']}")
print(f"Assistant :\n{outputs[0]['generated_text'][len(prompt):].strip()}")