In [1]:
# Install the released unsloth package into the kernel's environment.
%pip install unsloth
# Also get the latest nightly Unsloth!
%pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Clear this cell's output (the pip logs) once both installs have run.
from IPython.display import clear_output
clear_output(wait=False)

In [2]:
# Load the pretrained model and its tokenizer.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Target device: "cuda" for GPU usage or "cpu" for CPU usage.
device = "cpu"

# Checkpoint to load; the commented lines are alternative checkpoints.
checkpoint = "HuggingFaceTB/SmolLM-360M-Instruct"
# checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
# checkpoint = "microsoft/Phi-3-mini-4k-instruct"

# Both loaders use ./.cache as their cache directory.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir="./.cache")

# For multiple GPUs install accelerate and do
# `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir="./.cache")
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the model through unsloth instead of plain transformers.
# The whole cell is disabled by `if False:`, so none of the names assigned
# below (FastLanguageModel, max_seq_length, dtype, load_in_4bit, ...) exist
# unless the guard is flipped to True.
if False:
    from unsloth import FastLanguageModel
    import torch
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
    # This list is informational only: nothing below reads `fourbit_models`.
    fourbit_models = [
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
        "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
        "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
        "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
        "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
        "unsloth/Phi-3-medium-4k-instruct",
        "unsloth/gemma-2-9b-bnb-4bit",
        "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    ] # More models at https://huggingface.co/unsloth

    # Create the model and tokenizer; this rebinds the `model` / `tokenizer`
    # names that the previous cell assigned.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Phi-3.5-mini-instruct",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

In [4]:
# Wrap the model with LoRA adapters through unsloth.
# Disabled by `if False:`; it also relies on FastLanguageModel, which is only
# imported inside the disabled unsloth loading cell.
if False:
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )

In [5]:
from peft import get_peft_model, LoraConfig, TaskType

'''
透過 task_type 可以指定不同的任務類型，LoRA 並不只用在 Decoder LM 訓練上，
包含 Encoder 或 Seq2Seq 都是可以使用 LoRA 訓練的。

LoraConfig 裡面的 r 參數就是剛才介紹的 r，
用來決定小模型的大小，也會一定程度的決定學習效果。
但其實滿多研究指出，這個 r 的影響並不是很大，
通常就選個 8, 16, 32 之類順眼的數字用就好。

lora_alpha 則會決定小模型的影響程度，
也就是說 Alpha 值越高，越容易把大模型既有的能力給覆蓋掉。
而 lora_dropout 就與一般的 Dropout 概念相同，用來對抗 Overfitting 用的參數。
'''

# Build the LoRA configuration for the PEFT model.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # alternative: TaskType.SEQ_2_SEQ_LM
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # Optional settings, currently not passed:
    # bias="none",
    # target_modules=["q_proj","v_proj"],
)

# Apply the PEFT strategy to the model, then print the trainable-parameter summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


trainable params: 819,200 || all params: 362,640,320 || trainable%: 0.2259


In [6]:
# Swap the tokenizer's chat template for unsloth's "phi-3" template.
# Disabled by `if False:`, so the tokenizer is left unchanged here.
if False:
    from unsloth.chat_templates import get_chat_template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    )

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch through the tokenizer's chat template.

    Args:
        examples: Batch mapping; its "conversations" entry is iterated, one
            conversation per element.

    Returns:
        A dict with the single key "text" holding one rendered result per
        conversation, in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False asks for text rather than token ids; no generation
        # prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return { "text" : rendered, }

In [None]:
from datasets import load_dataset
# Load the training split and run it through formatting_prompts_func in
# batches (that function returns a "text" entry for every conversation).
dataset = load_dataset(
    "philschmid/guanaco-sharegpt-style",
    split = "train",
).map(formatting_prompts_func, batched = True)

In [None]:
# Load one of the local JSON datasets.
import json
def getDataset(state="mistake"):
    """Load a local JSON dataset selected by its state name.

    Args:
        state: Which dataset file to read; one of "mistake", "unrelevant" or
            "hestitate" (spelled as in the file names). Defaults to
            "mistake", the file this function read when the choice was
            hard-coded.

    Returns:
        The object decoded from ./datasets/dataset_<state>.json.

    Raises:
        ValueError: If ``state`` is not one of the known names.
        OSError: If the dataset file cannot be opened (e.g. it does not exist).
    """
    statelist = ['mistake', 'unrelevant', 'hestitate']
    if state not in statelist:
        raise ValueError(f"unknown state {state!r}; expected one of {statelist}")

    dsfile = f"./datasets/dataset_{state}.json"

    # Read as UTF-8 explicitly so non-ASCII text decodes the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(dsfile, "r", encoding="utf-8") as jsondata:
        return json.load(jsondata)

In [None]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments

# `max_seq_length` is only assigned inside the disabled (`if False:`) unsloth
# loading cell, so on the default path it does not exist. Reuse it when that
# cell has been enabled, otherwise fall back to the same value it uses (2048).
max_seq_length = globals().get("max_seq_length", 2048)

# Mixed precision and the 8-bit optimizer are only requested when a CUDA
# device is present; on a CUDA machine the settings are the same as before.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    neftune_noise_alpha = 5,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 on CUDA; both off (full precision) on CPU.
        fp16 = use_cuda and not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        # "adamw_8bit" relies on GPU bitsandbytes; use the stock torch AdamW otherwise.
        optim = "adamw_8bit" if use_cuda else "adamw_torch",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
# Run the training loop and keep whatever train() returns in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel

# Switch the tokenizer to unsloth's "phi-3" chat template, mapped onto
# ShareGPT-style message keys ("from" / "value").
# NOTE(review): the unsloth imports used by this cell presumably need a GPU
# runtime — confirm before running on the CPU path.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
# Put the prompt tensor on the device the model lives on. A hard-coded "cuda"
# fails when the model was loaded with device = "cpu".
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to(model.device)

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: the decoded generations are shown as its output.
tokenizer.batch_decode(outputs)

In [None]:
# Save the model and the tokenizer side by side under ./train/.
model.save_pretrained("./train/") # Local saving
# Last expression of the cell, so its return value is shown as the cell output.
tokenizer.save_pretrained("./train/")
# Optional: upload to the Hugging Face Hub instead (left disabled).
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
# from huggingface_hub import notebook_login

# notebook_login()
# model.push_to_hub("my_awesome_peft_model")

In [None]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig

# Load a published PEFT adapter together with the base model named in its config.
# NOTE(review): this loads a remote seq2seq adapter and rebinds `model`; it
# does not load what was saved to ./train/ earlier — confirm this is intended.
peft_model_id = "smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM"
# The adapter config supplies the base checkpoint via base_model_name_or_path.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
# Attach the adapter identified by peft_model_id to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)