In [1]:
from unsloth import FastLanguageModel
import torch
# ---------------------------------------------------------------------------
# Loader configuration
# ---------------------------------------------------------------------------
# 4-bit quantization lowers memory usage; it may be switched off (False).
load_in_4bit = True
# None = let the loader auto-detect the dtype. Upstream guidance: float16 for
# Tesla T4 / V100, bfloat16 for Ampere or newer GPUs.
dtype = None
# Context length; reused later when the trainer is built. Per the upstream
# template any value can be chosen because RoPE scaling is handled internally.
max_seq_length = 2048

# Reference catalogue of checkpoints published under https://huggingface.co/unsloth
# (described upstream as pre-quantized 4-bit uploads for faster download and
# fewer OOMs). The list is informational only: nothing in this cell reads it,
# and the checkpoint loaded below is not one of its entries.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi -- NOTE(review): these two names carry no "-bnb-4bit" suffix; confirm
    # they really are pre-quantized before relying on that.
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model together with its tokenizer. Both names are used by
# every later cell (LoRA wrapping, prompt formatting, training, inference).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only for gated repos such as meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-09-12 23:05:19.608443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-12 23:05:19.736776: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-12 23:05:19.769717: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-12 23:05:20.011300: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Ti. Max memory: 15.697 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
# Wrap the base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
# NOTE(review): this rebinds `model`; re-running the cell without reloading the
# base model would presumably wrap an already-wrapped model -- confirm before
# re-executing out of order.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank; any value > 0, upstream suggests 8, 16, 32, 64 or 128
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # True or "unsloth"; upstream describes "unsloth" as using ~30% less VRAM
    # and recommends it for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but not enabled
    loftq_config=None,  # LoftQ is available but not configured
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
# Alpaca-style prompt template with three positional `{}` slots, filled in
# order: instruction, input, response. The inference cells reuse this exact
# template with an empty response slot, so its text (including blank lines)
# must stay identical between training and generation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; it is appended to
# every training example so the model learns where a response ends.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of records into full training prompts.

    Args:
        examples: Batch mapping that provides the columns "instruction",
            "input" and "output"; the three columns are walked in parallel.

    Returns:
        A dict with a single key "text" holding one formatted string per
        record: the Alpaca template filled with (instruction, input, output)
        followed by EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN is mandatory: without it generation would never terminate.
    return {"text": [alpaca_prompt.format(*fields) + EOS_TOKEN for fields in records]}

from datasets import load_dataset
# Training data comes from a local JSON file (path is relative to the working
# directory); its records are expected to provide the "instruction", "input"
# and "output" fields that formatting_prompts_func reads.
# The public "yahma/alpaca-cleaned" train split was the earlier alternative.
# The batched map adds the "text" column that the trainer consumes.
dataset = load_dataset(
    "json",
    data_files="anselmcombined.json",
    split="train",
).map(formatting_prompts_func, batched=True)

In [None]:
# Sanity check of the loaded data: preview only the first few "input" fields.
# Slicing the dataset before selecting the column keeps the cell output short
# instead of dumping every row of the column into the notebook.
print(dataset[:5]["input"])

In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimisation settings for one full pass over the dataset.
# Effective batch size = 2 per device x 4 accumulation steps = 8.
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=1,  # one full training run
    # max_steps=60,      # step-capped alternative, currently disabled
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,  # emit a log record after every optimizer step
    seed=3407,
)

# Supervised fine-tuning on the pre-rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # upstream note: packing can make training 5x faster for short sequences
    args=training_args,
)

Map (num_proc=2): 100%|██████████| 425/425 [00:00<00:00, 583.29 examples/s]


In [5]:

# Baseline taken before training: the post-training report subtracts
# `start_gpu_memory` to isolate what training itself reserved.
# Byte counts are converted to GiB (1024**3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 4060 Ti. Max memory = 15.697 GB.
5.984 GB of memory reserved.


In [6]:
# Run the fine-tuning loop. The result is kept because the report cell further
# down reads its `metrics` dict (the 'train_runtime' entry).
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 425 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 53
 "-____-"     Number of trainable parameters = 41,943,040
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmhasik[0m ([33mdlprojekt[0m). Use [1m`wandb login --relogin`[0m to force relogin


  2%|▏         | 1/53 [00:04<04:05,  4.72s/it]

{'loss': 3.1207, 'grad_norm': 1.1631015539169312, 'learning_rate': 4e-05, 'epoch': 0.02}


  4%|▍         | 2/53 [00:09<04:09,  4.89s/it]

{'loss': 2.989, 'grad_norm': 0.9265272617340088, 'learning_rate': 8e-05, 'epoch': 0.04}


  6%|▌         | 3/53 [00:12<03:27,  4.14s/it]

{'loss': 3.3145, 'grad_norm': 1.3950570821762085, 'learning_rate': 0.00012, 'epoch': 0.06}


  8%|▊         | 4/53 [00:17<03:22,  4.13s/it]

{'loss': 2.9186, 'grad_norm': 1.0893628597259521, 'learning_rate': 0.00016, 'epoch': 0.08}


  9%|▉         | 5/53 [00:20<03:08,  3.93s/it]

{'loss': 2.7889, 'grad_norm': 0.99735027551651, 'learning_rate': 0.0002, 'epoch': 0.09}


 11%|█▏        | 6/53 [00:31<05:00,  6.39s/it]

{'loss': 2.3865, 'grad_norm': 0.7623046040534973, 'learning_rate': 0.00019583333333333334, 'epoch': 0.11}


 13%|█▎        | 7/53 [00:34<04:05,  5.33s/it]

{'loss': 2.222, 'grad_norm': 3.96307110786438, 'learning_rate': 0.00019166666666666667, 'epoch': 0.13}


 15%|█▌        | 8/53 [00:39<03:44,  4.98s/it]

{'loss': 2.0206, 'grad_norm': 1.1101001501083374, 'learning_rate': 0.0001875, 'epoch': 0.15}


 17%|█▋        | 9/53 [00:43<03:25,  4.66s/it]

{'loss': 1.9533, 'grad_norm': 1.1970609426498413, 'learning_rate': 0.00018333333333333334, 'epoch': 0.17}


 19%|█▉        | 10/53 [00:49<03:36,  5.04s/it]

{'loss': 1.8475, 'grad_norm': 1.007408618927002, 'learning_rate': 0.0001791666666666667, 'epoch': 0.19}


 21%|██        | 11/53 [00:54<03:35,  5.14s/it]

{'loss': 1.7224, 'grad_norm': 0.9675021171569824, 'learning_rate': 0.000175, 'epoch': 0.21}


 23%|██▎       | 12/53 [01:00<03:42,  5.43s/it]

{'loss': 1.6526, 'grad_norm': 0.5853391289710999, 'learning_rate': 0.00017083333333333333, 'epoch': 0.23}


 25%|██▍       | 13/53 [01:06<03:41,  5.54s/it]

{'loss': 1.5004, 'grad_norm': 0.5024228692054749, 'learning_rate': 0.0001666666666666667, 'epoch': 0.24}


 26%|██▋       | 14/53 [01:09<03:04,  4.74s/it]

{'loss': 1.4606, 'grad_norm': 0.7267906665802002, 'learning_rate': 0.00016250000000000002, 'epoch': 0.26}


 28%|██▊       | 15/53 [01:12<02:46,  4.38s/it]

{'loss': 1.5372, 'grad_norm': 0.5818747282028198, 'learning_rate': 0.00015833333333333332, 'epoch': 0.28}


 30%|███       | 16/53 [01:16<02:33,  4.16s/it]

{'loss': 1.7143, 'grad_norm': 0.5207467079162598, 'learning_rate': 0.00015416666666666668, 'epoch': 0.3}


 32%|███▏      | 17/53 [01:20<02:28,  4.14s/it]

{'loss': 1.5461, 'grad_norm': 0.5086492896080017, 'learning_rate': 0.00015000000000000001, 'epoch': 0.32}


 34%|███▍      | 18/53 [01:25<02:30,  4.29s/it]

{'loss': 1.3249, 'grad_norm': 0.7643275260925293, 'learning_rate': 0.00014583333333333335, 'epoch': 0.34}


 36%|███▌      | 19/53 [01:28<02:14,  3.95s/it]

{'loss': 1.3868, 'grad_norm': 0.5845029354095459, 'learning_rate': 0.00014166666666666668, 'epoch': 0.36}


 38%|███▊      | 20/53 [01:32<02:15,  4.11s/it]

{'loss': 1.7326, 'grad_norm': 0.5603316426277161, 'learning_rate': 0.0001375, 'epoch': 0.38}


 40%|███▉      | 21/53 [01:39<02:33,  4.80s/it]

{'loss': 1.6592, 'grad_norm': 0.5057582259178162, 'learning_rate': 0.00013333333333333334, 'epoch': 0.39}


 42%|████▏     | 22/53 [01:42<02:11,  4.23s/it]

{'loss': 1.4252, 'grad_norm': 0.7626693844795227, 'learning_rate': 0.00012916666666666667, 'epoch': 0.41}


 43%|████▎     | 23/53 [01:46<02:09,  4.30s/it]

{'loss': 1.691, 'grad_norm': 0.6605997681617737, 'learning_rate': 0.000125, 'epoch': 0.43}


 45%|████▌     | 24/53 [01:49<01:52,  3.87s/it]

{'loss': 1.2345, 'grad_norm': 0.6849856972694397, 'learning_rate': 0.00012083333333333333, 'epoch': 0.45}


 47%|████▋     | 25/53 [01:53<01:52,  4.00s/it]

{'loss': 1.6401, 'grad_norm': 0.5381805896759033, 'learning_rate': 0.00011666666666666668, 'epoch': 0.47}


 49%|████▉     | 26/53 [01:56<01:37,  3.62s/it]

{'loss': 1.3835, 'grad_norm': 0.6723594069480896, 'learning_rate': 0.00011250000000000001, 'epoch': 0.49}


 51%|█████     | 27/53 [02:00<01:34,  3.62s/it]

{'loss': 1.2899, 'grad_norm': 0.6453679203987122, 'learning_rate': 0.00010833333333333333, 'epoch': 0.51}


 53%|█████▎    | 28/53 [02:04<01:36,  3.85s/it]

{'loss': 1.4616, 'grad_norm': 0.6142983436584473, 'learning_rate': 0.00010416666666666667, 'epoch': 0.53}


 55%|█████▍    | 29/53 [02:08<01:36,  4.02s/it]

{'loss': 1.5314, 'grad_norm': 0.5775894522666931, 'learning_rate': 0.0001, 'epoch': 0.54}


 57%|█████▋    | 30/53 [02:12<01:26,  3.76s/it]

{'loss': 1.3892, 'grad_norm': 0.7664423584938049, 'learning_rate': 9.583333333333334e-05, 'epoch': 0.56}


 58%|█████▊    | 31/53 [02:15<01:19,  3.63s/it]

{'loss': 1.4795, 'grad_norm': 0.7454195618629456, 'learning_rate': 9.166666666666667e-05, 'epoch': 0.58}


 60%|██████    | 32/53 [02:20<01:24,  4.03s/it]

{'loss': 1.5924, 'grad_norm': 0.702470600605011, 'learning_rate': 8.75e-05, 'epoch': 0.6}


 62%|██████▏   | 33/53 [02:24<01:23,  4.18s/it]

{'loss': 1.3895, 'grad_norm': 0.642578661441803, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.62}


 64%|██████▍   | 34/53 [02:27<01:11,  3.75s/it]

{'loss': 1.2726, 'grad_norm': 0.8806663155555725, 'learning_rate': 7.916666666666666e-05, 'epoch': 0.64}


 66%|██████▌   | 35/53 [02:31<01:07,  3.76s/it]

{'loss': 1.7154, 'grad_norm': 0.6477622389793396, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.66}


 68%|██████▊   | 36/53 [02:38<01:19,  4.70s/it]

{'loss': 1.6389, 'grad_norm': 0.4986840486526489, 'learning_rate': 7.083333333333334e-05, 'epoch': 0.68}


 70%|██████▉   | 37/53 [02:42<01:11,  4.48s/it]

{'loss': 1.5568, 'grad_norm': 0.636566698551178, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.69}


 72%|███████▏  | 38/53 [02:46<01:04,  4.29s/it]

{'loss': 1.4529, 'grad_norm': 0.6290115714073181, 'learning_rate': 6.25e-05, 'epoch': 0.71}


 74%|███████▎  | 39/53 [02:50<01:01,  4.38s/it]

{'loss': 1.4655, 'grad_norm': 0.6252288222312927, 'learning_rate': 5.833333333333334e-05, 'epoch': 0.73}


 75%|███████▌  | 40/53 [02:55<00:59,  4.57s/it]

{'loss': 1.5044, 'grad_norm': 0.544661819934845, 'learning_rate': 5.4166666666666664e-05, 'epoch': 0.75}


 77%|███████▋  | 41/53 [03:05<01:13,  6.08s/it]

{'loss': 1.506, 'grad_norm': 0.4604634940624237, 'learning_rate': 5e-05, 'epoch': 0.77}


 79%|███████▉  | 42/53 [03:08<00:58,  5.34s/it]

{'loss': 1.339, 'grad_norm': 0.5218367576599121, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.79}


 81%|████████  | 43/53 [03:13<00:49,  4.99s/it]

{'loss': 1.4862, 'grad_norm': 0.4576904773712158, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.81}


 83%|████████▎ | 44/53 [03:19<00:47,  5.29s/it]

{'loss': 1.3981, 'grad_norm': 0.5252121686935425, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.83}


 85%|████████▍ | 45/53 [03:21<00:36,  4.57s/it]

{'loss': 1.4441, 'grad_norm': 0.5879704356193542, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.85}


 87%|████████▋ | 46/53 [03:26<00:32,  4.58s/it]

{'loss': 1.2103, 'grad_norm': 0.4631348252296448, 'learning_rate': 2.916666666666667e-05, 'epoch': 0.86}


 89%|████████▊ | 47/53 [03:30<00:26,  4.34s/it]

{'loss': 1.5438, 'grad_norm': 0.526247501373291, 'learning_rate': 2.5e-05, 'epoch': 0.88}


 91%|█████████ | 48/53 [03:35<00:22,  4.60s/it]

{'loss': 1.4672, 'grad_norm': 0.5248260498046875, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.9}


 92%|█████████▏| 49/53 [03:38<00:16,  4.18s/it]

{'loss': 1.2251, 'grad_norm': 0.4722428619861603, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.92}


 94%|█████████▍| 50/53 [03:48<00:17,  5.74s/it]

{'loss': 1.6665, 'grad_norm': 0.44678133726119995, 'learning_rate': 1.25e-05, 'epoch': 0.94}


 96%|█████████▌| 51/53 [03:52<00:10,  5.23s/it]

{'loss': 1.5878, 'grad_norm': 0.5502683520317078, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.96}


 98%|█████████▊| 52/53 [03:55<00:04,  4.66s/it]

{'loss': 1.4137, 'grad_norm': 0.5541126132011414, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.98}


100%|██████████| 53/53 [03:58<00:00,  4.20s/it]

{'loss': 1.411, 'grad_norm': 0.589325487613678, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 53/53 [04:09<00:00,  4.70s/it]

{'train_runtime': 252.0945, 'train_samples_per_second': 1.686, 'train_steps_per_second': 0.21, 'train_loss': 1.6907932106054053, 'epoch': 1.0}





In [None]:
# Post-training report. Relies on `start_gpu_memory` and `max_memory` from the
# pre-training baseline cell and on `trainer_stats` from the training cell.
train_seconds = trainer_stats.metrics['train_runtime']

# Peak reserved memory in GiB, and the share of it attributable to training
# (peak minus what was already reserved before training started).
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

In [8]:
# Put the fine-tuned model into Unsloth's inference mode (upstream: native 2x
# faster inference) before generating.
FastLanguageModel.for_inference(model)

# Same template as used for training; the response slot is left empty so the
# model produces it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "As the right order requires us to believe the deep things of Christian faith before we undertake to discuss them by reason; so to my mind it appears a neglect if, after we are established in the faith, we do not seek to understand what we believe. Therefore, since I thus consider myself to hold the faith of our redemption, by the prevenient grace of God, so that, even were I unable in any way to understand what I believe, still nothing could shake my constancy; I desire that you should discover to me, what, as you know, many besides myself ask, for what necessity and cause God, who is omnipotent, should have assumed the littleness and weakness of human nature for the sake of its renewal?",  # input
    "",  # response: blank for generation
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# At most 64 new tokens are generated; the decoded text contains the prompt
# followed by the completion.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nAs the right order requires us to believe the deep things of Christian faith before we undertake to discuss them by reason; so to my mind it appears a neglect if, after we are established in the faith, we do not seek to understand what we believe. Therefore, since I thus consider myself to hold the faith of our redemption, by the prevenient grace of God, so that, even were I unable in any way to understand what I believe, still nothing could shake my constancy; I desire that you should discover to me, what, as you know, many besides myself ask, for what necessity and cause God, who is omnipotent, should have assumed the littleness and weakness of human nature for the sake of its renewal?\n\n### Respo

In [7]:
# Put the fine-tuned model into Unsloth's inference mode (upstream: native 2x
# faster inference) before generating.
FastLanguageModel.for_inference(model)

# Same template as used for training; the response slot is left empty so the
# model produces it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "Is God able to remove uprightness from the will?",  # input
    "",  # response: blank for generation
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# At most 64 new tokens are generated; the decoded text contains the prompt
# followed by the completion, special tokens included.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nIs God able to remove uprightness from the will?\n\n### Response:\nHe is able to do this, but he does not wish to.<|end_of_text|>']

In [9]:
# Put the fine-tuned model into Unsloth's inference mode (upstream: native 2x
# faster inference) before generating.
FastLanguageModel.for_inference(model)

# Same template as used for training; the response slot is left empty so the
# model produces it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "I do not doubt this",  # input
    "",  # response: blank for generation
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# At most 64 new tokens are generated; the decoded text contains the prompt
# followed by the completion, special tokens included.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nI do not doubt this\n\n### Response:\nThen, if we should wish to make a definition of truth, it would be something like this: truth is that which is not contrary to rightness.<|end_of_text|>']