In [1]:
from unsloth import FastLanguageModel
import torch
# --- Model loading configuration ---------------------------------------------
# Maximum sequence length passed to the loader and, later, to the trainer.
# (Unsloth advertises automatic RoPE scaling, so other values may be chosen.)
max_seq_length = 2048
# None lets the loader auto-detect the dtype
# (float16 for Tesla T4 / V100, bfloat16 for Ampere and newer).
dtype = None
# Load the weights 4-bit quantized to reduce memory usage; may be set to False.
load_in_4bit = True

# Pre-quantized 4-bit checkpoints published by Unsloth (faster downloads, fewer
# out-of-memory failures). The list is not referenced by the loader call below;
# more models are listed at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",           # Llama-3.1, 15 trillion tokens
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",         # 4-bit upload of the 405B model
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",      # Mistral 12B
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",             # Mistral v3
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",                # Phi-3.5
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",                 # Gemma 2
]

# Fetch the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-09-16 22:13:53.591175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 22:13:53.723956: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 22:13:53.756967: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 22:13:54.006906: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Ti. Max memory: 15.697 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
# LoRA adapter settings, gathered in one place and unpacked into the call below.
lora_settings = dict(
    r=16,  # LoRA rank; any value > 0 works (suggested: 8, 16, 32, 64, 128)
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,   # any value is supported, 0 is the optimized path
    bias="none",      # any value is supported, "none" is the optimized path
    # "unsloth" checkpointing is advertised as using less VRAM; True is the
    # other accepted setting for very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but disabled here
    loftq_config=None,  # LoftQ is available but disabled here
)

# Replace `model` with its LoRA-wrapped counterpart.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
# Alpaca-style prompt template with three positional `{}` slots, filled in
# order by `str.format`: instruction, input, response. For generation the
# response slot is filled with an empty string so the model continues from it.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; it is appended to every
# training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-formatted training strings.

    `examples` is indexed by the keys "instruction", "input" and "output";
    the three sequences are walked in parallel. Each triple is substituted
    into `alpaca_prompt` and `EOS_TOKEN` is appended to the result.

    Returns a dict with a single key, "text", holding the rendered strings.
    """
    triples = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must terminate every sample, otherwise generation never stops.
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in triples
    ]
    return {"text": rendered}

from datasets import load_dataset
# Read the local JSON file as the "train" split.
raw_dataset = load_dataset("json", data_files="anselmcombined.json", split="train")
# Alternative public dataset:
# raw_dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Add the rendered "text" column, formatting the examples in batches.
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

In [4]:
# Preview only the first few inputs. Printing the whole "input" column floods
# the notebook output with every passage in the training set.
# Slicing the dataset yields a dict of column name -> list of values.
print(dataset[:3]["input"])

['As the right order requires us to believe the deep things of Christian faith before we undertake to discuss them by reason; so to my mind it appears a neglect if, after we are established in the faith, we do not seek to understand what we believe. Therefore, since I thus consider myself to hold the faith of our redemption, by the prevenient grace of God, so that, even were I unable in any way to understand what I believe, still nothing could shake my constancy; I desire that you should discover to me, what, as you know, many besides myself ask, for what necessity and cause God, who is omnipotent, should have assumed the littleness and weakness of human nature for the sake of its renewal?', 'You ought not so much to fear this, because you should call to mind, on the other hand, that it often happens in the discussion of some question that God opens what before lay concealed; and that you should hope for the grace of God, because if you liberally impart those things which you have free

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step
    warmup_steps=5,
    num_train_epochs=1,  # one full pass over the dataset
    # max_steps=60,      # alternative: cap the run by step count
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,     # log metrics at every step
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can speed up training on short sequences
    args=training_args,
)

In [6]:

# Record the GPU capacity and the memory already reserved before training, so
# the training-only footprint can be derived afterwards. Values are in GiB
# (bytes / 1024**3), rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4060 Ti. Max memory = 15.697 GB.
5.984 GB of memory reserved.


In [7]:
# Run the fine-tuning loop. The returned object's `.metrics` mapping is read
# afterwards for the "train_runtime" figure.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 425 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 53
 "-____-"     Number of trainable parameters = 41,943,040
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmhasik[0m ([33mdlprojekt[0m). Use [1m`wandb login --relogin`[0m to force relogin


  2%|▏         | 1/53 [00:03<03:15,  3.75s/it]

{'loss': 3.1207, 'grad_norm': 1.1631232500076294, 'learning_rate': 4e-05, 'epoch': 0.02}


  4%|▍         | 2/53 [00:08<03:47,  4.46s/it]

{'loss': 2.989, 'grad_norm': 0.9261380434036255, 'learning_rate': 8e-05, 'epoch': 0.04}


  6%|▌         | 3/53 [00:11<03:15,  3.90s/it]

{'loss': 3.3164, 'grad_norm': 1.3989704847335815, 'learning_rate': 0.00012, 'epoch': 0.06}


  8%|▊         | 4/53 [00:16<03:15,  3.99s/it]

{'loss': 2.9196, 'grad_norm': 1.0879429578781128, 'learning_rate': 0.00016, 'epoch': 0.08}


  9%|▉         | 5/53 [00:19<03:04,  3.85s/it]

{'loss': 2.7891, 'grad_norm': 0.9968624711036682, 'learning_rate': 0.0002, 'epoch': 0.09}


 11%|█▏        | 6/53 [00:30<04:56,  6.31s/it]

{'loss': 2.3889, 'grad_norm': 0.7589634656906128, 'learning_rate': 0.00019583333333333334, 'epoch': 0.11}


 13%|█▎        | 7/53 [00:33<04:02,  5.27s/it]

{'loss': 2.2183, 'grad_norm': 3.861748695373535, 'learning_rate': 0.00019166666666666667, 'epoch': 0.13}


 15%|█▌        | 8/53 [00:38<03:42,  4.93s/it]

{'loss': 2.0211, 'grad_norm': 1.1497576236724854, 'learning_rate': 0.0001875, 'epoch': 0.15}


 17%|█▋        | 9/53 [00:42<03:24,  4.65s/it]

{'loss': 1.9513, 'grad_norm': 1.1958483457565308, 'learning_rate': 0.00018333333333333334, 'epoch': 0.17}


 19%|█▉        | 10/53 [00:48<03:37,  5.05s/it]

{'loss': 1.8465, 'grad_norm': 1.013210415840149, 'learning_rate': 0.0001791666666666667, 'epoch': 0.19}


 21%|██        | 11/53 [00:53<03:36,  5.16s/it]

{'loss': 1.7204, 'grad_norm': 0.9236598014831543, 'learning_rate': 0.000175, 'epoch': 0.21}


 23%|██▎       | 12/53 [00:59<03:43,  5.46s/it]

{'loss': 1.6539, 'grad_norm': 0.6197692155838013, 'learning_rate': 0.00017083333333333333, 'epoch': 0.23}


 25%|██▍       | 13/53 [01:05<03:41,  5.55s/it]

{'loss': 1.5011, 'grad_norm': 0.5086093544960022, 'learning_rate': 0.0001666666666666667, 'epoch': 0.24}


 26%|██▋       | 14/53 [01:08<03:04,  4.74s/it]

{'loss': 1.4596, 'grad_norm': 0.7381858229637146, 'learning_rate': 0.00016250000000000002, 'epoch': 0.26}


 28%|██▊       | 15/53 [01:11<02:46,  4.38s/it]

{'loss': 1.5367, 'grad_norm': 0.6327241659164429, 'learning_rate': 0.00015833333333333332, 'epoch': 0.28}


 30%|███       | 16/53 [01:15<02:33,  4.15s/it]

{'loss': 1.7146, 'grad_norm': 0.5214734077453613, 'learning_rate': 0.00015416666666666668, 'epoch': 0.3}


 32%|███▏      | 17/53 [01:19<02:28,  4.12s/it]

{'loss': 1.5464, 'grad_norm': 0.5158833265304565, 'learning_rate': 0.00015000000000000001, 'epoch': 0.32}


 34%|███▍      | 18/53 [01:24<02:29,  4.29s/it]

{'loss': 1.3218, 'grad_norm': 0.7671040892601013, 'learning_rate': 0.00014583333333333335, 'epoch': 0.34}


 36%|███▌      | 19/53 [01:27<02:14,  3.95s/it]

{'loss': 1.3854, 'grad_norm': 0.5809298157691956, 'learning_rate': 0.00014166666666666668, 'epoch': 0.36}


 38%|███▊      | 20/53 [01:31<02:15,  4.12s/it]

{'loss': 1.7329, 'grad_norm': 0.5744266510009766, 'learning_rate': 0.0001375, 'epoch': 0.38}


 40%|███▉      | 21/53 [01:38<02:33,  4.81s/it]

{'loss': 1.6604, 'grad_norm': 0.5051424503326416, 'learning_rate': 0.00013333333333333334, 'epoch': 0.39}


 42%|████▏     | 22/53 [01:41<02:11,  4.24s/it]

{'loss': 1.4253, 'grad_norm': 0.7669482827186584, 'learning_rate': 0.00012916666666666667, 'epoch': 0.41}


 43%|████▎     | 23/53 [01:45<02:09,  4.32s/it]

{'loss': 1.6929, 'grad_norm': 0.6649197340011597, 'learning_rate': 0.000125, 'epoch': 0.43}


 45%|████▌     | 24/53 [01:48<01:52,  3.89s/it]

{'loss': 1.2332, 'grad_norm': 0.6854566931724548, 'learning_rate': 0.00012083333333333333, 'epoch': 0.45}


 47%|████▋     | 25/53 [01:52<01:52,  4.02s/it]

{'loss': 1.6391, 'grad_norm': 0.5348272919654846, 'learning_rate': 0.00011666666666666668, 'epoch': 0.47}


 49%|████▉     | 26/53 [01:55<01:37,  3.63s/it]

{'loss': 1.3842, 'grad_norm': 0.6731576919555664, 'learning_rate': 0.00011250000000000001, 'epoch': 0.49}


 51%|█████     | 27/53 [01:59<01:34,  3.62s/it]

{'loss': 1.2892, 'grad_norm': 0.6451560258865356, 'learning_rate': 0.00010833333333333333, 'epoch': 0.51}


 53%|█████▎    | 28/53 [02:03<01:35,  3.82s/it]

{'loss': 1.4608, 'grad_norm': 0.6158775687217712, 'learning_rate': 0.00010416666666666667, 'epoch': 0.53}


 55%|█████▍    | 29/53 [02:07<01:35,  3.97s/it]

{'loss': 1.5308, 'grad_norm': 0.5793768763542175, 'learning_rate': 0.0001, 'epoch': 0.54}


 57%|█████▋    | 30/53 [02:10<01:25,  3.70s/it]

{'loss': 1.3889, 'grad_norm': 0.7616289854049683, 'learning_rate': 9.583333333333334e-05, 'epoch': 0.56}


 58%|█████▊    | 31/53 [02:14<01:18,  3.58s/it]

{'loss': 1.4802, 'grad_norm': 0.7396008968353271, 'learning_rate': 9.166666666666667e-05, 'epoch': 0.58}


 60%|██████    | 32/53 [02:19<01:23,  3.97s/it]

{'loss': 1.5905, 'grad_norm': 0.7021297216415405, 'learning_rate': 8.75e-05, 'epoch': 0.6}


 62%|██████▏   | 33/53 [02:23<01:22,  4.12s/it]

{'loss': 1.3889, 'grad_norm': 0.6414597034454346, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.62}


 64%|██████▍   | 34/53 [02:26<01:10,  3.69s/it]

{'loss': 1.2709, 'grad_norm': 0.8756112456321716, 'learning_rate': 7.916666666666666e-05, 'epoch': 0.64}


 66%|██████▌   | 35/53 [02:29<01:06,  3.71s/it]

{'loss': 1.7154, 'grad_norm': 0.6474641561508179, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.66}


 68%|██████▊   | 36/53 [02:36<01:18,  4.63s/it]

{'loss': 1.6394, 'grad_norm': 0.49614378809928894, 'learning_rate': 7.083333333333334e-05, 'epoch': 0.68}


 70%|██████▉   | 37/53 [02:40<01:10,  4.42s/it]

{'loss': 1.5553, 'grad_norm': 0.6346777081489563, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.69}


 72%|███████▏  | 38/53 [02:44<01:03,  4.25s/it]

{'loss': 1.4529, 'grad_norm': 0.630550742149353, 'learning_rate': 6.25e-05, 'epoch': 0.71}


 74%|███████▎  | 39/53 [02:49<01:00,  4.34s/it]

{'loss': 1.4654, 'grad_norm': 0.6248255968093872, 'learning_rate': 5.833333333333334e-05, 'epoch': 0.73}


 75%|███████▌  | 40/53 [02:54<00:58,  4.53s/it]

{'loss': 1.5046, 'grad_norm': 0.5467891693115234, 'learning_rate': 5.4166666666666664e-05, 'epoch': 0.75}


 77%|███████▋  | 41/53 [03:03<01:12,  6.05s/it]

{'loss': 1.5055, 'grad_norm': 0.461108922958374, 'learning_rate': 5e-05, 'epoch': 0.77}


 79%|███████▉  | 42/53 [03:07<00:58,  5.31s/it]

{'loss': 1.3382, 'grad_norm': 0.5211907029151917, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.79}


 81%|████████  | 43/53 [03:11<00:49,  4.95s/it]

{'loss': 1.4867, 'grad_norm': 0.4577232301235199, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.81}


 83%|████████▎ | 44/53 [03:17<00:47,  5.25s/it]

{'loss': 1.3964, 'grad_norm': 0.5254561305046082, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.83}


 85%|████████▍ | 45/53 [03:20<00:36,  4.53s/it]

{'loss': 1.4454, 'grad_norm': 0.5909994840621948, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.85}


 87%|████████▋ | 46/53 [03:24<00:31,  4.54s/it]

{'loss': 1.2113, 'grad_norm': 0.4644288718700409, 'learning_rate': 2.916666666666667e-05, 'epoch': 0.86}


 89%|████████▊ | 47/53 [03:28<00:25,  4.31s/it]

{'loss': 1.5442, 'grad_norm': 0.5259106755256653, 'learning_rate': 2.5e-05, 'epoch': 0.88}


 91%|█████████ | 48/53 [03:33<00:22,  4.57s/it]

{'loss': 1.4668, 'grad_norm': 0.524803876876831, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.9}


 92%|█████████▏| 49/53 [03:36<00:16,  4.17s/it]

{'loss': 1.2259, 'grad_norm': 0.4738200902938843, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.92}


 94%|█████████▍| 50/53 [03:46<00:17,  5.75s/it]

{'loss': 1.6657, 'grad_norm': 0.4452120065689087, 'learning_rate': 1.25e-05, 'epoch': 0.94}


 96%|█████████▌| 51/53 [03:50<00:10,  5.23s/it]

{'loss': 1.5894, 'grad_norm': 0.5504857897758484, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.96}


 98%|█████████▊| 52/53 [03:53<00:04,  4.67s/it]

{'loss': 1.4133, 'grad_norm': 0.5532413721084595, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.98}




{'loss': 1.4104, 'grad_norm': 0.5907325744628906, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 53/53 [03:58<00:00,  4.49s/it]

{'train_runtime': 240.0327, 'train_samples_per_second': 1.771, 'train_steps_per_second': 0.221, 'train_loss': 1.690587140479178, 'epoch': 1.0}





In [8]:
# Peak reserved GPU memory after training, in GiB (bytes / 1024**3).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
# Portion attributable to training: the peak minus what was reserved beforehand.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the card's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

240.0327 seconds used for training.
4.0 minutes used for training.
Peak reserved memory = 9.316 GB.
Peak reserved memory for training = 3.332 GB.
Peak reserved memory % of max memory = 59.349 %.
Peak reserved memory for training % of max memory = 21.227 %.


In [9]:
# Uses the `alpaca_prompt` template defined in an earlier cell.
# Switch the model to Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "As the right order requires us to believe the deep things of Christian faith before we undertake to discuss them by reason; so to my mind it appears a neglect if, after we are established in the faith, we do not seek to understand what we believe. Therefore, since I thus consider myself to hold the faith of our redemption, by the prevenient grace of God, so that, even were I unable in any way to understand what I believe, still nothing could shake my constancy; I desire that you should discover to me, what, as you know, many besides myself ask, for what necessity and cause God, who is omnipotent, should have assumed the littleness and weakness of human nature for the sake of its renewal?",  # input
    "",  # output
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens and show the decoded batch as the cell result.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nAs the right order requires us to believe the deep things of Christian faith before we undertake to discuss them by reason; so to my mind it appears a neglect if, after we are established in the faith, we do not seek to understand what we believe. Therefore, since I thus consider myself to hold the faith of our redemption, by the prevenient grace of God, so that, even were I unable in any way to understand what I believe, still nothing could shake my constancy; I desire that you should discover to me, what, as you know, many besides myself ask, for what necessity and cause God, who is omnipotent, should have assumed the littleness and weakness of human nature for the sake of its renewal?\n\n### Respo

In [10]:
# Uses the `alpaca_prompt` template defined in an earlier cell.
# Switch the model to Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "Is God able to remove uprightness from the will?",  # input
    "",  # output
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens and show the decoded batch as the cell result.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nIs God able to remove uprightness from the will?\n\n### Response:\nIf uprightness is taken away from the will, then the will is not free.<|end_of_text|>']

In [11]:
# Uses the `alpaca_prompt` template defined in an earlier cell.
# Switch the model to Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "I do not doubt this",  # input
    "",  # output
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens and show the decoded batch as the cell result.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are Anselm, in dialogue with a student about an aspect of your philosophy\n\n### Input:\nI do not doubt this\n\n### Response:\nTherefore, if it were not true that there is no being which is not either necessary or possible, it would be possible for there to be a being which was neither necessary nor possible.<|end_of_text|>']

In [12]:
# Persist the fine-tuned model and its tokenizer side by side in the local
# "lora_model" directory; a later cell reloads the model from that path.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# To publish to the Hugging Face Hub instead, uncomment and supply a repo name
# and token (do not commit a real token to the notebook):
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [4]:
# Settings re-declared so the saved model can be reloaded without re-running
# the training cells; the values match the ones used for training.

# Maximum sequence length handed to the loader.
max_seq_length = 2048
# None lets the loader auto-detect the dtype
# (float16 for Tesla T4 / V100, bfloat16 for Ampere and newer).
dtype = None
# Load the weights 4-bit quantized.
load_in_4bit = True

# Alpaca-style template with three positional slots: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


In [5]:
if True:  # keep True to reload the saved model for inference
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # directory the fine-tuned model was saved to
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Switch the model to Unsloth's faster inference mode.
    FastLanguageModel.for_inference(model)

# `alpaca_prompt` must already be defined (same template as used for training).
# The response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "You are Anselm, in dialogue with a student about an aspect of your philosophy",  # instruction
    "I do not doubt this",  # input
    "",  # output
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Imported after unsloth on purpose, preserving the original import order.
from transformers import TextStreamer

# Stream tokens to stdout as they are produced; the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Ti. Max memory: 15.697 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are Anselm, in dialogue with a student about an aspect of your philosophy

### Input:
I do not doubt this

### Response:
Therefore, if the man who does not know that he ought to be killed, and who does not know that he ought to kill, is not killed, he is not innocent.<|end_of_text|>


In [14]:
# GGUF export. Each variant sits behind a literal guard; only the q4_k_m local
# save is switched on. Flip a guard to True to run another variant.

# Save to 8bit Q8_0 (no quantization_method passed, so the library default applies)
if False:
    model.save_pretrained_gguf("model", tokenizer)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF, passing maximum_memory_usage=0.8 to the exporter
if True:
    model.save_pretrained_gguf(
        "model",
        tokenizer,
        quantization_method="q4_k_m",
        maximum_memory_usage=0.8,
    )
# Hub upload of the q4_k_m file (disabled):
# if True: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.46 out of 15.56 RAM for saving.


 22%|██▏       | 7/32 [00:00<00:01, 13.18it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [09:30<00:00, 17.84s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
