In [39]:

# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install datasets

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-e29677x0/unsloth_9b69cc44e11f466886b24709756a2fb8
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-e29677x0/unsloth_9b69cc44e11f466886b24709756a2fb8
  Resolved https://github.com/unslothai/unsloth.git to commit 27fa021a7bb959a53667dd4e7cdb9598c207aa0d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [40]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from unsloth.chat_templates import get_chat_template

# Load the CSV file into a pandas DataFrame
data_path = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_clean.csv"
df = pd.read_csv(data_path)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [42]:
dataset

Dataset({
    features: ['answer', 'question'],
    num_rows: 3917
})

In [65]:
import pandas as pd
from datasets import Dataset


# Function to format the dataset
def formatting_prompts_func(examples):
    texts = [f"### Question: {q}\n ### Answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    return {"text": texts}

# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

In [66]:
dataset[0]

{'answer': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.',
 'question': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?',
 'text': '### Question: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?\n ### Answer: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.'}

In [67]:
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

unsloth_template = """
{{ bos_token }}
You are a helpful assistant to the user.
{% for message in messages %}
{% if message['from'] == 'human' %}
>>> User: {{ message['value'] }}
{% elif message['from'] == 'gpt' %}
>>> Assistant: {{ message['value'] }}{{ eos_token }}
{% endif %}
{% endfor %}
{% if add_generation_prompt %}
>>> Assistant:
{% endif %}
"""

unsloth_eos_token = "eos_token"


# Load the tokenizer
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Apply the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template=(unsloth_template, unsloth_eos_token), # Provide template and EOS token
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}, # ShareGPT style
    map_eos_token=True, # Maps  to </s> instead
)

# Verify template application
print("Template applied to tokenizer.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Template applied to tokenizer.


In [68]:
# Function to apply chat template to the dataset
def apply_chat_template_func(examples):
    convos = [
        {"messages": [
            {"from": "human", "value": q},
            {"from": "gpt", "value": a}
        ]}
        for q, a in zip(examples["question"], examples["answer"])
    ]

    try:
        # Apply the chat template directly to the messages
        processed_convos = [
            tokenizer.apply_chat_template(convo, tokenize=True, add_special_tokens=False)
            for convo in convos
        ]
        # Extract the text from the processed conversations
        texts = ["".join(tokenizer.decode(tokens) for tokens in convo) for convo in processed_convos]
    except Exception as e:
        print(f"Error in applying chat template: {e}")
        raise e

    return {"text": texts}

# Load the tokenizer and apply the chat template
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

# Define model name and load tokenizer
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the template
unsloth_template = """
{{ bos_token }}
You are a helpful assistant to the user.
{% for message in messages %}
>>> {{ message['from'] }}: {{ message['value'] }}
{% endfor %}
{% if add_generation_prompt %}
>>> Assistant:
{% endif %}
"""

# Apply the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template=(unsloth_template, "eos_token"),  # Provide template and EOS token
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps to </s> instead
)

# Example conversation to verify the chat template
example_convo = {
    "messages": [
        {"from": "human", "value": "What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?"},
        {"from": "gpt", "value": "Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation."}
    ]
}

# Verify the template with a single example
try:
    print("Applying chat template to a single example...")
    print(f"Example conversation structure: {example_convo}")

    # Apply chat template
    result = tokenizer.apply_chat_template(example_convo['messages'], tokenize=False, add_generation_prompt=False)

    # Print intermediate results
    print("Intermediate Result:")
    for message in example_convo["messages"]:
        print(f"{message['from']}: {message['value']}")

    print(f"Final Result: {result}")
except Exception as e:
    print(f"Error in applying chat template: {e}")
    print(f"Template: {tokenizer.chat_template}")
    raise e

# Apply the chat template function to the dataset
dataset = dataset.map(apply_chat_template_func, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Applying chat template to a single example...
Example conversation structure: {'messages': [{'from': 'human', 'value': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?'}, {'from': 'gpt', 'value': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.'}]}
Intermediate Result:
human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.
Final Result: 
<s>
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited fo

Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

In [69]:
# Function to apply chat template to the dataset
def apply_chat_template_func(examples):
    convos = [
        {"messages": [
            {"from": "human", "value": q},
            {"from": "gpt", "value": a}
        ]}
        for q, a in zip(examples["question"], examples["answer"])
    ]

    try:
        # Apply the chat template directly to the messages
        processed_convos = [
            tokenizer.apply_chat_template(convo['messages'], tokenize=True, add_special_tokens=False)
            for convo in convos
        ]

        # Debugging: Print the structure of processed_convos
        print("Processed Conversations (First Example):", processed_convos[0])

        # Extract the text from the processed conversations
        texts = [tokenizer.decode(convo) for convo in processed_convos]
    except Exception as e:
        print(f"Error in applying chat template: {e}")
        raise e

    return {"text": texts}

# Apply the chat template function to the dataset
dataset = dataset.map(apply_chat_template_func, batched=True)

# Check the first few rows to ensure it processed correctly
print(dataset["text"][:5])


Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

Processed Conversations (First Example): [29871, 13, 1, 29871, 13, 3492, 526, 263, 8444, 20255, 304, 278, 1404, 29889, 13, 6778, 29958, 5199, 29901, 1724, 526, 278, 25486, 310, 278, 7972, 7744, 573, 29899, 6451, 839, 14614, 297, 13549, 1788, 363, 21635, 5864, 4023, 10147, 292, 29973, 13, 6778, 29958, 330, 415, 29901, 3295, 15603, 338, 385, 7744, 519, 13681, 29899, 2477, 297, 13549, 1788, 322, 967, 5858, 29889, 2184, 338, 480, 1573, 363, 21635, 5864, 4023, 10147, 292, 297, 6856, 29899, 18045, 470, 1283, 29899, 7720, 18893, 310, 5858, 29889, 13]
Processed Conversations (First Example): [29871, 13, 1, 29871, 13, 3492, 526, 263, 8444, 20255, 304, 278, 1404, 29889, 13, 6778, 29958, 5199, 29901, 1128, 947, 278, 19843, 310, 278, 3699, 29892, 411, 967, 6641, 29899, 29888, 9390, 16246, 24628, 322, 11520, 17526, 29879, 29892, 29126, 304, 967, 5864, 19201, 322, 12463, 2874, 297, 278, 3030, 310, 278, 6136, 322, 1209, 573, 23387, 14294, 10090, 313, 10764, 29956, 29956, 29956, 29897, 1904, 29973, 13

In [70]:
dataset

Dataset({
    features: ['answer', 'question', 'text'],
    num_rows: 3917
})

In [71]:
dataset[0]

{'answer': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.',
 'question': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?',
 'text': '\n<s> \nYou are a helpful assistant to the user.\n>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?\n>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.\n'}

In [72]:
print(dataset[0]["text"])


<s> 
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.



In [77]:
# Check the first few rows to ensure it processed correctly
for i in range(5):
    print(f"Processed Example {i}:")
    print(dataset["text"][i])
    print("\n")

# Save the processed dataset to a CSV file
processed_dataset_path = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_processed.csv"
dataset.to_csv(processed_dataset_path, index=False)

print(f"Processed dataset saved to {processed_dataset_path}")


Processed Example 0:

<s> 
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.



Processed Example 1:

<s> 
You are a helpful assistant to the user.
>>> human: What components are included in a typical solar energy system, and how are they configured to work together?
>>> gpt: A solar energy system comprises: a solar energy structure comprising photovoltaic solar panels contiguously covering an area; a first inverter configured to receive power from a first string of solar panels; and a second inverter.



Processed Example 2:

<s> 
You are a helpful assistant to the user.
>>> human: What control method is proposed for optimizing the solar-to-power efficiency of a solar-aided coal-fired power system under off-d

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processed dataset saved to /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_processed.csv


In [74]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [75]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Let's see how the `Phi-3` format works by printing the 5th element

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [84]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        #num_train_epochs=1, # Comment it when testing and uncomment max_steps
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, #Comment num_train_epochs=1 when actual training
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/3917 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [81]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
6.588 GB of memory reserved.


In [85]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,917 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss


KeyboardInterrupt: 

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `Phi-3`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "What components are included in a high-efficiency thermal drying system for drying purposes?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# MAX NEW TOKENS DETERMINES THE LENGTH OF THE OUTPUT
outputs = model.generate(input_ids = inputs, max_new_tokens = 100, use_cache = True)
response=tokenizer.batch_decode(outputs)

In [None]:
# print(type(response))
# print(len(response))
# print(response)

In [None]:
print(response[0].split('<|assistant|>')[-1])

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "What is a famous tall tower in Paris?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoModelForPeftCausalLM
    from transformers import AutoTokenizer
    model = AutoModelForPeftCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. Gemma 6 trillion tokens is 2.5x faster! [free Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>