In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/mistral-7b-bnb-4bit",
#     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
#     "unsloth/llama-2-7b-bnb-4bit",
#     "unsloth/gemma-7b-bnb-4bit",
#     "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
#     "unsloth/gemma-2b-bnb-4bit",
#     "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
#     "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
# ] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `ChatML` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [4]:
from datasets import load_dataset, Dataset
import transformers

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["context"]
    inputs       = examples["questions"]
    outputs      = examples["answers"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

# Loading dataset from json file

import json
contexts = []
questions_dataset = []
answers = []
context_data_files = [
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-vietnam.json"
]
dataset_files = [
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-vietnam.json"
]

context_data = {}
for i, file_path in enumerate(context_data_files):
    with open(file_path, "r") as file:
        context_data[i] = json.load(file)

questions = [
    "What is the name of the attraction?",
    "What is the location of the attraction?",
    "Describe the attraction in detail.",
    "What type of attraction is it? (e.g. historical, natural, amusement, beach)"
]


for i, file_path in enumerate(dataset_files):
    with open(file_path, "r") as file:
        dataset = json.load(file)
        for entry in dataset:
            id = entry['context_index']
            for question in questions:
                if question == entry['question'] and str(id) in context_data[i].keys():
                    contexts.append(context_data[i][str(id)])
                    questions_dataset.append(entry["question"])
                    answers.append(entry["answer"])



update = {
  "context": contexts,
  "questions": questions_dataset,
  "answers": answers
}

print(update)


# # Convert the data into a Dataset
# # dataset = Dataset.from_dict(file_input)
# # dataset = dataset.map(formatting_prompts_func, batched=True)

# data = {
#     "context": [
#         "Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi. On the north side of the fort, a rampart of red-brown laterite forms a jetty between two small sandy coves, known as Sinquerim Beach. The fort houses the Central Jail and a 19th-century lighthouse. It was built by the Portuguese between 1609-1612. The fort is open from 9:00 AM to 6:30 PM every day, and there is no entry fee.",
#         "Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi. On the north side of the fort, a rampart of red-brown laterite forms a jetty between two small sandy coves, known as Sinquerim Beach. The fort houses the Central Jail and a 19th-century lighthouse. It was built by the Portuguese between 1609-1612. The fort is open from 9:00 AM to 6:30 PM every day, and there is no entry fee.",
#         "Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi. On the north side of the fort, a rampart of red-brown laterite forms a jetty between two small sandy coves, known as Sinquerim Beach. The fort houses the Central Jail and a 19th-century lighthouse. It was built by the Portuguese between 1609-1612. The fort is open from 9:00 AM to 6:30 PM every day, and there is no entry fee.",
#         "Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi. On the north side of the fort, a rampart of red-brown laterite forms a jetty between two small sandy coves, known as Sinquerim Beach. The fort houses the Central Jail and a 19th-century lighthouse. It was built by the Portuguese between 1609-1612. The fort is open from 9:00 AM to 6:30 PM every day, and there is no entry fee."
#     ],
#     "questions": [
#         "What is the name of the attraction?",
#         "What is the location of the attraction?",
#         "Describe the attraction in brief",
#         "What type of attraction is it? (e.g. historical, natural, amusement, beach)"
#     ],
#     "answers": [
#         "Aguada Fort",
#         "Fort Aguada Rd, Aguada Fort Area, Candolim, Goa 403515",
#         "Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi. On the north side of the fort, a rampart of red-brown laterite forms a jetty between two small sandy coves, known as Sinquerim Beach. The fort houses the Central Jail and a 19th-century lighthouse. It was built by the Portuguese between 1609-1612. The fort is open from 9:00 AM to 6:30 PM every day, and there is no entry fee.",
#         "Historical"
#     ]
# }

# Convert the data into a Dataset
dataset = Dataset.from_dict(update)
print(dataset)
def tokenize_function(examples):
    context = examples["context"]
    questions = examples["questions"]
    answers = examples["answers"]

    input_texts = [f"Context: {context[i]} Question: {questions[i]}" for i in range(len(context))]
    target_texts = answers

    model_inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

print(tokenized_datasets)


{'context': [" Aguada Fort: Beautiful Ambiance  Image Source  Sightseeing in Goa is incomplete without a visit to Fort Aguada, strategically situated at the estuary of the river Mandovi  On the north side of the fort, a rampart of red-brown laterite just into the bay to form a jetty between two small sandy coves  This picturesque spot is known as Sinquerim Beach  Location: Fort Aguada Rd, Aguada Fort Area, Candolim, Goa 403515 Timings: 9:00 AM – 6:30 PM all days of the week Built By: Portuguese Built-In: 1609-1612 Houses: The Central Jail and a 19th Century Lighthouse How To Reach: The fort is located on the Aguada-Siolim Road and can be easily reached by road Entry Fee: No entry fee  Must Read: 26 Beach Resorts In Goa  Planning your holiday in Goa but confused about what to do? These Goa travel stories help you find your best trip ever! Romance, Beaches, & Churches! Rahul Talks Of A Sizzling Honeymoon In Goa No wonder Goa is the Beach Capital of India!  Read More     Iresh Lists The M

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'questions', 'answers', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 536
})


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments



trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "context",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/536 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [6]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.605 GB of memory reserved.


In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 536 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.4968
2,2.804
3,2.65
4,2.5704
5,2.566
6,2.3579
7,2.2898
8,2.2286
9,2.0458
10,2.1045


In [8]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

267.2367 seconds used for training.
4.45 minutes used for training.
Peak reserved memory = 10.502 GB.
Peak reserved memory for training = 4.897 GB.
Peak reserved memory % of max memory = 71.21 %.
Peak reserved memory for training % of max memory = 33.205 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [9]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Give only the name of the attraction from the input.", # instruction
        "Hokkaido \u2013 Closer To Nature  Image Credit: fisag for Pixabay  The abode of natural hot springs! Popular for its volcanoes, hot springs, and ski areas, this gorgeous Japanese island looks right out of a picture book  And it is because of its various attractions & experiences that it is an impeccable place to holiday with both your kids and significant other  Be it the beauty of the Blue Pond or the Zoo, you\u2019d be left mesmerized  It is one of the most ideal places to visit in Japan during summer  Top Attractions:  Asahiyama Zoo Farm Tomita Blue Pond  Best Things To Do:  Relax in the hot spring Treat yourself with scrumptious seafood Visit the famous national parks  Places to stay:  Yorkshire Farm Hotel Park Hills Hokkaido Puremiahoteru \u2013 CABIN \u2013 Obihiro  Places to eat:  Daruma Hokkaido Cafe Hokkaido Ramen Kyowakoku  How to reach: New Chitose Airport Sapporo is the closest airport to Hokkaido  You can find local taxis and cabs for a ride  Suggested Read: 10 Most Alluring Homestays in Japan  ", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give only the name of the attraction from the input.

### Input:
Hokkaido – Closer To Nature  Image Credit: fisag for Pixabay  The abode of natural hot springs! Popular for its volcanoes, hot springs, and ski areas, this gorgeous Japanese island looks right out of a picture book  And it is because of its various attractions & experiences that it is an impeccable place to holiday with both your kids and significant other  Be it the beauty of the Blue Pond or the Zoo, you’d be left mesmerized  It is one of the most ideal places to visit in Japan during summer  Top Attractions:  Asahiyama Zoo Farm Tomita Blue Pond  Best Things To Do:  Relax in the hot spring Treat yourself with scrumptious seafood Visit the famous national parks  Places to stay:  Yorkshire Farm Hotel Park Hills Hokkaido Puremiahoter

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [10]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [11]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nThe Eiffel Tower is a famous tall tower in Paris.\n\n### Explanation:\nThe Eiffel Tower is a famous tall tower in Paris and is one of the most popular tourist attractions in the city.\n\n### Keywords:\nEiffel Tower, Paris, tourist attractions, famous, tall tower, most popular tourist attractions,']

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [12]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [13]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [14]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")