In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Llama-3.1` format for conversation style finetunes. We use [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. But we convert it to HuggingFace's normal multiturn format `("role", "content")` instead of `("from", "value")`/ Llama-3 renders multi turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more.

In [4]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# def formatting_prompts_func(examples):
#     convos = examples["conversations"]
#     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
#     return { "text" : texts, }
# pass

# from datasets import load_dataset
# dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

We now use `standardize_sharegpt` to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:
```
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What is 2+2?"}
{"from": "gpt", "value": "It's 4."}
```
to
```
{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What is 2+2?"}
{"role": "assistant", "content": "It's 4."}
```

In [5]:
# from unsloth.chat_templates import standardize_sharegpt
# dataset = standardize_sharegpt(dataset)
# dataset = dataset.map(formatting_prompts_func, batched = True,)

We look at how the conversations are structured for item 5:

In [6]:
def format_custom_dataset(example):
    dialogues = example['dialogue']
    summary = example['summary']

    conversation = [
        {"role": "user", "content": dialogues},
        {"role": "assistant", "content": summary},
    ]

    text = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )
    return {"text": text}

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/samsum-train.csv')

In [9]:
# Drop rows where 'dialogue' or 'summary' columns have None values
df = df.dropna(subset=['dialogue', 'summary'])

In [10]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_custom_dataset)

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

In [11]:
df = pd.read_csv("/content/samsum-test.csv")
test_dataset = Dataset.from_pandas(df)
test_dataset = test_dataset.map(format_custom_dataset)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [12]:
dataset[5]["summary"]

"Wyatt reminds Neville his wedding anniversary is on the 17th of September. Neville's wife is upset and it might be because Neville forgot about their anniversary."

And we see how the chat template transformed these conversations.

**[Notice]** Llama 3.1 Instruct's default chat template default adds `"Cutting Knowledge Date: December 2023\nToday Date: 26 July 2024"`, so do not be alarmed!

In [13]:
dataset[5]["text"]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nNeville: Hi there, does anyone remember what date I got married on?\r\nDon: Are you serious?\r\nNeville: Dead serious. We're on vacation, and Tina's mad at me about something. I have a strange suspicion that this might have something to do with our wedding anniversary, but I have nowhere to check.\r\nWyatt: Hang on, I'll ask my wife.\r\nDon: Haha, someone's in a lot of trouble :D\r\nWyatt: September 17. I hope you remember the year ;)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWyatt reminds Neville his wedding anniversary is on the 17th of September. Neville's wife is upset and it might be because Neville forgot about their anniversary.<|eot_id|>"

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/14731 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [15]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

We verify masking is actually done:

In [16]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nNeville: Hi there, does anyone remember what date I got married on?\r\nDon: Are you serious?\r\nNeville: Dead serious. We're on vacation, and Tina's mad at me about something. I have a strange suspicion that this might have something to do with our wedding anniversary, but I have nowhere to check.\r\nWyatt: Hang on, I'll ask my wife.\r\nDon: Haha, someone's in a lot of trouble :D\r\nWyatt: September 17. I hope you remember the year ;)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWyatt reminds Neville his wedding anniversary is on the 17th of September. Neville's wife is upset and it might be because Neville forgot about their anniversary.<|eot_id|>"

In [17]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

"                                                                                                                                               \n\nWyatt reminds Neville his wedding anniversary is on the 17th of September. Neville's wife is upset and it might be because Neville forgot about their anniversary.<|eot_id|>"

We can see the System and Instruction prompts are successfully masked!

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.178 GB of memory reserved.


In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,731 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.1169
2,2.1829
3,1.7931
4,1.8941
5,1.9393
6,1.5217
7,1.4812
8,1.5326
9,1.1631
10,1.4718


In [20]:
import torch

# Initialize variables
start_gpu_memory = torch.cuda.memory_reserved() / 1024 / 1024 / 1024  # Memory in GB at the start
max_memory = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024  # Max memory in GB

# Calculate memory usage
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Print stats
trainer_stats = {
    'metrics': {'train_runtime': 3600}  # Replace with actual `trainer_stats` from your code
}
print(f"{trainer_stats['metrics']['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats['metrics']['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


3600 seconds used for training.
60.0 minutes used for training.
Peak reserved memory = 3.652 GB.
Peak reserved memory for training = -0.0 GB.
Peak reserved memory % of max memory = 24.762 %.
Peak reserved memory for training % of max memory = -0.0 %.


# Saving the model

In [21]:
save_path = '/content/drive/MyDrive/NLP/A3'


In [22]:
model.save_pretrained_merged(save_path, tokenizer, save_method="merged_16bit")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.04 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 21.57it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving /content/drive/MyDrive/NLP/A3/pytorch_model-00001-of-00002.bin...
Unsloth: Saving /content/drive/MyDrive/NLP/A3/pytorch_model-00002-of-00002.bin...
Done.


# Load

In [23]:
load_path = '/content/drive/MyDrive/NLP/A3'  # The same path used for saving

In [24]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=load_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [25]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": """ Ivan: hey eric
Eric: yeah man
Ivan: so youre coming to the wedding
Eric: your brother's
Ivan: yea
Eric: i dont know mannn
Ivan: YOU DONT KNOW??
Eric: i just have a lot to do at home, plus i dont know if my parents would let me
Ivan: ill take care of your parents
Eric: youre telling me you have the guts to talk to them XD
Ivan: thats my problem
Eric: okay man, if you say so
Ivan: yea just be there
Eric: alright"""},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n Ivan: hey eric\nEric: yeah man\nIvan: so youre coming to the wedding\nEric: your brother's\nIvan: yea\nEric: i dont know mannn\nIvan: YOU DONT KNOW??\nEric: i just have a lot to do at home, plus i dont know if my parents would let me\nIvan: ill take care of your parents\nEric: youre telling me you have the guts to talk to them XD\nIvan: thats my problem\nEric: okay man, if you say so\nIvan: yea just be there\nEric: alright<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nIt seems like Eric is hesitant to attend Ivan's brother's wedding, and Ivan is trying to convince him to come. Ivan is offering to handle Eric's parents, which suggests that Ivan has a good relationship with Eric's family. Ivan is confident and determined to have Eric attend the wedding, but Eric is still unsure."]

In [26]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d6a46925eca77d4042a23ef56a1408548262ecd78325515623a233ae3dc02732
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [29]:
import gc
import torch

# 1. Force garbage collection
gc.collect()
torch.cuda.empty_cache()

# 2. Move the model to CPU temporarily
model.cpu()

# 3. Force garbage collection and empty cache again
gc.collect()
torch.cuda.empty_cache()

# 4. Move the model back to GPU if needed
model.cuda()



In [30]:
inputs = inputs.to("cuda")  # Move my_tensor to the GPU


# Evaluting Performance

In [32]:
df = pd.read_csv("/content/samsum-test.csv")
test_dataset = Dataset.from_pandas(df)
test_dataset = test_dataset.map(format_custom_dataset)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [33]:
import evaluate
import nltk
import numpy as np
import torch # Added import

nltk.download('punkt')

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.cpu() # Move labels to CPU

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE - directly access the fmeasure value
    result = {key: value * 100 for key, value in result.items()} # Instead of value.mid.fmeasure, use value directly

    # Other metrics
    # Move predictions to CPU before using NumPy
    prediction_lens = [np.count_nonzero(pred.cpu().numpy() != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels]) # BLEU expects references to be a list of lists
    result["bleu"] = bleu_result["bleu"]

    return result


test_dataset = test_dataset.select(range(10))

# Ensure model is on CUDA
model.to("cuda") # Move the entire model to GPU

# Get predictions
inputs = tokenizer(test_dataset["text"], return_tensors="pt", padding=True, truncation=True).to("cuda")
predictions = model.generate(**inputs)

# Compute metrics
metrics = compute_metrics((predictions, inputs["input_ids"]))

# Print the metrics
print(metrics)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]



{'rouge1': 80.3947, 'rouge2': 80.267, 'rougeL': 80.3315, 'rougeLsum': 80.2637, 'gen_len': 462.8, 'bleu': 0.7130766941616256}


 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!