# LLM PROJECT - FINE-TUNING NOTEBOOK

→ About 7 min on an Nvidia T4 (5,000–10,000 samples)\
→ QLoRA on the chosen dataset  \
→ Needs around 16 GB of CUDA memory (or reduce the batch size)

### Installation

In [None]:
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth evaluate rouge-score faiss-cpu sentence-transformers bert_score
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo evaluate rouge-score faiss-cpu sentence-transformers bert_score
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import evaluate
import sys
from tqdm import tqdm
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

### Unsloth

In [None]:
# Tunable settings; `r` and `max_seq_length` are reused by later cells.
r = 4                  # LoRA Rank
load_in_4bit = True    # Use 4bit quantization to reduce memory usage.
dtype = None           # No explicit dtype is requested (presumably Unsloth then picks one — confirm in its docs)
max_seq_length = 2048  # Sequence-length limit handed to the loader and, later, to the trainer

# Everything the loader needs, collected in one place.
loader_options = dict(
    model_name = "meta-llama/Llama-3.2-3B",   # <- Any Base model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.17: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.568 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.17 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE: `model` is rebound to the wrapped model, so running this cell a second
# time hands the already-wrapped model back to get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter shape
    r = r,
    lora_alpha = 16,
    lora_dropout = 0,
    target_modules = lora_target_modules,
    bias = "none",
    # Variants left switched off
    use_rslora = False,
    loftq_config = None,
    # Memory handling and reproducibility
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

Unsloth 2025.3.17 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


### Data Preparation

In [None]:
# Single switch for the dataset to fine-tune on, so that a top-to-bottom run
# loads exactly one dataset instead of letting the second silently overwrite
# the first:
#   1 -> medical flashcards (medalpaca/medical_meadow_medical_flashcards)
#   2 -> League of Legends Q&A (local file qa_lol.json)
# The default of 1 corresponds to the 9,000-example training run recorded in
# this notebook's outputs.
dataset_choice = 1

if dataset_choice == 1:
    # DATASET N°1: MED
    # Keep rows 5,000-14,999 and drop the "instruction" column, leaving the
    # "input"/"output" columns used by the prompt formatting step.
    dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
    dataset = dataset["train"].select(range(5000, 15000)).remove_columns("instruction")
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
elif dataset_choice == 2:
    # DATASET N°2: LoL
    # Rename the columns to the same "input"/"output" names as dataset 1.
    dataset = load_dataset("json", data_files="qa_lol.json")
    dataset = dataset.rename_columns({"question": "input", "answer": "output"})
    dataset = dataset.shuffle(seed=42)
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
else:
    raise ValueError(f"dataset_choice must be 1 or 2, got {dataset_choice!r}")

train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
# Both datasets share one instruction template; only the opening persona
# sentence differs. The doubled braces survive this first .format() call as
# plain "{}" placeholders, which are filled later with the question and answer.
_PROMPT_TEMPLATE = """{persona} Below is a question. Write a response that appropriately answer the question.

### Question:
{{}}

### Answer:
{{}}"""

# Persona sentence for each supported dataset_choice value.
_PERSONAS = {
    1: "You are a physician.",
    2: "You are a data scientist specialized in a video game.",
}

# Fail loudly on an unknown choice instead of leaving `prompt` undefined
# (or left over from an earlier run of this cell).
if dataset_choice not in _PERSONAS:
    raise ValueError(
        f"dataset_choice must be one of {sorted(_PERSONAS)}, got {dataset_choice!r}"
    )

prompt = _PROMPT_TEMPLATE.format(persona=_PERSONAS[dataset_choice])


# End-of-sequence marker taken from the tokenizer; appended to every training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the "text" column for a batch of examples.

    `examples` is indexed with "input" and "output"; the two sequences are
    paired element-wise, inserted into the module-level `prompt` template and
    suffixed with EOS_TOKEN.

    Returns a dict with the single key "text" holding the formatted strings.
    """
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    formatted = [
        prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(examples["input"], examples["output"])
    ]
    return {"text": formatted}

# Add the formatted "text" column to both splits, processing rows in batches.
train_data, test_data = (
    split.map(formatting_prompts_func, batched=True)
    for split in (train_data, test_data)
)

### Train the model

In [None]:
# Choose the half-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # Batching: 64 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size = 64,  # <- Can be reduced if there is not enough CUDA memory
    gradient_accumulation_steps = 4,
    # Schedule
    num_train_epochs = 1,
    warmup_steps = 5,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    # Optimizer
    optim = "adamw_8bit",
    weight_decay = 0.01,
    # Precision
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # Logging, reproducibility and output location
    logging_steps = 1,
    report_to = "none",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning on the pre-formatted "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

In [5]:
# Run the fine-tuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,000 | Num Epochs = 1 | Total steps = 35
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 4 x 1) = 256
 "-____-"     Trainable parameters = 6,078,464/3,000,000,000 (0.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.6752
2,1.6483
3,1.6222
4,1.5878
5,1.5009
6,1.3475
7,1.2292
8,1.1015
9,1.0215
10,0.9573


In [None]:
# Model and tokenizer are written to the same folder, tagged with the LoRA rank.
adapter_dir = f"lora_model_r{r}"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('lora_model_r4_2/tokenizer_config.json',
 'lora_model_r4_2/special_tokens_map.json',
 'lora_model_r4_2/tokenizer.json')

In [None]:
!tar -czf archive_r4.tar.gz lora_model_r4 # Modify for rank r