# LLM PROJECT - FINE-TUNING NOTEBOOK

### Installation

In [1]:
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth evaluate rouge-score faiss-cpu sentence-transformers bert_score
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo evaluate rouge-score faiss-cpu sentence-transformers bert_score
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

Collecting unsloth
  Downloading unsloth-2025.3.18-py3-none-any.whl.metadata (46 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting unsloth_zoo>=2025.3.14 (from unsloth)
  Downloading unsloth_zoo-2025.3.16-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro 

In [2]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import evaluate
import sys
from tqdm import tqdm
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


### Model and LoRA setup (Unsloth)

In [3]:
# --- Run configuration --------------------------------------------------------
r = 4                  # LoRA rank; reused by the adapter setup and the output names
max_seq_length = 2048  # sequence length handed to the model loader and the trainer
load_in_4bit = True    # use 4-bit quantization to reduce memory usage
dtype = None           # no explicit dtype is forced; the choice is left to Unsloth
                       # (TODO confirm: None is expected to mean auto-detection)

# Load the base checkpoint together with its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-3B",  # any base model can be used here
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.568 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
    "gate_proj", "up_proj", "down_proj",     # MLP projections
]

# Wrap the loaded model with LoRA adapters of rank `r`.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# without reloading the base model would wrap an already-wrapped model -- confirm
# that this is handled before relying on it.
model = FastLanguageModel.get_peft_model(
    model,
    r=r,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.3.18 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


### Data Preparation

In [5]:
# DATASET N°1: MED
# This cell and the LoL dataset cell assign the same names (`dataset`,
# `train_data`, `test_data`, `dataset_choice`): whichever runs last wins.
flashcards = load_dataset("medalpaca/medical_meadow_medical_flashcards")["train"]

# Keep rows 5000..14999 and drop the "instruction" column.
flashcards = flashcards.select(range(5000, 15000)).remove_columns("instruction")

# 90 % / 10 % train/test split with a fixed seed.
dataset = flashcards.train_test_split(test_size=0.1, seed=42)
train_data, test_data = dataset["train"], dataset["test"]

dataset_choice = 1

In [5]:
# DATASET N°2: LoL
# This cell and the medical dataset cell assign the same names (`dataset`,
# `train_data`, `test_data`, `dataset_choice`): whichever runs last wins.
lol_qa = load_dataset("json", data_files="qa_lol.json")

# Rename the columns to the "input"/"output" names used by the formatting step,
# then shuffle with a fixed seed.
lol_qa = lol_qa.rename_columns({"question": "input", "answer": "output"}).shuffle(seed=42)

# 80 % / 20 % train/test split with a fixed seed.
dataset = lol_qa["train"].train_test_split(test_size=0.2, seed=42)
train_data, test_data = dataset["train"], dataset["test"]

dataset_choice = 2

Generating train split: 5015 examples [00:00, 96145.55 examples/s]


In [6]:
# Opening sentence of the prompt, keyed by `dataset_choice`.
PERSONA_BY_DATASET = {
    1: "You are a physician.",
    2: "You are a professional League of Legends expert.",
}

# Remainder of the prompt, shared by every dataset. The two `{}` placeholders
# are filled with the question and the answer, in that order.
PROMPT_BODY = """ Below is a question. Write a response that appropriately answer the question.

### Question:
{}

### Answer:
{}"""

# Fail loudly on an unsupported choice instead of leaving `prompt` undefined
# (or silently reusing a value left over from an earlier run of this cell).
if dataset_choice not in PERSONA_BY_DATASET:
    raise ValueError(
        f"Unsupported dataset_choice {dataset_choice!r}; "
        f"expected one of {sorted(PERSONA_BY_DATASET)}"
    )

prompt = PERSONA_BY_DATASET[dataset_choice] + PROMPT_BODY


# End-of-sequence marker appended to every formatted example; without it,
# generation would go on forever.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Build the "text" column for a batch of examples.

    Args:
        examples: a batch mapping holding parallel "input" and "output"
            sequences (questions and their answers).

    Returns:
        A dict with the single key "text". Its list contains one string per
        example: the module-level `prompt` filled with the question and the
        answer, followed by `EOS_TOKEN`.
    """
    questions = examples["input"]
    answers = examples["output"]
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    return {
        "text": [
            prompt.format(question, answer) + EOS_TOKEN
            for question, answer in zip(questions, answers)
        ]
    }

# Add the formatted "text" column to both splits (train first, then test).
train_data, test_data = [
    split.map(formatting_prompts_func, batched=True)
    for split in (train_data, test_data)
]

Map: 100%|██████████| 4012/4012 [00:00<00:00, 65842.95 examples/s]
Map: 100%|██████████| 1003/1003 [00:00<00:00, 45949.29 examples/s]


### Train the model

In [7]:
# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings. The effective batch size is
# per_device_train_batch_size x gradient_accumulation_steps = 64 x 4 = 256.
training_args = TrainingArguments(
    per_device_train_batch_size=64,  # can be reduced if there is not enough CUDA memory
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,  # log the training loss at every step
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning on the pre-formatted "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 4012/4012 [00:02<00:00, 1851.77 examples/s]


In [8]:
# Run the fine-tuning loop configured above; whatever `train()` returns is kept
# in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,012 | Num Epochs = 1 | Total steps = 15
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 4 x 1) = 256
 "-____-"     Trainable parameters = 6,078,464/3,000,000,000 (0.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.8803
2,2.8605
3,2.8254
4,2.6685
5,2.4901
6,2.2114
7,1.9202
8,1.6006
9,1.3776
10,1.1877


In [10]:
# Save the fine-tuned model and the tokenizer files into one rank-tagged directory.
# NOTE(review): the "_lol" suffix is fixed, whatever `dataset_choice` is.
adapter_dir = f"lora_model_r{r}_lol"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('lora_model_r4_lol/tokenizer_config.json',
 'lora_model_r4_lol/special_tokens_map.json',
 'lora_model_r4_lol/tokenizer.json')

In [11]:
!tar -czf archive_r4.tar.gz lora_model_r4_lol # Modify for rank r