# LLM PROJECT - EVALUATION NOTEBOOK

→ 40min on Nvidia T4 (~6M parameters after QLoRA) \
→ Compute BLEU-4, ROUGE-1, ROUGE-2, ROUGE-L, METEOR, Cosine Similarity, BERTSCORE using Fine-tuning (QLoRA) \
→ Need around 16 GB CUDA Memory (or change batch size)

### Installation

In [1]:
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth evaluate rouge-score faiss-cpu sentence-transformers bert_score
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo evaluate rouge-score faiss-cpu sentence-transformers bert_score
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.3.16-py3-none-any.whl.metadata (8.0 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   

In [1]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import evaluate
import sys
from tqdm import tqdm
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Model

In [2]:
eval_info = "base" # Whatever information for archiving, such as "fine-tuned", "base_model", "LoRA_rank=4", ...

In [3]:
max_seq_length = 2048
dtype = None #
load_in_4bit = True # Use 4bit quantization to reduce memory usage.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Llama-3.2-3B",   # <- Any Base model
    # model_name = "lora_model_3",            # <- Or fine-tuned model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Inference mode

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

### Data Preparation

In [6]:
# DATASET N°1: MED
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
dataset = dataset["train"].select(range(5000, 15000)).remove_columns("instruction")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

dataset_choice = 1

In [5]:
# DATASET N°2: LoL
dataset = load_dataset("json", data_files="qa_lol.json")
dataset = dataset.rename_columns({"question": "input", "answer": "output"})
dataset = dataset.shuffle(seed=42)

dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

dataset_choice = 2

FileNotFoundError: Unable to find '/content/qa_lol.json'

In [7]:
if dataset_choice == 1:
    prompt = """You are a physician. Below is a question. Write a response that appropriately answer the question.

### Question:
{}

### Answer:
{}"""

if dataset_choice == 2:
    prompt = """You are a data scientist specialized in a video game. Below is a question. Write a response that appropriately answer the question.

### Question:
{}

### Answer:
{}"""

test_data = test_data.filter(lambda x: x["output"] != "")

### Evaluation

In [8]:
# Charger les métriques
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def compute_cosine_similarity(predictions, references):
    pred_embeddings = embedder.encode(predictions, convert_to_tensor=True)
    ref_embeddings = embedder.encode(references, convert_to_tensor=True)

    cosine_scores = torch.nn.functional.cosine_similarity(pred_embeddings, ref_embeddings)
    return sum(cosine_scores) / len(cosine_scores)

def compute_metrics(test_data):
    FastLanguageModel.for_inference(model)
    total_rouge1, total_rouge2, total_rougeL, total_bleu, total_meteor, total_bertscore, total_cos = 0, 0, 0, 0, 0, 0, 0
    num_samples = len(test_data)
    batch_size = 128
    num_batches = (num_samples + batch_size - 1) // batch_size

    pbar = tqdm(total=num_batches, desc="Processing", unit="batch")

    for i in range(0, num_samples, batch_size):
        batch = test_data[i:i+batch_size]

        inputs = tokenizer(
        [
            prompt.format(
                sample,
                "",
            ) for sample in batch["input"]
        ], return_tensors = "pt", padding=True).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        outputs = [decod.split("Answer:")[1].strip() for decod in outputs]
        reference_text = batch["output"]

        # Calculer les métriques sur chaque prédiction
        rouge_scores = rouge.compute(predictions=outputs, references=reference_text)
        bleu_scores = bleu.compute(predictions=outputs, references=reference_text)
        meteor_scores = meteor.compute(predictions=outputs, references=reference_text)
        bert_scores = bertscore.compute(predictions=outputs, references=reference_text, lang="en")
        cos_scores = compute_cosine_similarity(predictions=outputs, references=reference_text)


        # Accumuler les scores
        total_rouge1 += rouge_scores["rouge1"].item()
        total_rouge2 += rouge_scores["rouge2"].item()
        total_rougeL += rouge_scores["rougeL"].item()
        total_bleu += bleu_scores["bleu"]
        total_meteor += meteor_scores["meteor"].item()
        list_bertscore = bert_scores["f1"]
        bert_scores = sum(list_bertscore) / len(list_bertscore)
        total_bertscore += bert_scores
        total_cos += cos_scores.item()

        # Mettre à jour la barre de progression avec les scores actuels
        pbar.set_postfix(
            rouge1=rouge_scores["rouge1"],
            rouge2=rouge_scores["rouge2"],
            rougeL=rouge_scores["rougeL"],
            bleu=bleu_scores["bleu"],
            meteor=meteor_scores["meteor"],
            bertscore=bert_scores,
            cosine_sim=cos_scores.item()
        )

        # Avancer la barre de progression
        pbar.update(1)

    # Calculer la moyenne des scores sur tout le batch
    avg_rouge1 = total_rouge1 / num_batches
    avg_rouge2 = total_rouge2 / num_batches
    avg_rougeL = total_rougeL / num_batches
    avg_bleu = total_bleu / num_batches
    avg_meteor = total_meteor / num_batches
    avg_bert = total_bertscore / num_batches
    avg_cos = total_cos / num_batches

    # Afficher les scores moyens
    pbar.set_postfix(
        rouge1=avg_rouge1,
        rouge2=avg_rouge2,
        rougeL=avg_rougeL,
        bleu=avg_bleu,
        meteor=avg_meteor,
        bertscore=avg_bert,
        cosine_sim=avg_cos
    )

    pbar.close()

    time_e = pbar.format_dict["elapsed"]
    m, s = divmod(time_e, 60)

    return {
        "avg_rouge1": avg_rouge1,
        "avg_rouge2": avg_rouge2,
        "avg_rougeL": avg_rougeL,
        "avg_bleu": avg_bleu,
        "avg_meteor": avg_meteor,
        "avg_bert": avg_bert,
        "avg_cos": avg_cos,
        "time" : f"{m}:{s:.2f}"
    }

In [None]:
res = compute_metrics(test_data)
print(res)




Processing:   0%|          | 0/8 [00:00<?, ?batch/s][A[A[A


Processing:   0%|          | 0/8 [00:55<?, ?batch/s, bertscore=0.867, bleu=0.12, cosine_sim=0.732, meteor=0.332, rouge1=0.309, rouge2=0.153, rougeL=0.227][A[A[A


Processing:  12%|█▎        | 1/8 [00:55<06:27, 55.41s/batch, bertscore=0.867, bleu=0.12, cosine_sim=0.732, meteor=0.332, rouge1=0.309, rouge2=0.153, rougeL=0.227][A[A[A

In [None]:
with open(f"results_eval_r{eval_info}_dataset{dataset_choice}.txt", "a") as f:
    f.write(str(res) + "\n")