# LLM PROJECT - RAG NOTEBOOK

→ 40min on Nvidia T4 \
→ Compute BLEU-4, ROUGE-1, ROUGE-2, ROUGE-L, METEOR, Cosine Similarity, BERTSCORE using RAG  \
→ Need around 16 GB CUDA Memory (or change batch size)

### Installation

In [1]:
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth evaluate rouge-score faiss-cpu sentence-transformers bert_score
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo evaluate rouge-score faiss-cpu sentence-transformers bert_score
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth



In [1]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import evaluate
import sys
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


### Model

In [2]:
rag_info = "ft_r2" # Whatever information for archiving, such as "fine-tuned", "base_model", "LoRA_rank=4", ...

In [3]:
max_seq_length = 2048
dtype = None # 
load_in_4bit = True # Use 4bit quantization to reduce memory usage.

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "meta-llama/Llama-3.2-3B",   # <- Any Base model
    model_name = "lora_model_r2",            # <- Or fine-tuned model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Inference mode

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.568 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.18 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

### Data Preparation

In [4]:
# DATASET N°1: MED
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
dataset = dataset["train"].select(range(5000, 15000)).remove_columns("instruction")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

dataset_choice = 1

In [None]:
# # DATASET N°2: LoL                # Uncomment to use this dataset
# dataset = load_dataset("json", data_files="qa_lol.json")
# dataset = dataset.rename_columns({"question": "input", "answer": "output"})
# dataset = dataset.shuffle(seed=42)

# dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
# train_data = dataset["train"]
# test_data = dataset["test"]

# dataset_choice = 2

In [5]:
test_data = test_data.filter(lambda x: x["output"] != "")
full_data = concatenate_datasets([train_data, test_data])

### RAG Preparation

In [6]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_answer = embedder.encode(full_data["output"], show_progress_bar=True)

Batches: 100%|██████████| 313/313 [00:09<00:00, 33.27it/s] 


In [7]:
dimension = embeddings_answer.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_answer)

In [8]:
def retrieve_context(query, k=3):
    query_embeddings = embedder.encode(query, batch_size=32, show_progress_bar=False)
    distances, indices = index.search(np.array(query_embeddings), k)

    batch_contexts = []
    for idx_set in indices:
        contexts = [full_data["output"][i] for i in idx_set]
        batch_contexts.append(contexts)

    return batch_contexts

In [None]:
if dataset_choice == 1:
    prompt_RAG = """You are a physician. Below is a question. You have access to a database, where you retrieved some potential information about the question. Write a response that appropriately completes the request. If you don't know the answer, just say that you don't know. If possible, use the database and find a answer as close as the one in the database.

### Question:
{}

### Database:
{}

### Answer:
{}"""

if dataset_choice == 2:
    prompt_RAG = """You are a professional League of Legends expert. You must answer only questions related to champions using the information available in the database. If the answer is not in the database, simply reply: "I don't know." Your answers must be short, precise, and contain no extra information.

### Question:
{}

### Database:
{}

### Answer:
{}"""

In [10]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/onyxia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM',

In [None]:
def compute_cosine_similarity(predictions, references):
    pred_embeddings = embedder.encode(predictions, convert_to_tensor=True)
    ref_embeddings = embedder.encode(references, convert_to_tensor=True)

    cosine_scores = torch.nn.functional.cosine_similarity(pred_embeddings, ref_embeddings)
    return sum(cosine_scores) / len(cosine_scores)


def compute_metrics_rag(test_data):
    FastLanguageModel.for_inference(model)
    total_rouge1, total_rouge2, total_rougeL, total_bleu, total_meteor, total_bertscore, total_cos = 0, 0, 0, 0, 0, 0, 0
    num_samples = len(test_data)
    batch_size = 32
    num_batches = (num_samples + batch_size - 1) // batch_size

    pbar = tqdm(total=num_batches, desc="Processing", unit="batch")

    for i in range(0, num_samples, batch_size):
        batch = test_data[i:i+batch_size]
        

        contexts = retrieve_context(batch["input"], k=3)

        prompts = [prompt_RAG.format(query, context[:1600], "") for query, context in zip(batch["input"], contexts)]

        response = generator(prompts, max_new_tokens=128, do_sample=True, temperature=0.7)
        outputs = [every_response[0]["generated_text"].split("Answer:")[-1].strip() if every_response[0]["generated_text"].split("Answer:")[1].strip() else "No response" for every_response in response]

        reference_text = batch["output"]

        rouge_scores = rouge.compute(predictions=outputs, references=reference_text)
        bleu_scores = bleu.compute(predictions=outputs, references=reference_text)
        meteor_scores = meteor.compute(predictions=outputs, references=reference_text)
        bert_scores = bertscore.compute(predictions=outputs, references=reference_text, lang="en")
        cos_scores = compute_cosine_similarity(predictions=outputs, references=reference_text)


        total_rouge1 += rouge_scores["rouge1"].item()
        total_rouge2 += rouge_scores["rouge2"].item()
        total_rougeL += rouge_scores["rougeL"].item()
        total_bleu += bleu_scores["bleu"]
        total_meteor += meteor_scores["meteor"].item()
        list_bertscore = bert_scores["f1"]
        bert_scores = sum(list_bertscore) / len(list_bertscore)
        total_bertscore += bert_scores
        total_cos += cos_scores.item()

        pbar.set_postfix(
            rouge1=rouge_scores["rouge1"],
            rouge2=rouge_scores["rouge2"],
            rougeL=rouge_scores["rougeL"],
            bleu=bleu_scores["bleu"],
            meteor=meteor_scores["meteor"],
            bertscore=bert_scores,
            cosine_sim=cos_scores.item()
        )

        pbar.update(1)

    avg_rouge1 = total_rouge1 / num_batches
    avg_rouge2 = total_rouge2 / num_batches
    avg_rougeL = total_rougeL / num_batches
    avg_bleu = total_bleu / num_batches
    avg_meteor = total_meteor / num_batches
    avg_bert = total_bertscore / num_batches
    avg_cos = total_cos / num_batches

    pbar.set_postfix(
        rouge1=avg_rouge1,
        rouge2=avg_rouge2,
        rougeL=avg_rougeL,
        bleu=avg_bleu,
        meteor=avg_meteor,
        bertscore=avg_bert,
        cosine_sim=avg_cos
    )

    pbar.close()

    time_e = pbar.format_dict["elapsed"]
    m, s = divmod(time_e, 60)




    return {
        "avg_rouge1": avg_rouge1,
        "avg_rouge2": avg_rouge2,
        "avg_rougeL": avg_rougeL,
        "avg_bleu": avg_bleu,
        "avg_meteor": avg_meteor,
        "avg_bert": avg_bert,
        "avg_cos": avg_cos,
        "time" : f"{m}:{s:.2f}"
    }

In [12]:
res = compute_metrics_rag(test_data)
print(res)

with open(f"results_rag_{rag_info}_dataset{dataset_choice}.txt", "a") as f:
    f.write(str(res) +"\n")

Processing:   0%|          | 0/32 [00:00<?, ?batch/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing:  31%|███▏      | 10/32 [12:51<29:25, 80.24s/batch, bertscore=0.942, bleu=0.435, cosine_sim=0.864, meteor=0.68, rouge1=0.681, rouge2=0.6, rougeL=0.617]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 32/32 [44:05<00:00, 82.66s/batch, bertscore=0.931, bleu=0.265, cosine_sim=0.864, meteor=0.553, rouge1=0.58, rouge2=0.501, rougeL=0.532]  

{'avg_rouge1': 0.5800994768014688, 'avg_rouge2': 0.5014356843894736, 'avg_rougeL': 0.5316870438911251, 'avg_bleu': 0.26477442447998334, 'avg_meteor': 0.5533433302242828, 'avg_bert': 0.9309115857218525, 'avg_cos': 0.863942900672555, 'time': '44.0:5.24'}



