In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 291/291 [00:29<00:00,  9.91it/s, Materializing param=model.norm.weight]                               


In [2]:
from datasets import load_dataset

ds1 = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
ds2 = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")

print(ds1)              # -> splits: test
print(ds1["test"][0])   # au lieu de "train"

print(ds2)              # -> splits: passages
print(ds2["passages"][0])  # au lieu de "train"


DatasetDict({
    test: Dataset({
        features: ['question', 'answer', 'id'],
        num_rows: 918
    })
})
{'question': 'Was Abraham Lincoln the sixteenth President of the United States?', 'answer': 'yes', 'id': 0}
DatasetDict({
    passages: Dataset({
        features: ['passage', 'id'],
        num_rows: 3200
    })
})
{'passage': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', 'id': 0}


In [3]:
qa_splits = ds1["test"].train_test_split(test_size=0.2, seed=42)
train_ds = qa_splits["train"]
val_ds   = qa_splits["test"]


In [4]:
def format_example(example):
    question = example["question"]
    answer = example["answer"]

    # Essaie de récupérer un contexte s'il existe
    if "positive_ctxs" in example:
        ctx_list = example["positive_ctxs"]
        context = "\n\n".join([ctx["text"] for ctx in ctx_list])

    elif "contexts" in example:
        context = "\n\n".join(example["contexts"])
    else:
        context = ""

    prompt = (
        "You are a helpful assistant. Use the context to answer the question.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer:"
    )

    example["prompt"] = prompt
    example["target"] = answer
    return example

train_fmt = train_ds.map(format_example)
val_fmt   = val_ds.map(format_example)

print(train_fmt[0]["prompt"])
print(train_fmt[0]["target"])


Map: 100%|██████████| 734/734 [00:00<00:00, 12949.14 examples/s]
Map: 100%|██████████| 184/184 [00:00<00:00, 11031.96 examples/s]

You are a helpful assistant. Use the context to answer the question.

Context:


Question: Has Singapore Changi Airport a network of 81 airlines connecting Singapore to 185 cities in 58 countries ?

Answer:
Yes





In [5]:
max_len = 512

def tokenize_fn(example):
    full_text = example["prompt"] + " " + example["target"]
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=max_len,
        padding="max_length",
    )

    input_ids = tokenized["input_ids"]
    labels = input_ids.copy()

    # Longueur réelle du prompt pour mettre -100 sur cette partie
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=max_len,
        padding="max_length",
    )["input_ids"]
    prompt_len = sum(1 for t in prompt_ids if t != tokenizer.pad_token_id)

    labels[:prompt_len] = [-100] * prompt_len
    tokenized["labels"] = labels
    return tokenized

train_tok = train_fmt.map(tokenize_fn, remove_columns=train_fmt.column_names)
val_tok   = val_fmt.map(tokenize_fn, remove_columns=val_fmt.column_names)

print(train_tok)
print(train_tok[0])


Map: 100%|██████████| 734/734 [00:00<00:00, 736.04 examples/s]
Map: 100%|██████████| 184/184 [00:00<00:00, 754.40 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 734
})
{'input_ids': [1, 995, 460, 264, 10865, 13892, 28723, 5938, 272, 2758, 298, 4372, 272, 2996, 28723, 13, 13, 2083, 28747, 13, 13, 13, 24994, 28747, 10981, 17366, 689, 602, 28710, 16795, 264, 3681, 302, 28705, 28783, 28740, 264, 22201, 19135, 17366, 298, 28705, 28740, 28783, 28782, 9245, 297, 28705, 28782, 28783, 5780, 1550, 13, 13, 2820, 16981, 28747, 5592, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2




In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./mistral-rag-mini-lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="no",    # ou "steps" si tu veux évaluer pendant le training
    bf16=True,
)


In [8]:
n_train = len(train_tok)
n_val   = len(val_tok)
print("train size:", n_train, "val size:", n_val)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,   # pas de select(range(2000)) car tu n'en as que ~734
    # si tu actives eval_strategy="steps", ajoute:
    # eval_dataset=val_tok,
)

trainer.train()

# Sauvegarde des poids LoRA + tokenizer
trainer.save_model("./mistral-rag-mini-lora")
tokenizer.save_pretrained("./mistral-rag-mini-lora")


train size: 734 val size: 184


Step,Training Loss
10,0.924657
20,0.02373
30,0.019397
40,0.023989


('./mistral-rag-mini-lora\\tokenizer_config.json',
 './mistral-rag-mini-lora\\tokenizer.json')

In [9]:
from transformers import pipeline

# model = ton modèle Mistral + LoRA déjà entraîné dans ce notebook
# tokenizer = le tokenizer Mistral déjà chargé

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.9,
)

ex = val_fmt[0]
print("PROMPT:\n", ex["prompt"])
print("GOLD:\n", ex["target"])

output = gen(ex["prompt"])[0]["generated_text"]
generated_answer = output[len(ex["prompt"]):].strip()
print("GEN:\n", generated_answer)



Passing `generation_config` together with generation-related arguments=({'top_p', 'max_new_tokens', 'temperature'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=128) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


PROMPT:
 You are a helpful assistant. Use the context to answer the question.

Context:


Question: What did Cleveland's opponents say in 1884 to counter his innocent image?

Answer:
GOLD:
 That he had fathered an illegitimate child
GEN:
 He was a drunkard, a gambler, and a womanizer.
