In [2]:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
from peft import PeftModel
import faiss
import numpy as np


In [3]:
ds_corpus = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
passages_ds = ds_corpus["passages"]

print(ds_corpus)
print(passages_ds[0])




DatasetDict({
    passages: Dataset({
        features: ['passage', 'id'],
        num_rows: 3200
    })
})
{'passage': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', 'id': 0}


In [None]:
texts = []
for ex in passages_ds:
    
    if isinstance(ex["passage"], list):
        texts.extend(ex["passage"])
    else:
        texts.append(ex["passage"])

print("Nb de passages:", len(texts))
print(texts[0][:300])


Nb de passages: 3200
Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.


In [7]:
from sentence_transformers import SentenceTransformer

# Modèle d'embedding simple et fiable
embed_name = "sentence-transformers/all-MiniLM-L6-v2"

model_e = SentenceTransformer(embed_name)
print("Embedding model loaded:", embed_name)

# Exemple de test
texts_test = [
    "Uruguay is a country in South America.",
    "Harry Potter is a series of fantasy novels written by J.K. Rowling.",
]

embs_test = model_e.encode(texts_test, convert_to_numpy=True, normalize_embeddings=True)
print("Embeddings shape:", embs_test.shape)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 756.86it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding model loaded: sentence-transformers/all-MiniLM-L6-v2
Embeddings shape: (2, 384)


In [8]:
# Optionnel : pour aller plus vite au début, tu peux limiter
# texts = texts[:2000]

embs = model_e.encode(
    texts,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)

dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product sur vecteurs normalisés ≈ cosine
index.add(embs)

print("Index dimension:", dim)
print("Index size:", index.ntotal)


Batches: 100%|██████████| 50/50 [00:02<00:00, 23.88it/s]

Index dimension: 384
Index size: 3200





In [9]:
def search(query, k=5):
    # Encode la question
    q_emb = model_e.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    # Recherche top-k dans l'index
    scores, idx = index.search(q_emb, k)
    idx = idx[0].tolist()
    scores = scores[0].tolist()
    # Retourne une liste (score, passage)
    return [(scores[i], texts[idx[i]]) for i in range(len(idx))]

# Test
test_query = "Who is the author of Harry Potter?"
retr = search(test_query, k=3)
for s, t in retr:
    print("Score:", s)
    print("Passage:", t[:200], "...\n")


Score: 0.41328710317611694
Passage: Grant writing his memoirs. ...

Score: 0.40589794516563416
Passage: * Davidson, Hugh M. Blaise Pascal. Boston: Twayne Publishers, 1983. ...

Score: 0.36122581362724304
Passage: *Garland, Hamlin, Ulysses S. Grant: His Life and Character, Macmillan Company, 1898. ...



In [10]:
model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)

model = PeftModel.from_pretrained(base_model, "./mistral-rag-mini-lora")

print("Mistral + LoRA loaded.")


Loading weights: 100%|██████████| 291/291 [00:17<00:00, 16.52it/s, Materializing param=model.norm.weight]                              


Mistral + LoRA loaded.


In [11]:
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.2,
    top_p=0.9,
)

def generate_answer(prompt):
    out = gen(prompt)[0]["generated_text"]
    return out


Passing `generation_config` together with generation-related arguments=({'top_p', 'temperature', 'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


In [12]:
def build_prompt(query, retrieved_passages):
    ctx_text = "\n\n".join([p for _, p in retrieved_passages])
    prompt = (
        "You are a helpful assistant. Use ONLY the following context to answer the question.\n\n"
        f"Context:\n{ctx_text}\n\n"
        f"Question: {query}\n\n"
        "Answer:"
    )
    return prompt


In [13]:
def rag_answer(query, k=5, show_prompt=False):
    retrieved = search(query, k=k)          # 1) retrieval dense
    prompt = build_prompt(query, retrieved) # 2) construction du prompt
    answer = generate_answer(prompt)        # 3) génération Mistral
    if show_prompt:
        print("=== PROMPT ===\n", prompt, "\n")
    return answer, retrieved, prompt

# Test
q = "Who is the author of Harry Potter?"
answer, retrieved, prompt = rag_answer(q, k=5, show_prompt=True)

print("=== ANSWER ===\n", answer)


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== PROMPT ===
 You are a helpful assistant. Use ONLY the following context to answer the question.

Context:
Grant writing his memoirs.

* Davidson, Hugh M. Blaise Pascal. Boston: Twayne Publishers, 1983.

*Garland, Hamlin, Ulysses S. Grant: His Life and Character, Macmillan Company, 1898.

* Farrell, John. "Pascal and Power". Chapter seven of Paranoia and Modernity: Cervantes to Rousseau (Cornell UP, 2006).

* Williams, L. Pearce (1971), Faraday: A Biography, Simon and Schuster. 

Question: Who is the author of Harry Potter?

Answer: 

=== ANSWER ===
 You are a helpful assistant. Use ONLY the following context to answer the question.

Context:
Grant writing his memoirs.

* Davidson, Hugh M. Blaise Pascal. Boston: Twayne Publishers, 1983.

*Garland, Hamlin, Ulysses S. Grant: His Life and Character, Macmillan Company, 1898.

* Farrell, John. "Pascal and Power". Chapter seven of Paranoia and Modernity: Cervantes to Rousseau (Cornell UP, 2006).

* Williams, L. Pearce (1971), Faraday: A B

In [14]:
# 1) Ajouter un passage artificiel dans le corpus
custom_fact = "The creator of this notebook is named Merouane."
texts.append(custom_fact)

# 2) Recalculer les embeddings pour ce nouveau passage uniquement
import numpy as np

new_emb = model_e.encode([custom_fact], convert_to_numpy=True, normalize_embeddings=True)
index.add(new_emb)  # on ajoute ce vecteur à l'index FAISS

print("New index size:", index.ntotal)


New index size: 3201


In [15]:
q = "What is the name of the creator of this notebook?"
retr = search(q, k=3)
for s, t in retr:
    print("Score:", s)
    print("Passage:", t, "\n")


Score: 0.7658967971801758
Passage: The creator of this notebook is named Merouane. 

Score: 0.4244401752948761
Passage: * " Nikola Tesla". IEEE History Center, 2005. 

Score: 0.42050060629844666
Passage: * Rybak, James P., "Nikola Tesla: Scientific Savant". Popular Electronics, 1042170X, Nov99, Vol. 16, Issue 11. 



In [16]:
answer, retrieved, prompt = rag_answer(q, k=3, show_prompt=True)
print("=== ANSWER ===\n", answer)


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== PROMPT ===
 You are a helpful assistant. Use ONLY the following context to answer the question.

Context:
The creator of this notebook is named Merouane.

* " Nikola Tesla". IEEE History Center, 2005.

* Rybak, James P., "Nikola Tesla: Scientific Savant". Popular Electronics, 1042170X, Nov99, Vol. 16, Issue 11.

Question: What is the name of the creator of this notebook?

Answer: 

=== ANSWER ===
 You are a helpful assistant. Use ONLY the following context to answer the question.

Context:
The creator of this notebook is named Merouane.

* " Nikola Tesla". IEEE History Center, 2005.

* Rybak, James P., "Nikola Tesla: Scientific Savant". Popular Electronics, 1042170X, Nov99, Vol. 16, Issue 11.

Question: What is the name of the creator of this notebook?

Answer: Merouane
