# Best parameters for Retrieval

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from typing import List
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
def generate_embeddings(texts: List[str], model_name="all-mpnet-base-v2", device="cpu"):
    """Encode ``texts`` with a SentenceTransformer model.

    Row ``i`` of the returned tensor always corresponds to ``texts[i]``.
    Callers such as ``evaluate_retrieval`` look embeddings up by position, so
    entries must never be dropped here: a non-string entry (e.g. NaN) is
    encoded as the empty string instead of being filtered out, which keeps
    the positions aligned.

    Args:
        texts: Texts to embed, one per row.
        model_name: SentenceTransformer checkpoint name passed to the
            constructor.
        device: Device string forwarded to ``model.encode``.

    Returns:
        A ``(embeddings, model)`` tuple: the tensor produced by
        ``model.encode(..., convert_to_tensor=True)`` and the loaded model.
    """
    model = SentenceTransformer(model_name)

    # Sanitize without changing the length: replace non-strings by "" rather
    # than removing them, otherwise every later index would be shifted.
    cleaned_texts = [t if isinstance(t, str) else "" for t in texts]

    embeddings = model.encode(
        cleaned_texts,
        convert_to_tensor=True,
        batch_size=64,
        show_progress_bar=True,
        device=device,
    )
    return embeddings, model


In [None]:
def evaluate_retrieval(df: pd.DataFrame, embeddings, model, top_ks: List[int] = [1, 3, 5, 10], similarity="cosine", device="cpu") -> dict:
    """Score how well each question retrieves its own chunk.

    The ground truth for the question in row *position* ``p`` of ``df`` is
    ``embeddings[p]``; the DataFrame index labels are ignored, so ``df`` does
    not need a 0..n-1 index.

    Args:
        df: Frame with a ``"question"`` column, one row per chunk embedding.
        embeddings: 2-D tensor of chunk embeddings, row-aligned with ``df``.
        model: Object exposing ``encode(text, convert_to_tensor=True,
            device=...)``; assumed to return a 1-D tensor for a single text.
        top_ks: Cut-offs at which the metrics are computed (never mutated).
        similarity: ``"cosine"`` ranks by cosine similarity; any other value
            ranks by the raw dot product.
        device: Device string forwarded to ``model.encode``.

    Returns:
        Dict with the mean ``recall@k``, ``precision@k`` and ``mrr@k`` for
        every ``k`` in ``top_ks`` plus ``avg_gt_sim``, the mean cosine
        similarity between each question and its ground-truth chunk.
    """
    scores = {f"recall@{k}": [] for k in top_ks}
    scores.update({f"precision@{k}": [] for k in top_ks})
    scores.update({f"mrr@{k}": [] for k in top_ks})
    sims = []

    # enumerate yields the row position, which is what indexes `embeddings`.
    for gt_idx, query in enumerate(df["question"]):
        query_emb = model.encode(query, convert_to_tensor=True, device=device)

        # Cosine similarity against every chunk, computed once and reused for
        # both the ground-truth similarity and (optionally) the ranking.
        cos_scores = torch.nn.functional.cosine_similarity(
            query_emb.unsqueeze(0), embeddings, dim=1
        )
        sims.append(cos_scores[gt_idx].item())

        if similarity == "cosine":
            all_scores = cos_scores
        else:
            all_scores = torch.matmul(embeddings, query_emb)

        sorted_indices = torch.argsort(all_scores, descending=True).cpu().numpy()

        # 1-based rank of the ground-truth chunk; it does not depend on k.
        matches = np.where(sorted_indices == gt_idx)[0]
        rank = int(matches[0]) + 1 if len(matches) else 0

        for k in top_ks:
            hit = 0 < rank <= k
            scores[f"recall@{k}"].append(int(hit))
            # Exactly one relevant chunk per question, so precision is hit / k.
            scores[f"precision@{k}"].append(1.0 / k if hit else 0.0)
            # MRR@k only credits the reciprocal rank when it falls in the top k.
            scores[f"mrr@{k}"].append(1.0 / rank if hit else 0.0)

    summary = {k: np.mean(v) for k, v in scores.items()}
    summary["avg_gt_sim"] = np.mean(sims)
    return summary

In [None]:
def run_eval_from_csv(csv_path: Path, model_name="all-mpnet-base-v2", top_ks: List[int] = [1, 3, 5, 10], similarity="cosine", device="cpu") -> dict:
    """Evaluate retrieval on one questions CSV and return its metrics.

    The CSV must provide ``chunk`` and ``question`` columns. Rows where either
    is missing or blank are removed and the index is rebuilt so that row
    positions line up with the embeddings.

    Returns:
        The metrics from ``evaluate_retrieval`` plus a ``"config"`` entry
        derived from the file stem (with the ``questions_`` prefix removed).
    """
    raw = pd.read_csv(csv_path)
    print(f"\n📂 Chargement {csv_path.name} ({len(raw)} lignes brutes)...")

    # Mandatory cleaning: drop missing values first, then blank strings.
    present = raw.dropna(subset=["chunk", "question"])
    with_chunk = present[present["chunk"].str.strip() != ""]
    with_question = with_chunk[with_chunk["question"].str.strip() != ""]
    # A fresh 0..n-1 index avoids an IndexError when rows were removed.
    df = with_question.reset_index(drop=True)

    print(f"✅ {len(df)} lignes après nettoyage")

    embeddings, model = generate_embeddings(df["chunk"].tolist(), model_name, device=device)
    results = evaluate_retrieval(df, embeddings, model, top_ks=top_ks, similarity=similarity, device=device)

    metrics_text = ", ".join([f"{name}={value:.3f}" for name, value in results.items()])
    print(f"✅ {csv_path.name} : {metrics_text}")

    config_name = csv_path.stem.replace("questions_", "")
    return {"config": config_name, **results}


In [None]:
def evaluate_all_configs(path_type: str, csv_path: Path, model_name="all-mpnet-base-v2",
                         top_ks: List[int] = [1, 3, 5, 10], similarity="cosine", device="cpu") -> pd.DataFrame:
    """Run the retrieval evaluation over one CSV or a directory of CSVs.

    Args:
        path_type: ``'dir'`` to evaluate every ``questions_cs*.csv`` found in
            ``csv_path`` (in sorted order), ``'file'`` to evaluate ``csv_path``
            itself.
        csv_path: Directory or file, depending on ``path_type``.
        model_name, top_ks, similarity, device: Forwarded unchanged to
            ``run_eval_from_csv``.

    Returns:
        A DataFrame with one row of metrics per evaluated CSV.

    Raises:
        ValueError: If ``path_type`` is neither ``'dir'`` nor ``'file'``.
    """
    if path_type not in ("dir", "file"):
        raise ValueError("path_type must be 'dir' or 'file'.")

    targets = sorted(csv_path.glob("questions_cs*.csv")) if path_type == "dir" else [csv_path]

    rows = [
        run_eval_from_csv(target, model_name=model_name, top_ks=top_ks, similarity=similarity, device=device)
        for target in targets
    ]
    return pd.DataFrame(rows)

In [None]:
# csv_file = Path(r"/content/questions_cs1024_ov256.csv")
# csv_dir = Path(r"/content/data")

# # Évaluation
# results_df = evaluate_all_configs(
#     path_type='dir',
#     csv_path=csv_dir,
#     model_name="all-mpnet-base-v2",
#     top_ks=[3,5,7,10,15,20],
#     similarity="cosine",
#     device="cuda"  # or "cpu" if no GPU is available
# )

# # Affichage final
# results_df.sort_values("recall@10", ascending=False)

In [None]:
from pathlib import Path

# Directory scanned for questions_cs*.csv files (one per chunking configuration).
csv_dir = Path("/content/data")

# Embedding checkpoints to compare.
models_to_test = [
    "all-mpnet-base-v2",
    "all-MiniLM-L6-v2",
    "distiluse-base-multilingual-cased-v2",
    "BAAI/bge-base-en-v1.5",
    "sentence-transformers/gtr-t5-base"
]

# Use the GPU when one is visible and fall back to CPU otherwise, instead of
# hard-coding "cuda" (which fails on a runtime without a GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"

results = []

for model_name in models_to_test:
    print(f"\n🔍 Évaluation avec le modèle : {model_name}")
    df_results = evaluate_all_configs(
        path_type='dir',
        csv_path=csv_dir,
        model_name=model_name,
        top_ks=[5, 7, 10, 15, 20],
        similarity="cosine",
        device=device
    )
    # Tag every row with the model that produced it before stacking.
    df_results["embedding_model"] = model_name
    results.append(df_results)

df_all_models = pd.concat(results, ignore_index=True)



🔍 Évaluation avec le modèle : all-mpnet-base-v2

📂 Chargement questions_cs1024_ov256.csv (160 lignes brutes)...
✅ 159 lignes après nettoyage


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ questions_cs1024_ov256.csv : recall@5=0.786, recall@7=0.818, recall@10=0.843, recall@15=0.893, recall@20=0.925, precision@5=0.157, precision@7=0.117, precision@10=0.084, precision@15=0.060, precision@20=0.046, mrr@5=0.681, mrr@7=0.681, mrr@10=0.681, mrr@15=0.681, mrr@20=0.681, avg_gt_sim=0.684

📂 Chargement questions_cs256_ov64.csv (618 lignes brutes)...
✅ 618 lignes après nettoyage


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ questions_cs256_ov64.csv : recall@5=0.694, recall@7=0.728, recall@10=0.761, recall@15=0.798, recall@20=0.820, precision@5=0.139, precision@7=0.104, precision@10=0.076, precision@15=0.053, precision@20=0.041, mrr@5=0.552, mrr@7=0.552, mrr@10=0.552, mrr@15=0.552, mrr@20=0.552, avg_gt_sim=0.687

📂 Chargement questions_cs512_ov128.csv (309 lignes brutes)...
✅ 309 lignes après nettoyage


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ questions_cs512_ov128.csv : recall@5=0.715, recall@7=0.764, recall@10=0.806, recall@15=0.848, recall@20=0.867, precision@5=0.143, precision@7=0.109, precision@10=0.081, precision@15=0.057, precision@20=0.043, mrr@5=0.589, mrr@7=0.589, mrr@10=0.589, mrr@15=0.589, mrr@20=0.589, avg_gt_sim=0.669

📂 Chargement questions_cs768_ov128.csv (207 lignes brutes)...
✅ 207 lignes après nettoyage


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ questions_cs768_ov128.csv : recall@5=0.783, recall@7=0.797, recall@10=0.841, recall@15=0.870, recall@20=0.894, precision@5=0.157, precision@7=0.114, precision@10=0.084, precision@15=0.058, precision@20=0.045, mrr@5=0.662, mrr@7=0.662, mrr@10=0.662, mrr@15=0.662, mrr@20=0.662, avg_gt_sim=0.684

🔍 Évaluation avec le modèle : all-MiniLM-L6-v2

📂 Chargement questions_cs1024_ov256.csv (160 lignes brutes)...
✅ 159 lignes après nettoyage


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ questions_cs1024_ov256.csv : recall@5=0.780, recall@7=0.818, recall@10=0.855, recall@15=0.874, recall@20=0.918, precision@5=0.156, precision@7=0.117, precision@10=0.086, precision@15=0.058, precision@20=0.046, mrr@5=0.702, mrr@7=0.702, mrr@10=0.702, mrr@15=0.702, mrr@20=0.702, avg_gt_sim=0.653

📂 Chargement questions_cs256_ov64.csv (618 lignes brutes)...
✅ 618 lignes après nettoyage


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ questions_cs256_ov64.csv : recall@5=0.759, recall@7=0.801, recall@10=0.824, recall@15=0.846, recall@20=0.864, precision@5=0.152, precision@7=0.114, precision@10=0.082, precision@15=0.056, precision@20=0.043, mrr@5=0.633, mrr@7=0.633, mrr@10=0.633, mrr@15=0.633, mrr@20=0.633, avg_gt_sim=0.658

📂 Chargement questions_cs512_ov128.csv (309 lignes brutes)...
✅ 309 lignes après nettoyage


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ questions_cs512_ov128.csv : recall@5=0.744, recall@7=0.764, recall@10=0.783, recall@15=0.848, recall@20=0.858, precision@5=0.149, precision@7=0.109, precision@10=0.078, precision@15=0.057, precision@20=0.043, mrr@5=0.642, mrr@7=0.642, mrr@10=0.642, mrr@15=0.642, mrr@20=0.642, avg_gt_sim=0.637

📂 Chargement questions_cs768_ov128.csv (207 lignes brutes)...
✅ 207 lignes après nettoyage


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ questions_cs768_ov128.csv : recall@5=0.816, recall@7=0.831, recall@10=0.855, recall@15=0.870, recall@20=0.879, precision@5=0.163, precision@7=0.119, precision@10=0.086, precision@15=0.058, precision@20=0.044, mrr@5=0.721, mrr@7=0.721, mrr@10=0.721, mrr@15=0.721, mrr@20=0.721, avg_gt_sim=0.644

🔍 Évaluation avec le modèle : distiluse-base-multilingual-cased-v2

📂 Chargement questions_cs1024_ov256.csv (160 lignes brutes)...
✅ 159 lignes après nettoyage


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ questions_cs1024_ov256.csv : recall@5=0.635, recall@7=0.686, recall@10=0.717, recall@15=0.774, recall@20=0.818, precision@5=0.127, precision@7=0.098, precision@10=0.072, precision@15=0.052, precision@20=0.041, mrr@5=0.580, mrr@7=0.580, mrr@10=0.580, mrr@15=0.580, mrr@20=0.580, avg_gt_sim=0.446

📂 Chargement questions_cs256_ov64.csv (618 lignes brutes)...
✅ 618 lignes après nettoyage


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ questions_cs256_ov64.csv : recall@5=0.555, recall@7=0.604, recall@10=0.633, recall@15=0.668, recall@20=0.699, precision@5=0.111, precision@7=0.086, precision@10=0.063, precision@15=0.045, precision@20=0.035, mrr@5=0.463, mrr@7=0.463, mrr@10=0.463, mrr@15=0.463, mrr@20=0.463, avg_gt_sim=0.443

📂 Chargement questions_cs512_ov128.csv (309 lignes brutes)...
✅ 309 lignes après nettoyage


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ questions_cs512_ov128.csv : recall@5=0.602, recall@7=0.663, recall@10=0.699, recall@15=0.751, recall@20=0.773, precision@5=0.120, precision@7=0.095, precision@10=0.070, precision@15=0.050, precision@20=0.039, mrr@5=0.529, mrr@7=0.529, mrr@10=0.529, mrr@15=0.529, mrr@20=0.529, avg_gt_sim=0.441

📂 Chargement questions_cs768_ov128.csv (207 lignes brutes)...
✅ 207 lignes après nettoyage


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ questions_cs768_ov128.csv : recall@5=0.643, recall@7=0.696, recall@10=0.749, recall@15=0.773, recall@20=0.797, precision@5=0.129, precision@7=0.099, precision@10=0.075, precision@15=0.052, precision@20=0.040, mrr@5=0.519, mrr@7=0.519, mrr@10=0.519, mrr@15=0.519, mrr@20=0.519, avg_gt_sim=0.429

🔍 Évaluation avec le modèle : BAAI/bge-base-en-v1.5

📂 Chargement questions_cs1024_ov256.csv (160 lignes brutes)...
✅ 159 lignes après nettoyage


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ questions_cs1024_ov256.csv : recall@5=0.818, recall@7=0.862, recall@10=0.868, recall@15=0.887, recall@20=0.925, precision@5=0.164, precision@7=0.123, precision@10=0.087, precision@15=0.059, precision@20=0.046, mrr@5=0.708, mrr@7=0.708, mrr@10=0.708, mrr@15=0.708, mrr@20=0.708, avg_gt_sim=0.806

📂 Chargement questions_cs256_ov64.csv (618 lignes brutes)...
✅ 618 lignes après nettoyage


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ questions_cs256_ov64.csv : recall@5=0.791, recall@7=0.819, recall@10=0.835, recall@15=0.869, recall@20=0.879, precision@5=0.158, precision@7=0.117, precision@10=0.083, precision@15=0.058, precision@20=0.044, mrr@5=0.667, mrr@7=0.667, mrr@10=0.667, mrr@15=0.667, mrr@20=0.667, avg_gt_sim=0.811

📂 Chargement questions_cs512_ov128.csv (309 lignes brutes)...
✅ 309 lignes après nettoyage


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ questions_cs512_ov128.csv : recall@5=0.790, recall@7=0.828, recall@10=0.871, recall@15=0.896, recall@20=0.916, precision@5=0.158, precision@7=0.118, precision@10=0.087, precision@15=0.060, precision@20=0.046, mrr@5=0.676, mrr@7=0.676, mrr@10=0.676, mrr@15=0.676, mrr@20=0.676, avg_gt_sim=0.795

📂 Chargement questions_cs768_ov128.csv (207 lignes brutes)...
✅ 207 lignes après nettoyage


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ questions_cs768_ov128.csv : recall@5=0.816, recall@7=0.826, recall@10=0.889, recall@15=0.894, recall@20=0.903, precision@5=0.163, precision@7=0.118, precision@10=0.089, precision@15=0.060, precision@20=0.045, mrr@5=0.722, mrr@7=0.722, mrr@10=0.722, mrr@15=0.722, mrr@20=0.722, avg_gt_sim=0.798

🔍 Évaluation avec le modèle : sentence-transformers/gtr-t5-base

📂 Chargement questions_cs1024_ov256.csv (160 lignes brutes)...
✅ 159 lignes après nettoyage


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ questions_cs1024_ov256.csv : recall@5=0.805, recall@7=0.830, recall@10=0.855, recall@15=0.881, recall@20=0.899, precision@5=0.161, precision@7=0.119, precision@10=0.086, precision@15=0.059, precision@20=0.045, mrr@5=0.674, mrr@7=0.674, mrr@10=0.674, mrr@15=0.674, mrr@20=0.674, avg_gt_sim=0.773

📂 Chargement questions_cs256_ov64.csv (618 lignes brutes)...
✅ 618 lignes après nettoyage


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ questions_cs256_ov64.csv : recall@5=0.749, recall@7=0.780, recall@10=0.814, recall@15=0.848, recall@20=0.867, precision@5=0.150, precision@7=0.111, precision@10=0.081, precision@15=0.057, precision@20=0.043, mrr@5=0.608, mrr@7=0.608, mrr@10=0.608, mrr@15=0.608, mrr@20=0.608, avg_gt_sim=0.783

📂 Chargement questions_cs512_ov128.csv (309 lignes brutes)...
✅ 309 lignes après nettoyage


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ questions_cs512_ov128.csv : recall@5=0.764, recall@7=0.793, recall@10=0.816, recall@15=0.838, recall@20=0.858, precision@5=0.153, precision@7=0.113, precision@10=0.082, precision@15=0.056, precision@20=0.043, mrr@5=0.610, mrr@7=0.610, mrr@10=0.610, mrr@15=0.610, mrr@20=0.610, avg_gt_sim=0.768

📂 Chargement questions_cs768_ov128.csv (207 lignes brutes)...
✅ 207 lignes après nettoyage


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ questions_cs768_ov128.csv : recall@5=0.792, recall@7=0.831, recall@10=0.850, recall@15=0.884, recall@20=0.894, precision@5=0.158, precision@7=0.119, precision@10=0.085, precision@15=0.059, precision@20=0.045, mrr@5=0.666, mrr@7=0.666, mrr@10=0.666, mrr@15=0.666, mrr@20=0.666, avg_gt_sim=0.769


In [None]:
# Destination of the combined comparison table.
export_path = "/content/retrieval_comparaison_embeddings.csv"

# Stack the per-model result frames into one table and write it as CSV.
df_all_models = pd.concat(results, ignore_index=True)
df_all_models.to_csv(export_path, index=False)

print(f"\n✅ Résultats enregistrés dans : {export_path}")


✅ Résultats enregistrés dans : /content/retrieval_comparaison_embeddings.csv


#

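# The best parameters are: embedding model `BAAI/bge-base-en-v1.5` with the `cs1024_ov256` chunking configuration (the setup used by the cells below).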


# save best parameters


In [None]:
!pip install -q sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import pickle
import os

In [None]:
# Retrieval settings: encoder checkpoint and the names of the two cache files.
embedding_model_name = "BAAI/bge-base-en-v1.5"
data_file = "retrieval_data_bge_v15.pkl"
embedding_file = "embeddings_bge_v15.pt"

# Prefer the GPU whenever torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the DataFrame, which is expected to contain the columns:
# chunk, question, lang, generated_answer, short_answer, generated_answer_chatgpt, short_answer_chatgpt
df = pd.read_csv(r"/content/answers_with_chatgpt_160.csv")  # replace with the actual path to your CSV file

In [None]:
# Reuse the cached artefacts only when BOTH files exist; otherwise rebuild them.
cache_ready = all(os.path.exists(path) for path in (embedding_file, data_file))

if not cache_ready:
    print("⚙️ Génération des embeddings avec BAAI/bge-base-en-v1.5 ...")
    model = SentenceTransformer(embedding_model_name, device=device)

    # Chunk texts, with missing values replaced by "" and everything cast to str.
    chunks = df["chunk"].fillna("").astype(str).tolist()

    # Encode with L2 normalisation (recommended for BGE retrieval by the original author).
    embeddings = model.encode(
        chunks,
        batch_size=64,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=device,
    )

    # Persist both artefacts so the next run can skip the encoding step.
    torch.save(embeddings, embedding_file)
    with open(data_file, "wb") as f:
        pickle.dump(df, f)

    print("✅ Embeddings et DataFrame sauvegardés.")
else:
    print("📥 Chargement des embeddings et du DataFrame depuis les fichiers...")
    embeddings = torch.load(embedding_file, map_location=device)

    # The pickled DataFrame replaces the one loaded from CSV above.
    with open(data_file, "rb") as f:
        df = pickle.load(f)

# Sanity check: tensor shape and the first 100 characters of the first chunk.
print("✅ Embeddings shape :", embeddings.shape)
print("✅ Exemple de chunk :", df["chunk"].iloc[0][:100], "...")

⚙️ Génération des embeddings avec BAAI/bge-base-en-v1.5 ...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Embeddings et DataFrame sauvegardés.
✅ Embeddings shape : torch.Size([160, 768])
✅ Exemple de chunk : Perspective Non-electroencephalogram-based seizure detection devices: State of the art and future pe ...


# Optimize generation

In [None]:
!pip install -q rouge-score bert-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import pickle
import pandas as pd
import random
from sentence_transformers import SentenceTransformer, util
import openai

In [None]:
import os

# Cache files produced by the retrieval section.
embedding_file = "embeddings_bge_v15.pt"
data_file = "retrieval_data_bge_v15.pkl"

# SECURITY: the API key must never be committed to the notebook. It is read
# from the MISTRAL_API_KEY environment variable instead. The key that used to
# be hard-coded here has been exposed and should be revoked.
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "")
if not mistral_api_key:
    print("⚠️ MISTRAL_API_KEY is not set; calls to the Mistral API will fail.")

mistral_model = "mistral-small"
top_k = 10          # number of chunks retrieved per question
sample_frac = 0.1   # fraction of the DataFrame that is evaluated
prompt_style = "complet"  # or "court"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the cached embeddings and the pickled DataFrame.
# NOTE(review): pickle-based loading can execute code embedded in the file;
# only load files that this notebook produced itself.
embeddings = torch.load(embedding_file, map_location=device)
with open(data_file, "rb") as f:
    df = pickle.load(f)

In [None]:
# 🔍 Fonction pour extraire les top-k chunks pour une question
def get_top_k_chunks(question, all_chunks, model, k=10, chunk_embs=None):
    """Return the ``k`` chunks most similar to ``question``, best first.

    Args:
        question: Query text.
        all_chunks: List of candidate chunk texts.
        model: Encoder exposing ``encode(...)`` as used below.
        k: Number of chunks to return; clamped to ``len(all_chunks)`` because
            ``torch.topk`` raises when asked for more items than exist.
        chunk_embs: Optional precomputed embeddings of ``all_chunks`` (same
            order). When omitted, the chunks are encoded on every call, which
            is expensive inside a loop.

    Returns:
        A list of at most ``k`` chunk texts; empty when there are no chunks
        or ``k`` is not positive.
    """
    k = min(k, len(all_chunks))
    if k <= 0:
        return []

    # `device` is the module-level device string defined in the config cell.
    question_emb = model.encode(question, convert_to_tensor=True, normalize_embeddings=True, device=device)
    if chunk_embs is None:
        chunk_embs = model.encode(all_chunks, convert_to_tensor=True, normalize_embeddings=True, device=device)

    scores = util.cos_sim(question_emb, chunk_embs)[0]
    # Convert to plain ints before indexing the Python list.
    top_k_idx = torch.topk(scores, k=k).indices.tolist()
    return [all_chunks[i] for i in top_k_idx]

# 📃 Fonction pour construire un prompt

In [None]:
def build_prompt_mistral(question, chunks, style="complet"):
    """Assemble the prompt sent to Mistral.

    The prompt consists of an instruction line, the numbered and stripped
    context chunks separated by blank lines, the question, and a trailing
    answer cue.

    Args:
        question: Question text inserted verbatim.
        chunks: Context strings; each is stripped and numbered from 1.
        style: ``"court"`` selects the short/factual instruction; any other
            value selects the clear/complete instruction.

    Returns:
        The prompt as a single string.
    """
    numbered = []
    for position, chunk in enumerate(chunks, start=1):
        numbered.append(f"[{position}] {chunk.strip()}")
    context = "\n\n".join(numbered)

    instruction = (
        "Réponds de manière courte et factuelle à la question suivante, uniquement en utilisant les informations ci-dessous. Si la réponse ne peut pas être trouvée, dis 'Je ne sais pas'."
        if style == "court"
        else "Réponds de manière claire et complète à la question suivante, uniquement en utilisant les informations ci-dessous. Ne fais pas d'invention."
    )

    # Empty entries become the blank lines separating the sections.
    sections = [
        instruction,
        "",
        "Contextes :",
        context,
        "",
        f"Question : {question}",
        "",
        "Réponse :",
    ]
    return "\n".join(sections)

In [None]:
def call_mistral(prompt, api_key, model="mistral-small", temperature=0.2, max_tokens=512):
    """Send ``prompt`` to Mistral's chat endpoint and return the reply text.

    The request goes through the ``openai`` package pointed at
    ``https://api.mistral.ai/v1``. Both generations of the package are
    supported: ``openai>=1.0`` exposes the ``OpenAI`` client class, while
    older releases use the module-level ``ChatCompletion`` API (removed in
    1.0).

    Args:
        prompt: User message content.
        api_key: Mistral API key.
        model: Model name sent to the API.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The stripped content of the first choice, or ``""`` when the API
        returns no content.
    """
    base_url = "https://api.mistral.ai/v1"
    messages = [
        {"role": "system", "content": "Tu es un assistant expert."},
        {"role": "user", "content": prompt},
    ]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: per-call client, no module-level state is mutated.
        client = openai.OpenAI(api_key=api_key, base_url=base_url)
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
    else:
        # Legacy openai<1.0 API, configured through module globals.
        openai.api_base = base_url
        openai.api_key = api_key
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response['choices'][0]['message']['content']

    # Content may be None; avoid calling .strip() on it.
    return (content or "").strip()


In [None]:
# Evaluate on a random subset of the DataFrame (fraction given by sample_frac).
sampled_df = df.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

# The encoder is not created anywhere else in this section, so load it here.
# It is the same checkpoint that produced the cached BGE embeddings.
model = SentenceTransformer("BAAI/bge-base-en-v1.5", device=device)

# Evaluation loop
results = []
all_chunks = df["chunk"].fillna("").astype(str).tolist()

for i, row in sampled_df.iterrows():
    question = row["question"]
    top_chunks = get_top_k_chunks(question, all_chunks, model, k=top_k)
    prompt = build_prompt_mistral(question, top_chunks, style=prompt_style)
    try:
        response = call_mistral(prompt, mistral_api_key, model=mistral_model)
    except Exception as e:
        # Best effort: record the failure in the output and keep going so a
        # single failed API call does not abort the whole run.
        response = f"[ERREUR] {e}"

    results.append({
        "question": question,
        "expected_short_answer": row["short_answer"],
        "expected_answer": row["generated_answer"],
        "mistral_answer": response,
        # Keep the retrieved context so the verification step can compare the
        # answer against it (it reads a "top_chunks" column).
        "top_chunks": "\n\n".join(top_chunks),
    })

# Convert to a DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv("mistral_eval_sample.csv", index=False)
print("✅ Évaluation terminée. Résultats sauvegardés dans 'mistral_eval_sample.csv'")


# vérification

In [None]:
pip install sentence-transformers pandas scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Load the DataFrame containing the answers and the contexts that were used
df = pd.read_csv(r"/content/mistral_eval_sample (1).csv")  # or another CSV
print("✅ Données chargées.")

✅ Données chargées.


In [None]:


# Initialise the encoder (same checkpoint as the retrieval step)
model = SentenceTransformer("BAAI/bge-base-en-v1.5")  # or "all-MiniLM-L6-v2" for a lighter model

# The context column is looked up once; adapt the name if necessary.
has_context = "top_chunks" in df.columns
if not has_context:
    # Without the column every row is skipped; say so instead of silently
    # writing a column full of empty values.
    print("⚠️ Column 'top_chunks' not found: all similarities will be None.")

# Encode the Mistral answers and their contexts, one row at a time
similarities = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Missing values become "" so they are skipped rather than scored as
    # the literal text "nan".
    answer = "" if pd.isna(row["mistral_answer"]) else str(row["mistral_answer"])
    chunks = str(row["top_chunks"]) if has_context and not pd.isna(row["top_chunks"]) else ""

    if not chunks.strip() or not answer.strip():
        similarities.append(None)
        continue

    emb_answer = model.encode(answer, convert_to_tensor=True)
    emb_context = model.encode(chunks, convert_to_tensor=True)

    sim = util.cos_sim(emb_answer, emb_context).item()
    similarities.append(sim)

# Add the similarity to the DataFrame
df["cosine_similarity_answer_vs_context"] = similarities

# Save
df.to_csv("mistral_eval_with_cosine.csv", index=False)
print("✅ Cosine similarity ajoutée au fichier 'mistral_eval_with_cosine.csv'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 16/16 [00:00<00:00, 3724.13it/s]

✅ Cosine similarity ajoutée au fichier 'mistral_eval_with_cosine.csv'





In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

# Load the required files: the evaluation sample and the pickled retrieval DataFrame
eval_df = pd.read_csv(r"/content/mistral_eval_sample_with_bertscore (1).csv")
retrieval_data = pd.read_pickle(r"/content/retrieval_data_bge_v15.pkl")

In [None]:


# Map each question to the chunk stored next to it in the retrieval data.
# NOTE: this yields ONE chunk per question (the last one if a question is
# duplicated), not a concatenation of top-k retrieved chunks, despite the
# column name below.
chunk_mapping = retrieval_data.set_index("question")["chunk"].to_dict()
eval_df["top_chunks_concat"] = eval_df["question"].map(chunk_mapping)

# Load the encoder on the GPU when available, otherwise on the CPU
encoder = SentenceTransformer(
    "BAAI/bge-base-en-v1.5",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Cosine similarity between the Mistral answer and the mapped chunk
cosine_similarities = []

for _, row in eval_df.iterrows():
    mistral_answer = row["mistral_answer"]
    top_chunks_text = row["top_chunks_concat"]
    # Rows with a missing answer or no matching chunk get no score.
    if pd.isna(mistral_answer) or pd.isna(top_chunks_text):
        cosine_similarities.append(None)
        continue
    emb_answer = encoder.encode(mistral_answer, convert_to_tensor=True, normalize_embeddings=True)
    emb_chunks = encoder.encode(top_chunks_text, convert_to_tensor=True, normalize_embeddings=True)
    cos_sim = util.cos_sim(emb_answer, emb_chunks).item()
    cosine_similarities.append(cos_sim)

# Add to the DataFrame
eval_df["cosine_similarity_mistral_vs_chunks"] = cosine_similarities

# Save the enriched file
eval_df.to_csv("/content/mistral_eval_sample_with_cosine.csv", index=False)
print("✅ Cosine similarity ajoutée au fichier.")


✅ Cosine similarity ajoutée au fichier.


# with reranking

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import pandas as pd
import torch

# Input files: the evaluation sample and the per-question retrieved chunks.
eval_file = r"/content/mistral_eval_sample_with_bertscore (1).csv"
retrieval_file = r"/content/answers_cs1024_ov256_mistral.csv"

# Load the data.
eval_df = pd.read_csv(eval_file)
retrieval_df = pd.read_csv(retrieval_file)

# Collect every retrieved chunk of a question into one list per question.
grouped_chunks = retrieval_df.groupby("question")["chunk"].apply(list).reset_index()
grouped_chunks.rename(columns={"chunk": "retrieved_chunks_list"}, inplace=True)

# Left merge: questions without retrieved chunks get NaN instead of a list.
eval_df = eval_df.merge(grouped_chunks, on="question", how="left")

# Load the bi-encoder (similarity) and the cross-encoder (reranking).
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder = SentenceTransformer("BAAI/bge-base-en-v1.5", device=device)
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=device)

# Number of reranked chunks kept per question (loop-invariant, so set once).
top_k = 5

# Reranking + cosine similarity, one entry per row of eval_df.
reranked_contexts = []
cosine_similarities = []

for _, row in eval_df.iterrows():
    question = row["question"]
    answer = row["mistral_answer"]
    chunk_list = row["retrieved_chunks_list"]

    # Keep only usable chunks: a missing chunk in the CSV shows up as a float
    # NaN inside the list and would break both the reranker input and the
    # string join below.
    if isinstance(chunk_list, list):
        chunk_list = [c for c in chunk_list if isinstance(c, str) and c.strip()]
    else:
        chunk_list = []

    # Rows without an answer or without any usable chunk cannot be scored.
    if pd.isna(answer) or not chunk_list:
        reranked_contexts.append("")
        cosine_similarities.append(None)
        continue

    # Step 1: score every (question, chunk) pair with the cross-encoder.
    rerank_inputs = [(question, chunk) for chunk in chunk_list]
    scores = reranker.predict(rerank_inputs)

    # Keep the top_k chunks, highest score first (the sort is stable, so ties
    # keep their retrieval order).
    ranked_pairs = sorted(zip(scores, chunk_list), key=lambda pair: pair[0], reverse=True)
    ranked_chunks = [chunk for _, chunk in ranked_pairs[:top_k]]
    reranked_text = " ".join(ranked_chunks)
    reranked_contexts.append(reranked_text)

    # Step 2: similarity between the answer and the reranked context.
    emb_answer = encoder.encode(answer, convert_to_tensor=True, normalize_embeddings=True)
    emb_context = encoder.encode(reranked_text, convert_to_tensor=True, normalize_embeddings=True)
    sim = util.cos_sim(emb_answer, emb_context).item()
    cosine_similarities.append(sim)

# Attach the results to the DataFrame.
eval_df["reranked_chunks_concat"] = reranked_contexts
eval_df["cosine_similarity_mistral_vs_reranked"] = cosine_similarities

# Grounding rate: share of scored answers whose similarity exceeds the threshold.
threshold = 0.7
valid = pd.to_numeric(eval_df["cosine_similarity_mistral_vs_reranked"], errors="coerce").dropna()
total = int(valid.count())
grounded = int((valid > threshold).sum())

if total:
    percentage = 100 * grounded / total
    print(f"✅ Réponses grounded (cosine > {threshold}) : {grounded}/{total} soit {percentage:.2f}%")
else:
    # No row could be scored: avoid the 0/0 division and say so explicitly.
    percentage = float("nan")
    print("⚠️ Aucune réponse valide : taux de grounding non calculable.")

# Save the enriched evaluation file.
eval_df.to_csv(r"/content/mistral_eval_with_reranking_and_grounding.csv", index=False)
print("💾 Fichier sauvegardé : /content/mistral_eval_with_reranking_and_grounding.csv")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ Réponses grounded (cosine > 0.7) : 15/16 soit 93.75%
💾 Fichier sauvegardé : /content/mistral_eval_with_reranking_and_grounding.csv


# eval 1

In [None]:
!pip install -q openai --upgrade
!pip install -q sentence-transformers

import torch
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI

import os  # local to this cell: only used to read the API key from the environment

# Parameters
embedding_file = "/content/embeddings_bge_v15.pt"
data_file = "/content/retrieval_data_bge_v15.pkl"
# Prefer the MISTRAL_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook; the placeholder stays as fallback.
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "your-mistral-api-key")
mistral_model = "mistral-small"
top_k = 10
sample_frac = 0.1
prompt_style = "complet"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the precomputed embeddings and the retrieval DataFrame.
# NOTE(review): `embeddings` is not referenced again in the visible rest of
# this cell — confirm whether it is still needed.
embeddings = torch.load(embedding_file, map_location=device)
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a .pkl produced by a trusted run of this notebook.
with open(data_file, "rb") as f:
    df = pickle.load(f)

# OpenAI-compatible client pointed at the Mistral API.
client = OpenAI(
    base_url="https://api.mistral.ai/v1",
    api_key=mistral_api_key
)

# Embedding model used for retrieval.
retrieval_model = SentenceTransformer("BAAI/bge-base-en-v1.5", device=device)

# 🔎 Fonction : top-k chunks

# Embeddings of the most recently used (model, corpus) pair, so that repeated
# queries against the same chunks do not re-encode the whole corpus each call.
_CHUNK_EMBEDDING_CACHE = {}


def get_top_k_chunks(question, all_chunks, model, k=10):
    """Return the ``k`` chunks most similar to ``question``, best match first.

    Args:
        question: Query text.
        all_chunks: Sequence of chunk texts to search.
        model: Object exposing a SentenceTransformer-style
            ``encode(texts, convert_to_tensor=..., normalize_embeddings=...)``.
        k: Maximum number of chunks to return; clamped to ``len(all_chunks)``
            because ``torch.topk`` raises when ``k`` exceeds the input size.

    Returns:
        A list of at most ``k`` chunks ordered by decreasing cosine
        similarity; an empty list when there are no chunks or ``k <= 0``.

    Encoding runs on the model's own device instead of relying on a
    module-level ``device`` variable.
    """
    corpus = tuple(all_chunks)
    k = min(k, len(corpus))
    if k <= 0:
        return []

    # Reuse the corpus embeddings when the same model is queried with an
    # identical corpus; the tuple comparison also detects in-place edits of
    # the caller's list.
    cached = _CHUNK_EMBEDDING_CACHE.get("last")
    if cached is not None and cached[0] is model and cached[1] == corpus:
        chunk_embs = cached[2]
    else:
        chunk_embs = model.encode(list(corpus), convert_to_tensor=True, normalize_embeddings=True)
        _CHUNK_EMBEDDING_CACHE["last"] = (model, corpus, chunk_embs)

    question_emb = model.encode(question, convert_to_tensor=True, normalize_embeddings=True)
    # (1, dim) against (n, dim) broadcasts to one cosine score per chunk.
    scores = torch.nn.functional.cosine_similarity(question_emb.unsqueeze(0), chunk_embs, dim=1)
    top_k_idx = torch.topk(scores, k=k).indices.tolist()
    return [corpus[i] for i in top_k_idx]

# 🧠 Fonction : prompt engineering

def build_prompt_mistral(question, chunks, style="complet"):
    """Assemble the French prompt sent to the model.

    Every chunk is stripped and numbered from 1 (``[1] ...``), and the
    numbered chunks are separated by a blank line. ``style="court"`` selects
    the short/factual instruction; any other value selects the
    clear-and-complete instruction.

    Returns the prompt text: instruction, context section, question, and a
    trailing ``Réponse :`` cue.
    """
    numbered_chunks = []
    for position, chunk in enumerate(chunks, start=1):
        numbered_chunks.append(f"[{position}] {chunk.strip()}")

    instruction = (
        "Réponds de manière courte et factuelle à la question suivante, uniquement en utilisant les informations ci-dessous. Si la réponse ne peut pas être trouvée, dis 'Je ne sais pas'."
        if style == "court"
        else "Réponds de manière claire et complète à la question suivante, uniquement en utilisant les informations ci-dessous. Ne fais pas d'invention."
    )

    # Empty entries produce the blank lines between the prompt sections.
    sections = (
        instruction,
        "",
        "Contextes :",
        "\n\n".join(numbered_chunks),
        "",
        f"Question : {question}",
        "",
        "Réponse :",
    )
    return "\n".join(sections)

# 🚀 Fonction : appel API Mistral

def call_mistral(prompt, model="mistral-small", temperature=0.2, max_tokens=512):
    """Send ``prompt`` to the chat-completions endpoint and return the reply text.

    Uses the module-level ``client``. The request carries a fixed system
    message followed by ``prompt`` as the user message.

    Args:
        prompt: User message content.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion length limit passed to the API.

    Returns:
        The stripped text of the first choice, or an empty string when the
        API returns no text content for it.

    Exceptions raised by the client call are not caught here.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Tu es un assistant expert."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    # `content` is optional in the response schema; calling .strip() on None
    # would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# Evaluation sample (fixed seed for reproducibility) and the retrieval corpus,
# which is built from every chunk of `df`, not only from the sampled rows.
sampled_df = df.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
all_chunks = df["chunk"].fillna("").astype(str).tolist()

# Generation: retrieve, build the prompt, query the model.
results = []
for _, row in sampled_df.iterrows():
    question = row["question"]
    top_chunks = get_top_k_chunks(question, all_chunks, retrieval_model, k=top_k)
    prompt = build_prompt_mistral(question, top_chunks, style=prompt_style)
    error = None
    try:
        response = call_mistral(prompt, model=mistral_model)
    except Exception as e:
        # Best effort: one failed API call must not abort the whole run.
        # The failure goes into its own column and the answer stays empty,
        # so downstream metrics (which skip missing answers) do not score an
        # error message as if it were a model answer.
        response = None
        error = str(e)
        print(f"[ERREUR] {e}")

    results.append({
        "question": question,
        "expected_short_answer": row["short_answer"],
        "expected_answer": row["generated_answer"],
        "mistral_answer": response,
        "error": error,
    })

# Save to CSV.
results_df = pd.DataFrame(results)
results_df.to_csv("mistral_eval_sample.csv", index=False)
print("✅ Génération terminée. Résultats dans 'mistral_eval_sample.csv'")

# bertscore

In [None]:
!pip install -q openai --upgrade
!pip install -q sentence-transformers

import torch
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI

import os  # local to this cell: only used to read the API key from the environment

# Parameters
embedding_file = "/content/embeddings_bge_v15.pt"
data_file = "/content/retrieval_data_bge_v15.pkl"
# Prefer the MISTRAL_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook; the placeholder stays as fallback.
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "your-mistral-api-key")
mistral_model = "mistral-small"
top_k = 10
sample_frac = 0.1
prompt_style = "complet"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the precomputed embeddings and the retrieval DataFrame.
# NOTE(review): `embeddings` is not referenced again in the visible rest of
# this cell — confirm whether it is still needed.
embeddings = torch.load(embedding_file, map_location=device)
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a .pkl produced by a trusted run of this notebook.
with open(data_file, "rb") as f:
    df = pickle.load(f)

# OpenAI-compatible client pointed at the Mistral API.
client = OpenAI(
    base_url="https://api.mistral.ai/v1",
    api_key=mistral_api_key
)

# Embedding model used for retrieval.
retrieval_model = SentenceTransformer("BAAI/bge-base-en-v1.5", device=device)

# 🔎 Fonction : top-k chunks

# Embeddings of the most recently used (model, corpus) pair, so that repeated
# queries against the same chunks do not re-encode the whole corpus each call.
_CHUNK_EMBEDDING_CACHE = {}


def get_top_k_chunks(question, all_chunks, model, k=10):
    """Return the ``k`` chunks most similar to ``question``, best match first.

    Args:
        question: Query text.
        all_chunks: Sequence of chunk texts to search.
        model: Object exposing a SentenceTransformer-style
            ``encode(texts, convert_to_tensor=..., normalize_embeddings=...)``.
        k: Maximum number of chunks to return; clamped to ``len(all_chunks)``
            because ``torch.topk`` raises when ``k`` exceeds the input size.

    Returns:
        A list of at most ``k`` chunks ordered by decreasing cosine
        similarity; an empty list when there are no chunks or ``k <= 0``.

    Encoding runs on the model's own device instead of relying on a
    module-level ``device`` variable.
    """
    corpus = tuple(all_chunks)
    k = min(k, len(corpus))
    if k <= 0:
        return []

    # Reuse the corpus embeddings when the same model is queried with an
    # identical corpus; the tuple comparison also detects in-place edits of
    # the caller's list.
    cached = _CHUNK_EMBEDDING_CACHE.get("last")
    if cached is not None and cached[0] is model and cached[1] == corpus:
        chunk_embs = cached[2]
    else:
        chunk_embs = model.encode(list(corpus), convert_to_tensor=True, normalize_embeddings=True)
        _CHUNK_EMBEDDING_CACHE["last"] = (model, corpus, chunk_embs)

    question_emb = model.encode(question, convert_to_tensor=True, normalize_embeddings=True)
    # (1, dim) against (n, dim) broadcasts to one cosine score per chunk.
    scores = torch.nn.functional.cosine_similarity(question_emb.unsqueeze(0), chunk_embs, dim=1)
    top_k_idx = torch.topk(scores, k=k).indices.tolist()
    return [corpus[i] for i in top_k_idx]

# 🧠 Fonction : prompt engineering

def build_prompt_mistral(question, chunks, style="complet"):
    """Assemble the French prompt sent to the model.

    Every chunk is stripped and numbered from 1 (``[1] ...``), and the
    numbered chunks are separated by a blank line. ``style="court"`` selects
    the short/factual instruction; any other value selects the
    clear-and-complete instruction.

    Returns the prompt text: instruction, context section, question, and a
    trailing ``Réponse :`` cue.
    """
    numbered_chunks = []
    for position, chunk in enumerate(chunks, start=1):
        numbered_chunks.append(f"[{position}] {chunk.strip()}")

    instruction = (
        "Réponds de manière courte et factuelle à la question suivante, uniquement en utilisant les informations ci-dessous. Si la réponse ne peut pas être trouvée, dis 'Je ne sais pas'."
        if style == "court"
        else "Réponds de manière claire et complète à la question suivante, uniquement en utilisant les informations ci-dessous. Ne fais pas d'invention."
    )

    # Empty entries produce the blank lines between the prompt sections.
    sections = (
        instruction,
        "",
        "Contextes :",
        "\n\n".join(numbered_chunks),
        "",
        f"Question : {question}",
        "",
        "Réponse :",
    )
    return "\n".join(sections)

# 🚀 Fonction : appel API Mistral

def call_mistral(prompt, model="mistral-small", temperature=0.2, max_tokens=512):
    """Send ``prompt`` to the chat-completions endpoint and return the reply text.

    Uses the module-level ``client``. The request carries a fixed system
    message followed by ``prompt`` as the user message.

    Args:
        prompt: User message content.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion length limit passed to the API.

    Returns:
        The stripped text of the first choice, or an empty string when the
        API returns no text content for it.

    Exceptions raised by the client call are not caught here.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Tu es un assistant expert."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    # `content` is optional in the response schema; calling .strip() on None
    # would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# Evaluation sample (fixed seed for reproducibility) and the retrieval corpus,
# which is built from every chunk of `df`, not only from the sampled rows.
sampled_df = df.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
all_chunks = df["chunk"].fillna("").astype(str).tolist()

# Generation: retrieve, build the prompt, query the model.
results = []
for _, row in sampled_df.iterrows():
    question = row["question"]
    top_chunks = get_top_k_chunks(question, all_chunks, retrieval_model, k=top_k)
    prompt = build_prompt_mistral(question, top_chunks, style=prompt_style)
    error = None
    try:
        response = call_mistral(prompt, model=mistral_model)
    except Exception as e:
        # Best effort: one failed API call must not abort the whole run.
        # The failure goes into its own column and the answer stays empty,
        # so downstream metrics (which skip missing answers) do not score an
        # error message as if it were a model answer.
        response = None
        error = str(e)
        print(f"[ERREUR] {e}")

    results.append({
        "question": question,
        "expected_short_answer": row["short_answer"],
        "expected_answer": row["generated_answer"],
        "mistral_answer": response,
        "error": error,
    })

# Save to CSV.
results_df = pd.DataFrame(results)
results_df.to_csv("mistral_eval_sample.csv", index=False)
print("✅ Génération terminée. Résultats dans 'mistral_eval_sample.csv'")