In [1]:
# Eval data link: https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/dataset_eval.zip
!curl -L -o ../data/dataset_eval.zip https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/dataset_eval.zip
!unzip -d ../data/eval ../data/dataset_eval.zip
!rm ../data/dataset_eval.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  134k  100  134k    0     0   170k      0 --:--:-- --:--:-- --:--:--  170k
Archive:  ../data/dataset_eval.zip
  inflating: ../data/eval/answers.json  
  inflating: ../data/eval/corpus.json  
  inflating: ../data/eval/errors.json  
  inflating: ../data/eval/queries.json  
  inflating: ../data/eval/relevant_docs.json  


# Create database

In [None]:
import sys

# Make the local package importable. The membership test keeps sys.path free
# of duplicate entries when the cell is executed more than once.
if "../src" not in sys.path:
    sys.path.append("../src")

from rag_alm_assistant.ingestion import full_ingestion_pipeline
from rag_alm_assistant.constants import DIC_DIR, VECTORSTORE_DIR, EMBEDDING_MODEL_NAME

# Locations used for this run, relative to the notebook's working directory.
# NOTE(review): DIC_DIR / VECTORSTORE_DIR are imported above but their values are
# not visible here; confirm whether they should replace these literals.
dic_dir_path = "../data/DIC"
vector_store_path = "../data/vector_store"

# Run the ingestion pipeline; it returns the vector store, the raw documents
# and the chunks, in that order.
vector_store, raw_docs, chunks = full_ingestion_pipeline(
    dic_dir=dic_dir_path,
    persist_directory=vector_store_path,
    model_name=EMBEDDING_MODEL_NAME,
)

print("Nb docs bruts :", len(raw_docs))
print("Nb chunks     :", len(chunks))
# Report the directory that was actually passed as persist_directory, not the
# package constant, so the printed path always matches this run.
print("Vector store  :", vector_store_path)

# Evaluation without reranker

In [None]:
import json
from pathlib import Path
import sys

# Make the local package importable without stacking duplicate sys.path
# entries on re-runs.
if "../src" not in sys.path:
    sys.path.append("../src")
from rag_alm_assistant.orchestrator import RAGOrchestrator

from bert_score import score

EVAL_DIR = Path("../data/eval")

# Evaluation set: three JSON files, each indexed below by query uuid.
with open(EVAL_DIR / "queries.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

with open(EVAL_DIR / "answers.json", "r", encoding="utf-8") as f:
    gold_answers = json.load(f)

# Loaded for completeness; the answer-level evaluation below does not use it.
with open(EVAL_DIR / "relevant_docs.json", "r", encoding="utf-8") as f:
    gold_sources = json.load(f)

uuids = list(queries.keys())

# Fail fast: detect queries without a reference answer before spending time on
# generation, instead of hitting a KeyError in the middle of the loop.
missing_answers = [uid for uid in uuids if uid not in gold_answers]
if missing_answers:
    raise KeyError(
        f"{len(missing_answers)} query uuid(s) have no gold answer, e.g. {missing_answers[0]}"
    )

orchestrator = RAGOrchestrator(use_reranker=False, use_memory=False)

preds = []  # answers produced by our pipeline
refs = []   # reference (gold) answers, aligned index-by-index with preds

for uid in uuids:
    # ask() returns (answer, sources); only the answer is scored here.
    pred_answer, _pred_sources = orchestrator.ask(queries[uid])

    preds.append(pred_answer)
    refs.append(gold_answers[uid])

  embeddings = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[2025-11-14 09:21:10] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Quel est l'objectif de gestion du FCP décrit dans le document ?


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


[2025-11-14 09:21:16] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Quels critères sont pris en compte dans les décisions d'investissement du FCP ?
[2025-11-14 09:21:19] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: A quoi vise la prise en compte des critères ESG dans la sélection de titres du FCP ?
[2025-11-14 09:21:23] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Quels sont les scénarios mentionnés pour un investissement utilisant un mandataire approprié ?
[2025-11-14 09:21:30] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Quelles sont les dates associées à chaque scénario pour un investissement utilisant un mandataire approprié ?
[2025-11-14 09:21:38] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Que se passe-t-il si Amundi Luxembourg S.A. n’est pas en mesure d’effectuer les versements ?
[2

In [None]:
# Compute BERTScore for the pipeline answers (`preds`) against the gold
# answers (`refs`); the three returned values are unpacked as P, R and F1.
P, R, F1 = score(preds, refs, lang="fr", rescale_with_baseline=True)

# Average F1 over all answers and print it as a fraction and as a percentage.
f1_mean = F1.mean().item()
print(f"Mean BERTScore F1: {f1_mean:.4f}  ({f1_mean*100:.2f}%)")

In [4]:
# Sanity check of how BERTScore behaves on toy sentences.
# The first case uses dedicated variable names so that this cell does not
# overwrite the `preds`, `refs`, `P`, `R`, `F1` globals produced by the
# evaluation cells.
from bert_score import score

# 1) identical sentences => score close to 1
refs_same = ["L’OPCVM est un fonds d’investissement collectif en valeurs mobilières."]
preds_same = ["L’OPCVM est un fonds d’investissement collectif en valeurs mobilières."]

P_same, R_same, F1_same = score(preds_same, refs_same, lang="fr", rescale_with_baseline=True)
print("Identiques F1:", F1_same.mean().item())

# 2) similar sentences => high score
refs2 = ["L’OPCVM permet un investissement collectif sur les marchés financiers."]
preds2 = ["Un OPCVM est un produit qui permet d’investir collectivement en titres financiers."]

P2, R2, F12 = score(preds2, refs2, lang="fr", rescale_with_baseline=True)
print("Semblables F1:", F12.mean().item())

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Identiques F1: 1.0




Semblables F1: 0.581328809261322


In [5]:
## Check few samples of answers:
import random

# Draw up to 5 query uuids with a dedicated, seeded generator: the same
# samples are shown on every run and the global `random` state is untouched.
# min() avoids the ValueError random sampling raises when fewer than 5
# queries are available.
sample_uuids = random.Random(42).sample(list(queries.keys()), min(5, len(queries)))

for uid in sample_uuids:
    q = queries[uid]
    ref_answer = gold_answers[uid]
    # Re-ask the pipeline so the answer is shown together with its sources.
    pred_answer, pred_sources = orchestrator.ask(q)

    print("==== UID:", uid, "====")
    print("Question :", q)
    print("\nGold answer :\n", ref_answer)
    print("\nPred answer :\n", pred_answer)
    print("\nSources prédictes :", pred_sources)
    print("\n" + "="*80 + "\n")

[2025-11-14 09:02:13] [INFO] [rag_alm_assistant.orchestrator] Generate answer without memory for question: Quelle est l'estimation des coûts de transaction pour ce produit ?
==== UID: 2639266c-e2da-41ab-a706-509415cc4ce6 ====
Question : Quelle est l'estimation des coûts de transaction pour ce produit ?

Gold answer :
 0,36 % de la valeur de votre investissement par an. Il s’agit d’une estimation des coûts d’achat et de vente

Pred answer :
 0,00% de la valeur de votre investissement par an. Il s’agit d’une estimation des coûts encourus lorsque nous achetons et vendons les investissements sous-jacents pour le produit. Le montant réel variera en fonction du volume de nos achats et ventes. 0,00 EUR

Sources prédictes : [{'dic_name': 'KIDPRIIPs_435039_79602_FRA_FRA_20231011.pdf', 'page': 3}, {'dic_name': 'FR0007050570_DIC_FR_20230630.pdf', 'page': 3}, {'dic_name': 'KIDPRIIPs_434909_79585_FRA_FRA_20231011.pdf', 'page': 3}, {'dic_name': 'kid-priips-qs0002908281-fra-fra.pdf', 'page': 3}, {'di

# Evaluation with reranker

In [None]:
import json
from pathlib import Path
import sys

# Make the local package importable without stacking duplicate sys.path
# entries on re-runs.
if "../src" not in sys.path:
    sys.path.append("../src")
from rag_alm_assistant.orchestrator import RAGOrchestrator

from bert_score import score

EVAL_DIR = Path("../data/eval")

# Evaluation set: three JSON files, each indexed below by query uuid.
with open(EVAL_DIR / "queries.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

with open(EVAL_DIR / "answers.json", "r", encoding="utf-8") as f:
    gold_answers = json.load(f)

# Loaded for completeness; the answer-level evaluation below does not use it.
with open(EVAL_DIR / "relevant_docs.json", "r", encoding="utf-8") as f:
    gold_sources = json.load(f)

uuids = list(queries.keys())

# Fail fast: detect queries without a reference answer before spending time on
# generation, instead of hitting a KeyError in the middle of the loop.
missing_answers = [uid for uid in uuids if uid not in gold_answers]
if missing_answers:
    raise KeyError(
        f"{len(missing_answers)} query uuid(s) have no gold answer, e.g. {missing_answers[0]}"
    )

# Same setup as the evaluation without reranker, except use_reranker=True.
orchestrator = RAGOrchestrator(use_reranker=True, use_memory=False)

preds = []  # answers produced by our pipeline
refs = []   # reference (gold) answers, aligned index-by-index with preds

for uid in uuids:
    # ask() returns (answer, sources); only the answer is scored here.
    pred_answer, _pred_sources = orchestrator.ask(queries[uid])

    preds.append(pred_answer)
    refs.append(gold_answers[uid])

In [None]:
# Compute BERTScore for the reranker run.
# No `model_type` is passed, exactly as in the evaluation without reranker:
# both runs must be scored with the same model, otherwise their F1 means are
# not comparable.
# NOTE(review): the previous value, "mistralai/Mistral-7B-Instruct-v0.3", does
# not appear to be among the models bert_score provides a default layer and a
# rescaling baseline for — confirm before reintroducing a custom model_type.
P, R, F1 = score(
    cands=preds,
    refs=refs,
    lang="fr",
    rescale_with_baseline=True
)

# Average F1 over all answers and print it as a fraction and as a percentage.
f1_mean = F1.mean().item()
print(f"Mean BERTScore F1: {f1_mean:.4f}  ({f1_mean*100:.2f}%)")