In [None]:
# Eval data link: https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/dataset_eval.zip
!curl -L -o ../data/dataset_eval.zip https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/dataset_eval.zip
!unzip -d ../data/eval ../data/dataset_eval.zip
!rm ../data/dataset_eval.zip

# Create database

In [None]:
import sys

# Make the local package importable without installing it. The membership check
# keeps sys.path from accumulating duplicates when the cell is re-run.
if "../src" not in sys.path:
    sys.path.append("../src")

from rag_alm_assistant.ingestion import full_ingestion_pipeline
from rag_alm_assistant.constants import DIC_DIR, VECTORSTORE_DIR, EMBEDDING_MODEL_NAME

# Paths handed to the ingestion pipeline for this notebook run.
# NOTE(review): DIC_DIR / VECTORSTORE_DIR are imported from the package constants but
# the pipeline is driven by these notebook-relative literals — confirm both point to the
# same locations, otherwise later cells may read a different vector store.
NOTEBOOK_DIC_DIR = "../data/DIC"
NOTEBOOK_VECTORSTORE_DIR = "../data/vector_store"

# Run the ingestion pipeline; it returns the vector store together with the raw
# documents and the chunks (both only used for the size report below).
vector_store, raw_docs, chunks = full_ingestion_pipeline(
    dic_dir=NOTEBOOK_DIC_DIR,
    persist_directory=NOTEBOOK_VECTORSTORE_DIR,
    model_name=EMBEDDING_MODEL_NAME,
)

print("Nb docs bruts :", len(raw_docs))
print("Nb chunks     :", len(chunks))
# Report the directory that was actually passed as persist_directory, not the
# package constant, so the printed path always matches what was written.
print("Vector store  :", NOTEBOOK_VECTORSTORE_DIR)

# Health check

In [None]:
# Check how BERTScore works
from bert_score import score

# 1) Identical sentences => F1 expected to be close to 1.
identical_refs = ["L’OPCVM est un fonds d’investissement collectif en valeurs mobilières."]
identical_preds = ["L’OPCVM est un fonds d’investissement collectif en valeurs mobilières."]

precision_same, recall_same, f1_same = score(identical_preds, identical_refs, lang="fr")
print("Identiques F1:", f1_same.mean().item())

# 2) Paraphrased sentences => F1 expected to be high.
# This call enables rescale_with_baseline while the one above does not, so the
# two printed values are not computed with the same settings.
similar_refs = ["L’OPCVM permet un investissement collectif sur les marchés financiers."]
similar_preds = ["Un OPCVM est un produit qui permet d’investir collectivement en titres financiers."]

precision_close, recall_close, f1_close = score(
    similar_preds,
    similar_refs,
    lang="fr",
    rescale_with_baseline=True,
)
print("Semblables F1:", f1_close.mean().item())

In [None]:
## Check a few sampled answers:
import json
from pathlib import Path
import sys
sys.path.append("../src")
from rag_alm_assistant.orchestrator import RAGOrchestrator
import random
from bert_score import score

# Number of queries to inspect and the seed that makes the selection reproducible
# across kernel restarts.
SAMPLE_SIZE = 20
SAMPLE_SEED = 42

EVAL_DIR = Path("../data/eval")
with open(EVAL_DIR / "queries.json", "r", encoding="utf-8") as f:
    queries = json.load(f)
with open(EVAL_DIR / "answers.json", "r", encoding="utf-8") as f:
    gold_answers = json.load(f)

orchestrator = RAGOrchestrator(use_reranker = True, use_memory= False, k_rerank = 5, k = 10)

# A dedicated seeded generator keeps the sample stable without touching the global
# random state. The size is capped at the number of available queries because
# random.sample raises ValueError when asked for more items than the population holds.
sampler = random.Random(SAMPLE_SEED)
sample_uuids = sampler.sample(list(queries.keys()), min(SAMPLE_SIZE, len(queries)))
preds = []  # answer from our pipeline
refs = []   # references

for uid in sample_uuids:
    q = queries[uid]
    ref_answer = gold_answers[uid]
    # orchestrator.ask returns the generated answer and the sources it relied on.
    pred_answer, pred_sources = orchestrator.ask(q)
    preds.append(pred_answer)
    refs.append(ref_answer)

    # Side-by-side dump for manual inspection of each sampled query.
    print("==== UID:", uid, "====")
    print("Question :", q)
    print("\nGold answer :\n", ref_answer)
    print("\nPred answer :\n", pred_answer)
    print("\nSources prédictes :", pred_sources)
    print("\n" + "="*80 + "\n")

# Compute BERTScore of the generated answers against the gold answers.
P, R, F1 = score(
    cands=preds,
    refs=refs,
    lang="fr"
)

f1_mean = F1.mean().item()
print(f"Mean BERTScore F1: {f1_mean:.4f}  ({f1_mean*100:.2f}%)")

# Evaluation without reranker

In [None]:
import json
from pathlib import Path
import sys
sys.path.append("../src")
from rag_alm_assistant.orchestrator import RAGOrchestrator

from bert_score import score

EVAL_DIR = Path("../data/eval")

# Evaluation fixtures: each file is a JSON object indexed by query uid.
queries = json.loads((EVAL_DIR / "queries.json").read_text(encoding="utf-8"))
gold_answers = json.loads((EVAL_DIR / "answers.json").read_text(encoding="utf-8"))
gold_sources = json.loads((EVAL_DIR / "relevant_docs.json").read_text(encoding="utf-8"))

uuids = list(queries)

# Pipeline under test: reranker and conversation memory both disabled.
orchestrator = RAGOrchestrator(use_reranker=False, use_memory=False)

preds, refs = [], []  # generated answers / gold answers, kept in the same order

for uid in uuids:
    question = queries[uid]
    expected_answer = gold_answers[uid]
    # Looked up for every uid but not used in the scoring done by this cell.
    expected_sources = gold_sources[uid]

    generated_answer, retrieved_sources = orchestrator.ask(question)

    preds.append(generated_answer)
    refs.append(expected_answer)

In [None]:
# Compute BERTScore for the run without reranker.
# Raw (non-rescaled) scores are used here, the same setting as the reranker
# evaluation and the sample check in this notebook. With rescale_with_baseline
# enabled in only one of the runs, the reported means would sit on different
# scales and the with/without-reranker comparison would not be meaningful.
P, R, F1 = score(
    cands=preds,
    refs=refs,
    lang="fr",
)

f1_mean = F1.mean().item()
print(f"Mean BERTScore F1: {f1_mean:.4f}  ({f1_mean*100:.2f}%)")

# Evaluation with reranker

In [None]:
import json
from pathlib import Path
import sys
sys.path.append("../src")
from rag_alm_assistant.orchestrator import RAGOrchestrator

from bert_score import score

EVAL_DIR = Path("../data/eval")

# Load the three evaluation files; the loop below indexes each of them by query uid.
with (EVAL_DIR / "queries.json").open("r", encoding="utf-8") as queries_file:
    queries = json.load(queries_file)

with (EVAL_DIR / "answers.json").open("r", encoding="utf-8") as answers_file:
    gold_answers = json.load(answers_file)

with (EVAL_DIR / "relevant_docs.json").open("r", encoding="utf-8") as sources_file:
    gold_sources = json.load(sources_file)

uuids = list(queries.keys())

# Pipeline under test: reranker enabled (k=10, k_rerank=5), conversation memory disabled.
orchestrator = RAGOrchestrator(
    use_reranker=True,
    use_memory=False,
    k_rerank=5,
    k=10,
)

preds = []  # answers produced by the pipeline
refs = []   # gold answers, aligned with preds

for uid, question in zip(uuids, (queries[key] for key in uuids)):
    gold_answer = gold_answers[uid]
    # Fetched for every uid but not consumed by the scoring in this notebook section.
    gold_source = gold_sources[uid]

    answer, sources = orchestrator.ask(question)

    preds.append(answer)
    refs.append(gold_answer)

In [13]:
# Compute BERTScore (precision, recall, F1) of the generated answers against the
# gold answers; no baseline rescaling is requested in this call.
P, R, F1 = score(cands=preds, refs=refs, lang="fr")

# Collapse the per-answer F1 values into one Python float for reporting.
f1_mean = F1.mean().item()
print(f"Mean BERTScore F1: {f1_mean:.4f}  ({f1_mean*100:.2f}%)")



Mean BERTScore F1: 0.7957  (79.57%)
