In [None]:
import polars as pl
import glob
import os

from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter

from milvus_haystack import MilvusDocumentStore
from milvus_haystack.milvus_embedding_retriever import MilvusEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

In [None]:
df = pl.read_parquet('hf://datasets/AgentPublic/piaf/plain_text/train-00000-of-00001.parquet')

## Dataset Analysis

In [2]:
print(df.columns)

['id', 'title', 'context', 'question', 'answers']


In [3]:
df.describe()

statistic,id,title,context,question,answers
str,str,str,str,str,f64
"""count""","""3835""","""3835""","""3835""","""3835""",3835.0
"""null_count""","""0""","""0""","""0""","""0""",0.0
"""mean""",,,,,
"""std""",,,,,
"""min""","""p140295201616088""","""6 Heures de Shanghai 2017""","""2012 est sorti en 2012. Son th…","""A cause de qui Emanuele se voi…",
"""25%""",,,,,
"""50%""",,,,,
"""75%""",,,,,
"""max""","""p140295460357824""","""Événement Azolla""","""Étienne Báthory, roi de Pologn…","""à quelle ronde fénix est sorti…",


In [4]:
# Indiquer le nombre de valeur nulle dans chaque colonne
print(df.null_count())

# Indiquer le nombre de lignes
print(df.height)

shape: (1, 5)
┌─────┬───────┬─────────┬──────────┬─────────┐
│ id  ┆ title ┆ context ┆ question ┆ answers │
│ --- ┆ ---   ┆ ---     ┆ ---      ┆ ---     │
│ u32 ┆ u32   ┆ u32     ┆ u32      ┆ u32     │
╞═════╪═══════╪═════════╪══════════╪═════════╡
│ 0   ┆ 0     ┆ 0       ┆ 0        ┆ 0       │
└─────┴───────┴─────────┴──────────┴─────────┘
3835


In [5]:
# keep 2500 rows
df = df.head(2500)
train_df, val_df, test_df = df.random_split([0.7, 0.15, 0.15], seed=42)

In [6]:
from haystack import Document

# Create document lists for each dataset split
train_documents = [
    Document(
        content=row['context'],
        meta={'id': row['id'], 'title': row['title']}
    )
    for row in train_df.to_dicts()
]

val_documents = [
    Document(
        content=row['context'],
        meta={'id': row['id'], 'title': row['title']}
    )
    for row in val_df.to_dicts()
]

test_documents = [
    Document(
        content=row['context'],
        meta={'id': row['id'], 'title': row['title']}
    )
    for row in test_df.to_dicts()
]

# Use training documents for indexing
documents = train_documents

In [7]:
embedding_model_list = [
    "intfloat/multilingual-e5-large-instruct",
    "Lajavaness/bilingual-embedding-large",
    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1",
]

In [None]:
for i, model in enumerate(embedding_model_list):
    print(f"Model {i+1}: {model}")
    embedder = SentenceTransformersDocumentEmbedder(model = model, trust_remote_code=True)
    embedder.warm_up()

    document_store = MilvusDocumentStore(
        connection_args={"uri": "./milvus.db"},
        drop_old=True,
        collection_name=f"piaf_{i+1}"
    )
    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component("embedder", embedder)
    indexing_pipeline.add_component("writer", DocumentWriter(document_store))
    indexing_pipeline.connect("embedder", "writer")
    indexing_pipeline.run({"documents": documents})
    print(f"Indexed {len(documents)} documents with model {model}")

Model 1: intfloat/multilingual-e5-large-instruct


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Indexed 2500 documents with model intfloat/multilingual-e5-large-instruct
Model 2: Lajavaness/bilingual-embedding-large


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Indexed 2500 documents with model Lajavaness/bilingual-embedding-large
Model 3: HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Indexed 2500 documents with model HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1


In [8]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack import Pipeline

# 1. DocumentStore en mémoire avec BM25
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)

2500

In [9]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.evaluators import DocumentMAPEvaluator, DocumentMRREvaluator, DocumentRecallEvaluator, DocumentNDCGEvaluator
from haystack import Document, Pipeline


for i, model in enumerate(embedding_model_list):
    grounds_truth = []
    retrieval_results_list = []
    print(f"Model {i+1}: {model}")
    embedder = SentenceTransformersTextEmbedder(model=model, trust_remote_code=True, progress_bar=False)
    embedder.warm_up()

    document_store = MilvusDocumentStore(
        connection_args={"uri": "./milvus.db"},
        collection_name=f"piaf_{i+1}"
    )

    retrieval_pipeline = Pipeline()
    retrieval_pipeline.add_component("embedder", embedder)
    retrieval_pipeline.add_component("retriever", MilvusEmbeddingRetriever(document_store=document_store, top_k=3))
    retrieval_pipeline.connect("embedder", "retriever")

    for row in test_df.to_dicts():
        retrieval_results = retrieval_pipeline.run({"embedder": {"text": row["question"]}})
        grounds_truth.append([Document(
            content=row["context"],
            meta={
                "id": row["id"],
                "title": row["title"],
            }
        )])
        retrieval_result = retrieval_results["retriever"]["documents"]
        retrieval_results_list.append(retrieval_result)
    evaluator = Pipeline()
    mrr_evaluator = DocumentMRREvaluator()
    map_evaluator = DocumentMAPEvaluator()
    recall = DocumentRecallEvaluator()
    ndcg = DocumentNDCGEvaluator()
    evaluator.add_component("mrr_evaluator", mrr_evaluator)
    evaluator.add_component("map_evaluator", map_evaluator)
    evaluator.add_component("recall_evaluator", recall)
    evaluator.add_component("ndcg_evaluator", ndcg)
    score = evaluator.run({
        "mrr_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
        "map_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
        "recall_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
        "ndcg_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth}
    }
    )
    print(f"Score for model {model}: ")
    print(f"MRR: {score['mrr_evaluator']['score']}")
    print(f"MAP: {score['map_evaluator']['score']}")
    print(f"Recall: {score['recall_evaluator']['score']}")
    print(f"NDCG: {score['ndcg_evaluator']['score']}")

Model 1: intfloat/multilingual-e5-large-instruct


NameError: name 'MilvusDocumentStore' is not defined

In [None]:
retriever = InMemoryBM25Retriever(document_store=document_store)
retrieval_pipeline = Pipeline()
retrieval_pipeline.add_component("retriever", retriever)


grounds_truth = []
retrieval_results_list = []

for row in test_df.to_dicts():
    retrieval_results = retrieval_pipeline.run({"retriever": {"query": row["question"]}})
    grounds_truth.append([Document(
        content=row["context"],
        meta={
            "id": row["id"],
            "title": row["title"],
        }
    )])
    retrieval_result = retrieval_results["retriever"]["documents"]
    retrieval_results_list.append(retrieval_result)

In [None]:
evaluator = Pipeline()
mrr_evaluator = DocumentMRREvaluator()
map_evaluator = DocumentMAPEvaluator()
recall = DocumentRecallEvaluator()
ndcg = DocumentNDCGEvaluator()
evaluator.add_component("mrr_evaluator", mrr_evaluator)
evaluator.add_component("map_evaluator", map_evaluator)
evaluator.add_component("recall_evaluator", recall)
evaluator.add_component("ndcg_evaluator", ndcg)
score = evaluator.run({
    "mrr_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
    "map_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
    "recall_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth},
    "ndcg_evaluator": {"retrieved_documents": retrieval_results_list, "ground_truth_documents": grounds_truth}
}
)
print(f"Score for model {model}: ")
print(f"MRR: {score['mrr_evaluator']['score']}")
print(f"MAP: {score['map_evaluator']['score']}")
print(f"Recall: {score['recall_evaluator']['score']}")
print(f"NDCG: {score['ndcg_evaluator']['score']}")