In [2]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import Document

# LLM
from llama_index.llms import Anthropic

# Embeddings
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding, CohereEmbedding

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

# Rerankers
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore
# from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator

from typing import List
import pandas as pd

import nest_asyncio

nest_asyncio.apply()

In [3]:
from datasets import load_dataset

# Load the "sciq" dataset; the following cells read the "support" and
# "question" fields from the rows of its "train" split.
dataset = load_dataset("sciq")

In [4]:
# Parser used both to filter passages here and to build the index nodes later.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

# Collect up to 500 (support passage, question) pairs whose passage is
# non-empty and is parsed into exactly one node, so that every corpus entry
# corresponds to a single node.
corpus, filtered_queries = [], []
counter = 0
for train_row in dataset["train"]:
    support_text = train_row["support"]
    # Skip rows whose support passage is empty or whitespace-only.
    if not support_text.strip():
        continue

    parsed_nodes = node_parser.get_nodes_from_documents([Document(text=support_text)])
    if len(parsed_nodes) != 1:
        continue

    corpus.append(support_text)
    filtered_queries.append(train_row["question"])
    counter += 1
    # `counter` only changes here, so this is the only place the cap can be hit.
    if counter == 500:
        break

In [5]:
# Wrap every kept passage in a Document and parse the whole batch into nodes.
documents = list(map(lambda passage: Document(text=passage), corpus))
nodes = node_parser.get_nodes_from_documents(documents)

# Give each node a positional id of the form "corpus_<n>".
for idx in range(len(nodes)):
    nodes[idx].id_ = f"corpus_{idx}"

In [6]:
# Build the three id-keyed mappings in one pass over the `counter` kept rows:
#   queries_dict:       "query_<n>"  -> question text
#   corpus_dict:        "corpus_<n>" -> support passage
#   relevant_docs_dict: "query_<n>"  -> ["corpus_<n>"] (the passage at the same position)
queries_dict, corpus_dict, relevant_docs_dict = {}, {}, {}
for position in range(counter):
    query_id = f"query_{position}"
    corpus_id = f"corpus_{position}"
    queries_dict[query_id] = filtered_queries[position]
    corpus_dict[corpus_id] = corpus[position]
    relevant_docs_dict[query_id] = [corpus_id]

In [7]:
# Bundle the queries, the corpus and the query -> relevant-passage mapping
# into the dataset object passed to the retriever evaluator.
qa_fields = {
    "queries": queries_dict,
    "corpus": corpus_dict,
    "relevant_docs": relevant_docs_dict,
}
qa_dataset = EmbeddingQAFinetuneDataset(**qa_fields)

In [9]:
# Registry of embedding models to compare, keyed by the label shown in the results.
EMBEDDINGS = {}
# You can use mean pooling by adding the pooling='mean' parameter.
EMBEDDINGS["bge-large"] = HuggingFaceEmbedding(model_name='BAAI/bge-large-en')
EMBEDDINGS["JinaAI-Small"] = HuggingFaceEmbedding(
    model_name='jinaai/jina-embeddings-v2-small-en',
    pooling='mean',
    trust_remote_code=True,
)
EMBEDDINGS["JinaAI-Base"] = HuggingFaceEmbedding(
    model_name='jinaai/jina-embeddings-v2-base-en',
    pooling='mean',
    trust_remote_code=True,
)

# Registry of rerankers to compare. The string "None" (not the None object)
# marks the no-reranker baseline; the evaluation cell compares against it.
RERANKERS = {}
RERANKERS["WithoutReranker"] = "None"
RERANKERS["bge-reranker-base"] = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base", top_n=5
)
RERANKERS["bge-reranker-large"] = SentenceTransformerRerank(
    model="BAAI/bge-reranker-large", top_n=5
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [10]:
def display_results(embedding_name, reranker_name, eval_results):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        embedding_name: Label of the embedding model, stored in the "Embedding" column.
        reranker_name: Label of the reranker, stored in the "Reranker" column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains the "hit_rate" and "mrr" keys.

    Returns:
        A DataFrame with columns "Embedding", "Reranker", "hit_rate" and "mrr"
        holding the mean of each metric over ``eval_results``. When
        ``eval_results`` is empty both metrics are NaN.
    """
    metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

    if metric_dicts:
        full_df = pd.DataFrame(metric_dicts)
        hit_rate = full_df["hit_rate"].mean()
        mrr = full_df["mrr"].mean()
    else:
        # An empty frame has no metric columns, so indexing it would raise
        # KeyError; report the means as undefined instead.
        hit_rate = mrr = float("nan")

    metric_df = pd.DataFrame(
        {
            "Embedding": [embedding_name],
            "Reranker": [reranker_name],
            "hit_rate": [hit_rate],
            "mrr": [mrr],
        }
    )

    return metric_df

In [11]:
results_df = pd.DataFrame()

# Loop over embeddings
for embed_name, embed_model in EMBEDDINGS.items():

    print(f"Running Evaluation for Embedding Model: {embed_name}")

    service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
    vector_index = VectorStoreIndex(nodes, service_context=service_context)

    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, service_context=service_context)

    # Loop over rerankers
    for rerank_name, reranker in RERANKERS.items():

        print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")

        # Define Retriever
        class CustomRetriever(BaseRetriever):
            """Custom retriever that performs both Vector search and Knowledge Graph search"""

            def __init__(
                self,
                vector_retriever: VectorIndexRetriever,
            ) -> None:
                """Init params."""

                self._vector_retriever = vector_retriever

            def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Retrieve nodes given query."""

                retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

                if reranker != 'None':
                    retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
                else:
                    retrieved_nodes = retrieved_nodes[:5]

                return retrieved_nodes

            async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Asynchronously retrieve nodes given query.

                Implemented by the user.

                """
                return self._retrieve(query_bundle)

            async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
                if isinstance(str_or_query_bundle, str):
                    str_or_query_bundle = QueryBundle(str_or_query_bundle)
                return await self._aretrieve(str_or_query_bundle)

        custom_retriever = CustomRetriever(vector_retriever)

        retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"], retriever=custom_retriever
        )
        eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

        current_df = display_results(embed_name, rerank_name, eval_results)
        results_df = pd.concat([results_df, current_df], ignore_index=True)

Running Evaluation for Embedding Model: bge-large
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: bge-large and Reranker: WithoutReranker
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: JinaAI-Small
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: WithoutReranker
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: JinaAI-Base
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: JinaAI-Base and Reranker: WithoutReranker
Running Evaluation for Embedding Model: JinaAI-Base and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: JinaAI-Base and Reranker: bge-reranker-large

In [12]:
# Display final results: one row per (embedding, reranker) pair, holding the
# mean hit_rate and mean mrr returned by display_results for that pair.
print(results_df)

      Embedding            Reranker  hit_rate       mrr
0     bge-large     WithoutReranker     0.970  0.934400
1     bge-large   bge-reranker-base     0.970  0.938900
2     bge-large  bge-reranker-large     0.970  0.935333
3  JinaAI-Small     WithoutReranker     0.968  0.914100
4  JinaAI-Small   bge-reranker-base     0.968  0.938067
5  JinaAI-Small  bge-reranker-large     0.968  0.937833
6   JinaAI-Base     WithoutReranker     0.972  0.926133
7   JinaAI-Base   bge-reranker-base     0.972  0.941667
8   JinaAI-Base  bge-reranker-large     0.972  0.948000


In [None]:
#       Embedding            Reranker  hit_rate       mrr
# 0     bge-large     WithoutReranker     0.970  0.934400
# 1     bge-large   bge-reranker-base     0.970  0.938900
# 2     bge-large  bge-reranker-large     0.970  0.935333
# 3  JinaAI-Small     WithoutReranker     0.968  0.914100
# 4  JinaAI-Small   bge-reranker-base     0.968  0.938067
# 5  JinaAI-Small  bge-reranker-large     0.968  0.937833
# 6   JinaAI-Base     WithoutReranker     0.972  0.926133
# 7   JinaAI-Base   bge-reranker-base     0.972  0.941667
# 8   JinaAI-Base  bge-reranker-large     0.972  0.948000