This evaluation utilize the MS Macro v1.1 Question Answering datasets (https://microsoft.github.io/msmarco/) to evaluate the quality of the model, and compare it fairly with OpenAI's text-embedding model.

In each item of the dataset, there are several passages and 1 question, and the model is required to select the most relevant passage to the question. The dataset provides the groundtruth relevant passage(s) for each question, and the model is evaluated by the retrieval precision of dot product of query embedding and passage embeddings.

In [1]:
import json
import os
import random
from typing import Annotated, List
from tqdm import tqdm

import bittensor as bt
import openai
import torch
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel

from datasets import load_dataset
from openkaito.protocol import TextEmbeddingSynapse
from openkaito.utils.embeddings import openai_embeddings_tensor
from openkaito.utils.version import get_version


In [2]:
subtensor_network = "finney"
validator_wallet_name = "openkaito_dev"
validator_hotkey_name = "openkaito_dev_hotkey"
netuid = 5

In [3]:
subtensor = bt.subtensor(network=subtensor_network)
wallet = bt.wallet(name=validator_wallet_name, hotkey=validator_hotkey_name)
metagraph = subtensor.metagraph(netuid=netuid)
metagraph.sync(subtensor=subtensor)
dendrite = bt.dendrite(wallet=wallet)



In [4]:

load_dotenv()

# for ranking results evaluation
llm_client = openai.OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.getenv("OPENAI_ORGANIZATION"),
    project=os.getenv("OPENAI_PROJECT"),
    max_retries=3,
)

In [5]:
ds = load_dataset("microsoft/ms_marco", "v1.1", split="test")

In [7]:
async def get_miners_embeddings(
    miner_uids: List[int],
    texts: List[str],
    dimensions: int = 512,
    timeout: int = 60,
):
    synapse = TextEmbeddingSynapse(
        texts=texts,
        dimensions=dimensions,
        normalized=True,
        timeout=timeout,
        version=get_version(),
    )
    responses = await dendrite(
        axons=[metagraph.axons[uid] for uid in miner_uids],
        synapse=synapse,
        deserialize=True,
        timeout=synapse.timeout,
    )
    return responses


async def get_miner_embeddings(
    miner_uid: int,
    texts: List[str],
    dimensions: int = 512,
    timeout: int = 60,
):
    return (await get_miners_embeddings([miner_uid], texts, dimensions, timeout))[0]

In [8]:
# top miners somehow block most of the requests, need to get around this by composing the requests
def build_miner_texts(query, passages):
    return [query + "?"] * len(passages["is_selected"]) + passages["passage_text"]

In [17]:

total_samples = ds.num_rows
print(f"Total samples: {total_samples}")

for i, row in enumerate(ds):
    print("====================")
    print("query:", row["query"])
    print("passages:", row["passages"])
    if i > 5:
        break

Total samples: 9650
query: does human hair stop squirrels
passages: {'is_selected': [0, 0, 1, 0, 0, 0, 0], 'passage_text': ['We have been feeding our back yard squirrels for the fall and winter and we noticed that a few of them have missing fur. One has a patch missing down his back and under both arms. Also another has some missing on his whole chest. They are all eating and seem to have a good appetite.', 'Critters cannot stand the smell of human hair, so sprinkling a barrier of hair clippings around your garden, or lightly working it into the soil when you plant bulbs, apparently does have some merit. The whole thing kind of makes me laugh. It never occurred to me that we are the ones that stink.', "Spread some human hair around your vegetable and flower gardens. This will scare the squirrels away because humans are predators of squirrels. It is better if the hair hasn't been washed so the squirrels will easily pick up the human scent.", '1 You can sprinkle blood meal around your ga

In [9]:
metagraph.sync(subtensor=subtensor)

In [10]:
def topk_incentive_uids(metagraph, k: int) -> List[int]:
    miners_uids = metagraph.uids.tolist()

    # Builds a dictionary of uids and their corresponding incentives
    all_miners_incentives = {
        "miners_uids": miners_uids,
        "incentives": list(map(lambda uid: metagraph.I[uid], miners_uids)),
    }

    # Zip the uids and their corresponding incentives into a list of tuples
    uid_incentive_pairs = list(
        zip(all_miners_incentives["miners_uids"], all_miners_incentives["incentives"])
    )

    # Sort the list of tuples by the incentive value in descending order
    uid_incentive_pairs_sorted = sorted(
        uid_incentive_pairs, key=lambda x: x[1], reverse=True
    )

    logger.info(f"Top {k} uids with highest incentives: {uid_incentive_pairs_sorted}")
    top_k_uids = [uid for uid, incentive in uid_incentive_pairs_sorted[:k]]

    return top_k_uids

In [11]:
top_miner_uid = topk_incentive_uids(metagraph, 1)[0]
print(f"Top miner uid: {top_miner_uid}")

2024-10-29 13:16:23.819 | INFO     | __main__:topk_incentive_uids:20 - Top 1 uids with highest incentives: [(49, 0.00483711), (181, 0.004806592), (55, 0.004791333), (35, 0.004745556), (17, 0.004715038), (58, 0.0046997787), (233, 0.0046997787), (56, 0.00468452), (91, 0.00468452), (150, 0.00468452), (229, 0.00468452), (66, 0.0046692607), (78, 0.0046692607), (118, 0.0046692607), (129, 0.0046692607), (185, 0.0046692607), (100, 0.004654002), (126, 0.004654002), (198, 0.004654002), (251, 0.004654002), (30, 0.0046387427), (60, 0.0046387427), (71, 0.0046387427), (112, 0.0046387427), (192, 0.0046387427), (249, 0.0046387427), (133, 0.004623484), (240, 0.004623484), (248, 0.004623484), (74, 0.0046082246), (84, 0.0046082246), (98, 0.0046082246), (99, 0.0046082246), (237, 0.0046082246), (247, 0.0046082246), (19, 0.004592966), (34, 0.004592966), (134, 0.004592966), (147, 0.004592966), (156, 0.004592966), (163, 0.004592966), (167, 0.004592966), (169, 0.004592966), (184, 0.004592966), (190, 0.00459296

Top miner uid: 49


In [21]:
miner_uid = top_miner_uid

miner_precisions = []
miner_recalls = []

openai_precisions = []


report_interval = 20

for i, row in enumerate(ds):
    query = row["query"]
    passages = row["passages"]
    num_passages = len(passages["is_selected"])
    if sum(passages["is_selected"]) == 0:
        logger.trace(f"{i}: Query {query} has no positive passage, skipping")
        continue
    # positive_idx = passages["is_selected"].index(1)
    selected_indices = (
        torch.tensor(passages["is_selected"], dtype=torch.long).nonzero().squeeze()
    )
    # print(selected_indices)
    # print(query)
    # print(passages)
    texts = build_miner_texts(query, passages)
    # print(len(texts))
    miner_embeddings = await get_miner_embeddings(
        miner_uid, texts, dimensions=512, timeout=60
    )
    if not miner_embeddings:
        logger.trace(
            f"{i}: Miner {miner_uid} failed to return embeddings for query: {query}"
        )
        continue
    miner_embeddings = torch.tensor(miner_embeddings)

    miner_query_embeddings = miner_embeddings[0].unsqueeze(0)
    miner_passage_embeddings = miner_embeddings[num_passages:]

    miner_top1_prediction = (
        (miner_query_embeddings @ miner_passage_embeddings.T).argmax().item()
    )
    miner_precision = miner_top1_prediction in selected_indices
    miner_precisions.append(miner_precision)
    # print("miner_precision:", miner_precision)
    # break

    openai_texts = [query + "?"] + passages["passage_text"]
    openai_embeddings = openai_embeddings_tensor(
        llm_client,
        openai_texts,
        dimensions=512,
        model="text-embedding-3-small",
    )
    openai_query_embeddings = openai_embeddings[0].unsqueeze(0)
    openai_passage_embeddings = openai_embeddings[1:]

    openai_top1_prediction = (
        (openai_query_embeddings @ openai_passage_embeddings.T).argmax().item()
    )
    openai_precision = openai_top1_prediction in selected_indices
    openai_precisions.append(openai_precision)

    if i % report_interval == 0:
        print("i:", i)
        print(
            f"Miner avg retrieval precision: {sum(miner_precisions) / len(miner_precisions)}"
        )
        print(
            f"OpenAI avg retrieval precision: {sum(openai_precisions) / len(openai_precisions)}"
        )


i: 0
Miner avg retrieval precision: 1.0
OpenAI avg retrieval precision: 1.0


i: 20
Miner avg retrieval precision: 0.5882352941176471
OpenAI avg retrieval precision: 0.4117647058823529
i: 60
Miner avg retrieval precision: 0.40384615384615385
OpenAI avg retrieval precision: 0.3076923076923077
i: 80
Miner avg retrieval precision: 0.4264705882352941
OpenAI avg retrieval precision: 0.38235294117647056
i: 100
Miner avg retrieval precision: 0.4470588235294118
OpenAI avg retrieval precision: 0.4117647058823529
i: 120
Miner avg retrieval precision: 0.43137254901960786
OpenAI avg retrieval precision: 0.39215686274509803
i: 140
Miner avg retrieval precision: 0.47058823529411764
OpenAI avg retrieval precision: 0.40336134453781514
i: 180
Miner avg retrieval precision: 0.4444444444444444
OpenAI avg retrieval precision: 0.38562091503267976
i: 200
Miner avg retrieval precision: 0.4294117647058823
OpenAI avg retrieval precision: 0.38823529411764707
i: 220
Miner avg retrieval precision: 0.41935483870967744
OpenAI avg retrieval precision: 0.3817204301075269
i: 240
Miner avg retri

In [22]:
print(f"Miner avg retrieval precision: {sum(miner_precisions) / len(miner_precisions)}")
print(
    f"OpenAI avg retrieval precision: {sum(openai_precisions) / len(openai_precisions)}"
)

Miner avg retrieval precision: 0.36318348950302454
OpenAI avg retrieval precision: 0.3470525441821848
