# Overview

This notebook demonstrates the high retrieval quality at small (128 byte) embedding vectors from the `snowflake-arctic-m-v1.5` model.

We begin by loading the full 768-dimensional embeddings in full float32 precision (these precomputed embeddings are made available as a Huggingface dataset). We then demonstrate proper truncation with renormalization to unit norm, plus uniform scalar quantization to int4 datatype.

## Int4 Quantization

TODO: Add equations

TODO: Add explation

In [1]:
!pip install -q numpy pandas torch numba pytrec-eval pyarrow tqdm huggingface-hub

In [2]:
import json
import logging
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
from pathlib import Path
from typing import cast

import huggingface_hub
import numba
import numpy as np
import pandas as pd
import pytrec_eval
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn.functional as F
from numpy.typing import NDArray
from tqdm.auto import tqdm

In [3]:
# Global config.
EMBEDDINGS_DATASET_ID = "Snowflake/mteb-retrieval-snowflake-arctic-embed-m-v1.5"
COLUMN_DOC_ID = "DOC_ID"
COLUMN_QUERY_ID = "QUERY_ID"
COLUMN_VECTOR = "VECTOR_MAIN"
SCALAR_QUANTIZATION_LIMIT = 0.18
TRUNCATION_DIM = 256

In [4]:
# Utility functions.


#### BEGIN LOADING EMBEDDINGS ####

def load_embeddings(
    pq_paths: list[Path],
    id_column_name: str,
    vector_column_name: str = COLUMN_VECTOR,
    truncate_dim: int | None = None,
    num_read_threads: int = 10,
) -> tuple[list[str], NDArray[np.float32]]:
    total_rows = _total_rows(pq_paths)
    vector_chunks = []
    ids = []
    with tqdm(total=total_rows, unit="row", desc="Loading embeddings from disk") as pbar, ThreadPool(num_read_threads) as pool:
        table_iter = pool.imap(pq.read_table, pq_paths)
        for table in table_iter:
            id_chunk = table[id_column_name].to_pylist()
            vector_chunk = _pa_vector_column_to_np_matrix(table[vector_column_name])
            if truncate_dim is not None:
                vector_chunk = truncate_embeddings(vector_chunk, truncate_dim)
            assert len(id_chunk) == vector_chunk.shape[0]
            ids.extend(id_chunk)
            vector_chunks.append(vector_chunk)
            pbar.update(len(id_chunk))
    return ids, np.row_stack(vector_chunks)

def _pa_vector_column_to_np_matrix(pa_array: pa.ChunkedArray) -> NDArray[np.float32]:
    embed_dim = len(pa_array[0])
    res = pa_array.combine_chunks().flatten().to_numpy().reshape(-1, embed_dim)
    return cast(NDArray[np.float32], res)


def _normalize_embeddings(embedings_matrix: NDArray[np.float32]) -> NDArray[np.float32]:
    """Normalize embeddings to unit norm along axis 1."""
    return cast(
        NDArray[np.float32], F.normalize(torch.tensor(embedings_matrix), dim=1).numpy()
    )

def truncate_embeddings(embedings_matrix: NDArray[np.float32], dim: int) -> NDArray[np.float32]:
    """Truncate and renomalize embeddings to lower dimensionality."""
    assert dim <= embedings_matrix.shape[1]
    return _normalize_embeddings(embedings_matrix[:, :dim])

def _total_rows(pq_paths: list[Path]) -> int:
    total = 0
    for p in pq_paths:
        with pq.ParquetFile(p) as pqf:
            total += pqf.metadata.num_rows
    return total


#### BEGIN 4BIT QUANTIZATION ####

@numba.njit(error_model="numpy", parallel=True)
def fast_4bit_uniform_scalar_quantize(
    emb_matrix: NDArray[np.float32], limit: float
) -> NDArray[np.uint8]:
    num_row, num_col = emb_matrix.shape
    assert num_col % 2 == 0
    assert limit > 0
    out = np.empty((num_row, num_col // 2), dtype=np.uint8)
    bin_width = 2 * limit / 15
    for i in numba.prange(num_row):
        row = emb_matrix[i, :]
        for out_j in range(num_col // 2):
            # Pull out two values at a time.
            in_j = out_j * 2
            value1 = row[in_j]
            value2 = row[in_j + 1]

            # 4-bit quantize the values.
            value1 = round(max(0, min(2 * limit, limit + value1)) / bin_width)
            value2 = round(max(0, min(2 * limit, limit + value2)) / bin_width)

            # Pack the values into a single uint8.
            value_packed = (value1 << 4) | value2
            out[i, out_j] = value_packed

    return out

def uint8_matmul(a: NDArray[np.uint8], b: NDArray[np.uint8], out=None) -> NDArray[np.int32]:
    """
    NOTE: A direct `a @ b` will cause integer overflow in datatype uint8.
    NOTE: `np.matmul(a, b, dtype=np.int32)` was ~4x slower than the `np.einsum` version on my machine.
    """
    n, d = a.shape
    d2, m = b.shape
    assert d2 == d
    # return np.matmul(a, b, dtype=np.int32)  # SLOW!
    return np.einsum("ik, kj -> ij", a, b, dtype=np.int32, out=out)

@numba.njit(error_model="numpy", parallel=True)
def parallel_uint4_as_uint8_matmul(a: NDArray[np.uint8], b: NDArray[np.uint8]) -> NDArray[np.int32]:
    """Optimized multi-threaded implementation of matmul between uint4-stored-in-uint8 values.

    NOTE: Loop order optimized for C-layout `a` and F-layout `b` (i.e. `a @ b.T` on C-layout `a` and `b`).
    """
    n, d = a.shape
    d2, m = b.shape
    assert d2 == d
    assert (a < 16).all(), "Large value will trigger multiplication overlfow"
    assert (b < 16).all(), "Large value will trigger multiplication overlfow"
    out = np.zeros((n, m), dtype=np.int32)
    for i in numba.prange(n):
        for j in range(m):
            out[i, j] = np.sum(a[i, :] * b[:, j], dtype=np.int32)
    return out


def uint8_to_float_blas_matmul(a: NDArray[np.uint8], b: NDArray[np.uint8]) -> NDArray[np.int32]:
    """It's hard to beat the incredible optimizations built into BLAS-based f32 matmul,
    so just type-converting to do uint8 matmul is actually pretty darn fast.
    """
    return (a.astype(np.float32) @ b.astype(np.float32)).astype(np.int32)


def unpack_4bit_to_8bit(x: NDArray[np.uint8]) -> NDArray[np.uint8]:
    num_row, num_col_half = x.shape
    out = np.empty((num_row, 2 * num_col_half), dtype=np.uint8)
    out[:, 0::2] = np.right_shift(x, np.uint8(4))
    out[:, 1::2] = np.bitwise_and(x, np.uint8(0b1111))
    return out


def multi_query_4bit_dotproduct(a: NDArray[np.uint8], b: NDArray[np.uint8], limit: float) -> NDArray[np.float32]:
    n, packed_dim = a.shape
    m, packed_dim_2 = b.shape
    assert packed_dim == packed_dim_2
    unpacked_dim = 2 * packed_dim
    bin_width = 2 * limit / 15
    a, b = unpack_4bit_to_8bit(a), unpack_4bit_to_8bit(b)
    sum_of_sums = (np.sum(a, axis=1, dtype=np.int32)[:, None] + np.sum(b, axis=1, dtype=np.int32)[None, :]).astype(np.float32)
    sum_of_prods = uint8_to_float_blas_matmul(a, b.T).astype(np.float32)
    out = np.empty((n, m), dtype=np.float32)
    out[:] = limit * limit * unpacked_dim
    out += -limit * bin_width * sum_of_sums
    out += bin_width * bin_width * sum_of_prods
    return out


@numba.njit(error_model="numpy", parallel=True)
def fast_single_query_4bit_dotproduct(
    query_vector_4bit: NDArray[np.uint8], doc_matrix_4bit: NDArray[np.uint8], limit: float
) -> NDArray[np.float32]:
    num_row, num_col = doc_matrix_4bit.shape
    assert query_vector_4bit.shape == (num_col,)
    assert num_col % 2 == 0
    assert limit > 0
    out = np.empty(num_row, dtype=np.float32)
    bin_width = 2 * limit / 15
    for i in numba.prange(num_row):
        row = doc_matrix_4bit[i, :]
        sum_of_sums = np.int64(0)
        sum_of_prods = np.int64(0)
        for j in range(num_col):
            # Unpack the values from this byte.
            query_value = query_vector_4bit[j]
            doc_value = row[j]
            qv1 = np.right_shift(query_value, np.uint8(4))
            dv1 = np.right_shift(doc_value, np.uint8(4))
            qv2 = np.bitwise_and(query_value, np.uint8(0b1111))
            dv2 = np.bitwise_and(doc_value, np.uint8(0b1111))

            # Accumulate running statistics.
            sum_of_sums += qv1 + dv1 + qv2 + dv2
            sum_of_prods += qv1 * dv1 + qv2 * dv2
        out[i] = (
            limit * limit * 2 * num_col
            - limit * bin_width * sum_of_sums
            + bin_width * bin_width * sum_of_prods
        )

    return out

@numba.njit(parallel=True, error_model="numpy", fastmath=True)
def fast_multi_query_4bit_dotproduct(query_emb_quant, doc_emb_quant, limit):
    num_query, num_col = query_emb_quant.shape
    num_doc, num_col2 = doc_emb_quant.shape
    assert num_col == num_col2
    assert num_col % 2 == 0
    assert limit > 0
    out = np.empty((num_query, num_doc), dtype=np.float32)
    bin_width = 2 * limit / 15
    for q in numba.prange(num_query):
        query_vec = query_emb_quant[q, :]
        for i in range(num_doc):
            doc_vec = doc_emb_quant[i, :]
            sum_of_sums = np.uint32(0)
            sum_of_prods = np.uint32(0)
            for j in range(num_col):
                # Unpack the values from this byte.
                query_value = query_vec[j]
                doc_value = doc_vec[j]
                qv1 = np.right_shift(query_value, np.uint8(4))
                dv1 = np.right_shift(doc_value, np.uint8(4))
                qv2 = np.bitwise_and(query_value, np.uint8(0b1111))
                dv2 = np.bitwise_and(doc_value, np.uint8(0b1111))
    
                # Accumulate running statistics.
                sum_of_sums += qv1 + dv1 + qv2 + dv2
                sum_of_prods += qv1 * dv1 + qv2 * dv2

            # Convert from integer statistics back to floating point.
            out[q, i] = (
                limit * limit * 2 * num_col
                - limit * bin_width * sum_of_sums
                + bin_width * bin_width * sum_of_prods
            )
    return out


#### BEGIN RETRIEVAL AND IR EVALUATION ####

@numba.njit(error_model="numpy", parallel=True)
def sorted_indices_and_scores(scores: NDArray[np.float32], depth: int) -> tuple[NDArray[np.int64], NDArray[np.float32]]:
    idx_argpartition = np.argpartition(scores, -depth, axis=1)
    topk_indices_slice = idx_argpartition[:, -depth:]
    topk_scores = np.take_along_axis(scores_slice, topk_indices_slice, axis=1)
    idx_argsort = np.argsort(-topk_scores)
    topk_indices_sorted = np.take_along_axis(topk_indices_slice, idx_argsort, axis=1)
    topk_scores_sorted = np.take_along_axis(topk_scores, idx_argsort, axis=1)
    return topk_indices_sorted, topk_scores_sorted


def dense_retrieval(
    query_ids: list[str],
    doc_ids: list[str],
    query_embeddings: NDArray[np.float32],
    doc_embeddings: NDArray[np.float32],
    retrieval_depth: int,
    quantize_4bit_with_limit: float | None = None,
    batch_size: int = 256,
) -> dict[str, dict[str, float]]:
    """Perform dense retrieval with a set of ids and embeddings to get query results."""
    if quantize_4bit_with_limit is not None:
        query_embeddings = fast_4bit_uniform_scalar_quantize(query_embeddings, quantize_4bit_with_limit)
        doc_embeddings = fast_4bit_uniform_scalar_quantize(doc_embeddings, quantize_4bit_with_limit)
    
    query_results = {}
    num_queries, num_docs = query_embeddings.shape[0], doc_embeddings.shape[0]
    retrieval_depth = min(retrieval_depth, num_docs)

    batch_slices = [slice(start_i, start_i + batch_size) for start_i in range(0, num_queries, batch_size)]
    with tqdm(total=num_queries, desc="dense retrieval", unit="query") as pbar:
        for batch_slice in batch_slices:
            q_emb_slice = query_embeddings[batch_slice]
            if quantize_4bit_with_limit is None:
                scores_slice = q_emb_slice @ doc_embeddings.T
            else:
                scores_slice = fast_multi_query_4bit_dotproduct(q_emb_slice, doc_embeddings, quantize_4bit_with_limit)
                
            # Get indices and values of top-k scores.
            topk = torch.topk(torch.tensor(scores_slice), retrieval_depth)
            topk_indices_sorted = topk.indices.numpy()
            topk_scores_sorted = topk.values.numpy()
    
            # Convert each set of scores in the slice to a top-k dictionary.
            query_ids_slice = query_ids[batch_slice]
            for slice_offset in range(scores_slice.shape[0]):
                # Populate the results dictionary.
                query_id = query_ids_slice[slice_offset]
                sorted_doc_ids = [doc_ids[idx] for idx in topk_indices_sorted[slice_offset]]
                query_results[query_id] = dict(zip(sorted_doc_ids, topk_scores_sorted[slice_offset].tolist()))
            pbar.update(len(query_ids_slice))

    return query_results


def score_ir_results(
    qrels: dict[str, dict[str, int]],
    results: dict[str, dict[str, float]],
    k_values: list[int],
    ignore_identical_ids: bool = True,
    compute_precision: bool = False,
    round_to: int = 5,
) -> dict[str, float]:
    """Adapted from https://github.com/embeddings-benchmark/mteb/blob/dd5d61724e71b2cdba9f9cf7e01fbed1b81cb423/mteb/evaluation/evaluators/RetrievalEvaluator.py#L189  # noqa: E501
    to ensure consistency with the oficial MTEB evaluation script (which itself
    aims for consistency with BEIR).
    """
    if ignore_identical_ids:
        popped = []
        for qid, rels in results.items():
            for pid in list(rels):
                if qid == pid:
                    results[qid].pop(pid)
                    popped.append(pid)
        if len(popped) > 0:
            logging.info(
                f"Ignoring {len(popped):,d} cases where query id matches "
                "document id for consistency with MTEB/BEIR eval. Set "
                "`ignore_identical_ids=False` to disable this behavior."
            )

    ndcg: Dict[str, float] = {}
    recall: Dict[str, float] = {}
    precision: Dict[str, float] = {}

    for k in k_values:
        ndcg[f"nDCG@{k}"] = 0.0
        recall[f"R@{k}"] = 0.0

    ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values])
    recall_string = "recall." + ",".join([str(k) for k in k_values])
    precision_string = "P." + ",".join([str(k) for k in k_values])
    measures = {ndcg_string, recall_string}
    if compute_precision:
        measures.add(precision_string)
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
    scores = evaluator.evaluate(results)

    for query_id in scores.keys():
        for k in k_values:
            ndcg[f"nDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)]
            recall[f"R@{k}"] += scores[query_id]["recall_" + str(k)]
            if compute_precision:
                precision[f"P@{k}"] += scores[query_id]["P_" + str(k)]

    for k in k_values:
        ndcg[f"nDCG@{k}"] = round(ndcg[f"nDCG@{k}"] / len(scores), round_to)
        recall[f"R@{k}"] = round(recall[f"R@{k}"] / len(scores), round_to)
        if compute_precision:
            precision[f"P@{k}"] = round(precision[f"P@{k}"] / len(scores), round_to)

    return {**ndcg, **recall, **precision}

In [5]:
# Download the precomputed embeddings for MTEB Retrieval.
# NOTE: The full dataset is around ~100GB.

# # Example of downloading a subset of datasets.
# dataset_subset = ["NFCorpus", "FiQA2018"]
# embeddings_dataset_path_str = huggingface_hub.snapshot_download(
#     repo_id=EMBEDDINGS_DATASET_ID,
#     repo_type="dataset",
#     allow_patterns=["_qrels/*"] + [f"{x}/*" for x in dataset_subset],
# )

embeddings_dataset_path_str = huggingface_hub.snapshot_download(
    repo_id=EMBEDDINGS_DATASET_ID, repo_type="dataset"
)
embeddings_dataset_path = Path(embeddings_dataset_path_str)

Fetching 597 files:   0%|          | 0/597 [00:00<?, ?it/s]

In [6]:
emb_dir = embeddings_dataset_path / "FiQA2018" / "embeddings"
doc_emb_file_paths = sorted(emb_dir.glob("documents*.parquet"))
query_emb_file_paths = sorted(emb_dir.glob("queries*.parquet"))
doc_ids, doc_emb = load_embeddings(doc_emb_file_paths, id_column_name=COLUMN_DOC_ID, truncate_dim=TRUNCATION_DIM)
query_ids, query_emb = load_embeddings(query_emb_file_paths, id_column_name=COLUMN_QUERY_ID, truncate_dim=TRUNCATION_DIM)

Loading embeddings from disk:   0%|          | 0/57638 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/648 [00:00<?, ?row/s]

In [7]:
%%time
scores = dense_retrieval(query_ids, doc_ids, query_emb, doc_emb, retrieval_depth=10)

dense retrieval:   0%|          | 0/648 [00:00<?, ?query/s]

CPU times: user 1.13 s, sys: 99.8 ms, total: 1.23 s
Wall time: 139 ms


In [8]:
%%time
# NOTE: This code isn't super fast because even with our `numba` "fast" implementation above, our code for int4 matmuls is
# much much less optimized than standard float32 matmul code behind non-int4-quantized dense retrieval.
scores_quant = dense_retrieval(
    query_ids,
    doc_ids,
    query_emb,
    doc_emb,
    retrieval_depth=10,
    quantize_4bit_with_limit=SCALAR_QUANTIZATION_LIMIT,
)

dense retrieval:   0%|          | 0/648 [00:00<?, ?query/s]

CPU times: user 7.37 s, sys: 284 ms, total: 7.65 s
Wall time: 1.77 s


In [9]:
def load_mteb_qrels(task_name: str) -> dict:
    path = embeddings_dataset_path / "_qrels" / f"{task_name}.json"
    return json.loads(path.read_text())

qrel = load_mteb_qrels("FiQA2018")

In [10]:
score_unquant = score_ir_results(qrel, scores, k_values=[10])["nDCG@10"]
score_quant = score_ir_results(qrel, scores_quant, k_values=[10])["nDCG@10"]
score_unquant, score_quant

(0.41671, 0.41258)

# Single-query example

For single vectors, even our `numba`-based int4 dotproduct implementation feels pretty fast (scanning 500k documents in 10ms!).

In [11]:
q_vec = query_emb[0]

In [12]:
%%timeit
dotproduct_scores = q_vec[None, :] @ doc_emb.T

820 µs ± 112 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
q_vec_quant = fast_4bit_uniform_scalar_quantize(q_vec[None, :], SCALAR_QUANTIZATION_LIMIT)[0]
doc_emb_quant = fast_4bit_uniform_scalar_quantize(doc_emb, SCALAR_QUANTIZATION_LIMIT)

In [14]:
%%timeit
dotproduct_scores_quant = fast_single_query_4bit_dotproduct(q_vec_quant, doc_emb_quant, SCALAR_QUANTIZATION_LIMIT)

1.41 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
unquant_scores = q_vec @ doc_emb.T
quant_scores = fast_single_query_4bit_dotproduct(q_vec_quant.squeeze(), doc_emb_quant, SCALAR_QUANTIZATION_LIMIT)
relative_error = np.abs(unquant_scores - quant_scores) / unquant_scores
print(f"Relative error μ ± σ: {relative_error.mean():.2%} ± {relative_error.std():.2%}")

Relative error μ ± σ: 3.64% ± 111.36%


# Score all the datasets

Below we provide a reproducible implementation of int4 compressed retrieval quality scoring to show how `snowflake-arctic-embed-m-v1.5` is capable of achieving a 53.7 MTEB Retrieval score in just 128 bytes per vector. 

In [16]:
names = [p.parent.name for p in sorted(embeddings_dataset_path.glob("*/embeddings"))]
print(f"Scoring: {names}")
ndcg10_scores_unquantized = {}
ndcg10_scores_quantized = {}
for name in tqdm(names):
    print(name)
    emb_dir = embeddings_dataset_path / name / "embeddings"
    doc_emb_file_paths = sorted(emb_dir.glob("documents*.parquet"))
    query_emb_file_paths = sorted(emb_dir.glob("queries*.parquet"))
    doc_ids, doc_emb = load_embeddings(doc_emb_file_paths, id_column_name=COLUMN_DOC_ID, truncate_dim=TRUNCATION_DIM)
    query_ids, query_emb = load_embeddings(query_emb_file_paths, id_column_name=COLUMN_QUERY_ID, truncate_dim=TRUNCATION_DIM)
    qrel = load_mteb_qrels(name)
    scores = dense_retrieval(query_ids, doc_ids, query_emb, doc_emb, 10)
    scores_quant = dense_retrieval(query_ids, doc_ids, query_emb, doc_emb, 10, SCALAR_QUANTIZATION_LIMIT)
    ndcg10_scores_unquantized[name] = score_ir_results(qrel, scores, k_values=[10])["nDCG@10"]
    ndcg10_scores_quantized[name] = score_ir_results(qrel, scores_quant, k_values=[10])["nDCG@10"]

Scoring: ['ArguAna', 'CQADupstackAndroidRetrieval', 'CQADupstackEnglishRetrieval', 'CQADupstackGamingRetrieval', 'CQADupstackGisRetrieval', 'CQADupstackMathematicaRetrieval', 'CQADupstackPhysicsRetrieval', 'CQADupstackProgrammersRetrieval', 'CQADupstackStatsRetrieval', 'CQADupstackTexRetrieval', 'CQADupstackUnixRetrieval', 'CQADupstackWebmastersRetrieval', 'CQADupstackWordpressRetrieval', 'ClimateFEVER', 'DBPedia', 'FEVER', 'FiQA2018', 'HotpotQA', 'MSMARCO', 'NFCorpus', 'NQ', 'QuoraRetrieval', 'SCIDOCS', 'SciFact', 'TRECCOVID', 'Touche2020']


  0%|          | 0/26 [00:00<?, ?it/s]

ArguAna


Loading embeddings from disk:   0%|          | 0/8674 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1406 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1406 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1406 [00:00<?, ?query/s]

CQADupstackAndroidRetrieval


Loading embeddings from disk:   0%|          | 0/22998 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/699 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/699 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/699 [00:00<?, ?query/s]

CQADupstackEnglishRetrieval


Loading embeddings from disk:   0%|          | 0/40221 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1570 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1570 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1570 [00:00<?, ?query/s]

CQADupstackGamingRetrieval


Loading embeddings from disk:   0%|          | 0/45301 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1595 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1595 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1595 [00:00<?, ?query/s]

CQADupstackGisRetrieval


Loading embeddings from disk:   0%|          | 0/37637 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/885 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/885 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/885 [00:00<?, ?query/s]

CQADupstackMathematicaRetrieval


Loading embeddings from disk:   0%|          | 0/16705 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/804 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/804 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/804 [00:00<?, ?query/s]

CQADupstackPhysicsRetrieval


Loading embeddings from disk:   0%|          | 0/38316 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1039 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1039 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1039 [00:00<?, ?query/s]

CQADupstackProgrammersRetrieval


Loading embeddings from disk:   0%|          | 0/32176 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/876 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/876 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/876 [00:00<?, ?query/s]

CQADupstackStatsRetrieval


Loading embeddings from disk:   0%|          | 0/42269 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/652 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/652 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/652 [00:00<?, ?query/s]

CQADupstackTexRetrieval


Loading embeddings from disk:   0%|          | 0/68184 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/2906 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/2906 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/2906 [00:00<?, ?query/s]

CQADupstackUnixRetrieval


Loading embeddings from disk:   0%|          | 0/47382 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1072 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1072 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1072 [00:00<?, ?query/s]

CQADupstackWebmastersRetrieval


Loading embeddings from disk:   0%|          | 0/17405 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/506 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/506 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/506 [00:00<?, ?query/s]

CQADupstackWordpressRetrieval


Loading embeddings from disk:   0%|          | 0/48605 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/541 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/541 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/541 [00:00<?, ?query/s]

ClimateFEVER


Loading embeddings from disk:   0%|          | 0/5416593 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1535 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1535 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1535 [00:00<?, ?query/s]

DBPedia


Loading embeddings from disk:   0%|          | 0/4635922 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/400 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/400 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/400 [00:00<?, ?query/s]

FEVER


Loading embeddings from disk:   0%|          | 0/5416568 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/6666 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/6666 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/6666 [00:00<?, ?query/s]

FiQA2018


Loading embeddings from disk:   0%|          | 0/57638 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/648 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/648 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/648 [00:00<?, ?query/s]

HotpotQA


Loading embeddings from disk:   0%|          | 0/5233329 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/7405 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/7405 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/7405 [00:00<?, ?query/s]

MSMARCO


Loading embeddings from disk:   0%|          | 0/8841823 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/6980 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/6980 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/6980 [00:00<?, ?query/s]

NFCorpus


Loading embeddings from disk:   0%|          | 0/3633 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/323 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/323 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/323 [00:00<?, ?query/s]

NQ


Loading embeddings from disk:   0%|          | 0/2681468 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/3452 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/3452 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/3452 [00:00<?, ?query/s]

QuoraRetrieval


Loading embeddings from disk:   0%|          | 0/522931 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/10000 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/10000 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/10000 [00:00<?, ?query/s]

SCIDOCS


Loading embeddings from disk:   0%|          | 0/25657 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/1000 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/1000 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/1000 [00:00<?, ?query/s]

SciFact


Loading embeddings from disk:   0%|          | 0/5183 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/300 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/300 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/300 [00:00<?, ?query/s]

TRECCOVID


Loading embeddings from disk:   0%|          | 0/171332 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/50 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/50 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/50 [00:00<?, ?query/s]

Touche2020


Loading embeddings from disk:   0%|          | 0/382545 [00:00<?, ?row/s]

Loading embeddings from disk:   0%|          | 0/49 [00:00<?, ?row/s]

dense retrieval:   0%|          | 0/49 [00:00<?, ?query/s]

dense retrieval:   0%|          | 0/49 [00:00<?, ?query/s]

In [17]:
df_ndcg10 = pd.DataFrame({"unquantized": ndcg10_scores_unquantized, "quantized": ndcg10_scores_quantized})

# Cache results to CSV.
df_ndcg10.to_csv("ndcgs_validation.csv")

# Roll up CQA Dupstack Retrieval.
is_cqa = df_ndcg10.index.to_series().str.startswith("CQA")
cqa_mean = df_ndcg10.loc[is_cqa].mean().to_frame().T
cqa_mean.index = ["CQADupstackRetrieval"]
df_ndcg10 = pd.concat([df_ndcg10.loc[~is_cqa], cqa_mean]).sort_index()

# Show scores across MTEB retrieval.
df_ndcg10

Unnamed: 0,unquantized,quantized
ArguAna,0.58476,0.57953
CQADupstackRetrieval,0.442101,0.433434
ClimateFEVER,0.36229,0.36063
DBPedia,0.44826,0.437
FEVER,0.87224,0.8659
FiQA2018,0.41671,0.41258
HotpotQA,0.69174,0.68011
MSMARCO,0.41249,0.40598
NFCorpus,0.35799,0.35716
NQ,0.61669,0.61014


In [18]:
# Print mean MTEB Retrieval scores.
df_ndcg10.mean()

unquantized    0.542337
quantized      0.537260
dtype: float64