In [1]:
%load_ext autoreload
%autoreload 2

import json
import os
from collections import Counter
from typing import Dict, Iterable

import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from qdrant_client import QdrantClient, models
from qdrant_sparse_tools import convert_sparse_vector
from retokenize import (
    aggregate_weights_idf,
    reconstruct_bpe,
    stem_words,
)
import sys
from loguru import logger
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [2]:
load_dotenv()

# Disable debug logging.
# loguru starts with a default stderr sink at DEBUG level, and logger.add()
# only registers an additional sink. Drop the existing sinks first so that INFO
# really is the minimum level, messages are not emitted twice, and re-running
# this cell does not stack up duplicate handlers.
logger.remove()
logger.add(sys.stderr, level="INFO")

# NOTE(review): canonical_dataset_name is not referenced anywhere else in the
# visible notebook — confirm whether the local ../data/ paths should use it.
canonical_dataset_name = "scifact"
# Dataset repository (under the "nirantk/" namespace) and the column that holds
# the JSON-encoded sparse vectors.
dataset_name = "scifact-bge-m3-sparse-vectors"
col_name = "bge_m3_sparse_vector"
# Name of the Qdrant collection that receives the retokenized vectors.
collection_name = f"{dataset_name}-{col_name}-retok"
# Model whose tokenizer is used to map token ids back to token strings.
model_name = "BAAI/bge-m3"

In [3]:
# Load the corpus split that carries the pre-computed sparse vectors.
corpus_repo = "nirantk/" + dataset_name
ds = load_dataset(corpus_repo, split="corpus")

# Peek at the first entry of the sparse-vector column (a JSON string).
ds[col_name][0]

'{"39176": 0.1639404296875, "21094": 0.033599853515625, "159958": 0.1788330078125, "119856": 0.1939697265625, "35011": 0.1964111328125, "26866": 0.2216796875, "70": 0.011077880859375, "168698": 0.161865234375, "14135": 0.04254150390625, "78574": 0.1883544921875, "831": 0.051239013671875, "52490": 0.16845703125, "8231": 0.067626953125, "70760": 0.1358642578125, "34754": 0.1903076171875, "136": 0.01042938232421875, "16750": 0.024810791015625, "23": 0.01120758056640625, "123309": 0.1346435546875, "164462": 0.1981201171875, "13315": 0.131591796875, "44954": 0.168701171875, "45755": 0.1553955078125, "92105": 0.1864013671875, "9": 0.01116943359375, "165598": 0.1431884765625, "297": 0.010650634765625, "214706": 0.0733642578125, "3332": 0.016510009765625, "191": 0.01358795166015625, "7154": 0.00965118408203125, "86898": 0.06939697265625, "177": 0.0108184814453125, "594": 0.03509521484375, "16625": 0.197265625, "16": 0.0110626220703125, "944": 0.052734375, "3956": 0.0084228515625, "1492": 0.152

In [4]:
# Decode every JSON-encoded sparse vector into a {token_id_str: weight} dict.
sparse_vectors = list(map(json.loads, ds[col_name]))

### Change schema to Qdrant Sparse Vector

1. Create vocab and reverse vocab from Tokenizer corresponding to the model
2. Invert the integer index to token string
3. Make lists of tokens and weights from the keys and values of each sparse vector dictionary

In [5]:
# Tokenizer of the embedding model, plus lookups in both directions.
tokenizer = AutoTokenizer.from_pretrained(model_name)
vocab = tokenizer.get_vocab()  # token string -> token id
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))  # token id -> token string

In [6]:
# Turn each {token_id_str: weight} dict into parallel "tokens"/"weights" lists,
# resolving the integer ids back to token strings on the way.
raw_vectors = [
    {
        "tokens": [reverse_vocab[int(token_id)] for token_id in vector],
        "weights": list(vector.values()),
    }
    for vector in sparse_vectors
]

In [7]:
# Show the first document's text next to tokens/weights 30..49 of its raw vector.
ds["text"][0], raw_vectors[0]["tokens"][30:50], raw_vectors[0]["weights"][30:50]

('Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was to term with greater 

## Recombine and Retokenize

In [8]:
# Sanity check: run reconstruct_bpe on the first ten tokens of document 0.
reconstruct_bpe(raw_vectors[0]["tokens"][:10])

['Alterations',
 'architecture',
 'cerebral',
 'white',
 'matter',
 'the',
 'developing',
 'human',
 'brain']

In [9]:
from retokenize import aggregate_weights_idf_k_b

In [20]:
# import nltk

# stopwords = nltk.corpus.stopwords.words("english")
# stopwords

In [32]:
# Rescore every document vector: rebuild words from the tokens, stem them, and
# aggregate the original token weights onto the stems.
rescored_vectors = []
for raw_vector in tqdm(raw_vectors):
    doc_tokens = raw_vector["tokens"]
    doc_weights = raw_vector["weights"]
    stems = stem_words(reconstruct_bpe(doc_tokens))
    rescored_vectors.append(aggregate_weights_idf_k_b(stems, doc_tokens, doc_weights))

  0%|          | 0/5183 [00:00<?, ?it/s]

In [33]:
# Number of entries in each rescored sparse vector
vector_lengths = list(map(len, rescored_vectors))

# 10th / 50th / 90th percentile of those sizes
np.percentile(vector_lengths, q=[10, 50, 90])

array([ 57.,  89., 137.])

In [34]:
# len(rescored_vectors), rescored_vectors[0]

## Upload to Qdrant

In [35]:
# Connection settings come from the environment (see load_dotenv() in the config cell).
client = QdrantClient(
    url=os.environ.get("QDRANT_URL"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)

def is_empty(client: QdrantClient, collection_name: str) -> bool:
    """Tell whether the named collection currently reports zero points.

    Args:
        client: Qdrant client used to look the collection up.
        collection_name: Name of the collection to inspect.

    Returns:
        True when the collection's ``points_count`` equals 0, False otherwise.
    """
    collection_info = client.get_collection(collection_name)
    return collection_info.points_count == 0


# client.delete_collection(collection_name)

In [36]:
def reset_collection(client: QdrantClient, collection_name: str):
    """Recreate the collection from scratch with one sparse vector field.

    Any existing collection with this name is deleted first. The new collection
    has no dense vectors and a single sparse vector named after the
    module-level ``col_name``, indexed with ``on_disk=False``.

    Args:
        client: Qdrant client to operate on.
        collection_name: Name of the collection to drop and recreate.
    """
    already_exists = client.collection_exists(collection_name)
    if already_exists:
        client.delete_collection(collection_name)

    sparse_params = models.SparseVectorParams(
        index=models.SparseIndexParams(on_disk=False)
    )
    client.create_collection(
        collection_name=collection_name,
        vectors_config={},
        sparse_vectors_config={col_name: sparse_params},
    )

In [37]:
# Inspect the rescored vector of the first document.
rescored_vectors[0]

{'ate': 4.696322767553414,
 'line': 2.7630617011650673,
 'develop': 1.8087965004176547,
 'assess': 1.2705755599546529,
 'toward': 0.9271178923354834,
 'effect': 0.7608069946927775,
 'term': 0.6545522287112017,
 '1.8': 0.5849347282303121,
 'ed': 0.47280862491321196,
 'coe': 0.4307592032305683,
 'matter': 0.3953472334192463,
 'greater': 0.36465045085821046,
 'white': 0.33769936572737447,
 'second': 0.3117835186054872,
 'cortic': 0.2889593523727537,
 '1.15': 0.2727685155019938,
 'imag': 0.25562374775164964,
 'infant': 0.2397338007926837,
 'cerebr': 0.22173821353555864,
 'pre1.44': 0.20842428729570625,
 '01': 0.19613843543194556,
 'absolut': 0.18461852642028495,
 'reson': 0.1755500969703133,
 'result': 0.16852661755142245,
 'nonmi': 0.15896924343137298,
 'relat': 0.15270517792085486,
 'alter': 0.14698463541451465,
 '4.44': 0.14069510659227027,
 'brain': 0.1357342488133793,
 'aniso': 0.12791166259258735,
 'anc': 0.12239143077056451,
 '06)': 0.11868991754738041,
 'calcul': 0.1134785090227463

In [38]:
# Collect every key that appears in any rescored sparse vector.
# Note: this rebinds `vocab`, which previously held the tokenizer vocabulary.
vocab = {term for rescored in rescored_vectors for term in rescored.keys()}

In [39]:
# Number of distinct keys across all rescored vectors.
len(vocab)

58470

In [40]:
# Convert this into a vocab object with each string having an id.
# Sort before numbering: iterating a set of strings has no stable order across
# kernel restarts (string hashing is randomized per process), so ids taken
# straight from the set would differ from run to run. Sorting also keeps this
# cell idempotent when re-run on the already-converted dict.
vocab = {word: i for i, word in enumerate(sorted(vocab))}
# id -> word, the inverse lookup
invert_vocab = {i: word for word, i in vocab.items()}

In [41]:
# Re-key every rescored vector from word strings to the integer ids in `vocab`.
id_reweighted_sparse_vectors = [
    {vocab[word]: weight for word, weight in rescored.items()}
    for rescored in tqdm(rescored_vectors)
]

  0%|          | 0/5183 [00:00<?, ?it/s]

In [42]:
def batched(iterable: Iterable, n: int = 1) -> Iterable:
    """Yield successive chunks of at most ``n`` items from ``iterable``.

    Inputs that support ``len()`` and indexing (lists, tuples, strings, ...)
    are sliced, so each chunk has the same type a slice of the input has.
    Any other iterable (generators, sets, iterators, ...) is consumed lazily
    and each chunk is yielded as a list.

    Args:
        iterable: The items to split into chunks.
        n: Maximum chunk size; must be at least 1.

    Yields:
        Consecutive chunks; only the last one may hold fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    from itertools import islice

    if n < 1:
        raise ValueError(f"batch size n must be >= 1, got {n}")

    # Sliceable inputs keep the original slicing behaviour.
    if hasattr(iterable, "__len__") and hasattr(iterable, "__getitem__"):
        for start in range(0, len(iterable), n):
            yield iterable[start : start + n]
        return

    # Generic iterables have no len()/slicing: pull n items at a time instead.
    iterator = iter(iterable)
    chunk = list(islice(iterator, n))
    while chunk:
        yield chunk
        chunk = list(islice(iterator, n))

In [43]:
def make_points(
    reweighted_sparse_vectors: Iterable[Dict], ds: Dataset
) -> Iterable[models.PointStruct]:
    """Pair each sparse vector with its dataset row and build Qdrant points.

    Vectors and rows are matched by position. Each point uses the row's
    ``_id`` (cast to int) as point id, stores the vector under the
    module-level ``col_name``, and carries ``text``, ``title`` and the
    original ``_id`` as payload.

    Args:
        reweighted_sparse_vectors: One ``{index: weight}`` dict per dataset
            row, in the same order as ``ds``.
        ds: Dataset whose rows provide ``_id``, ``text`` and ``title``.

    Returns:
        A list with one ``models.PointStruct`` per (vector, row) pair.

    Raises:
        ValueError: If the vectors have a known length that differs from
            ``len(ds)``.
    """
    # zip() would silently drop the unmatched tail, so fail loudly whenever
    # the mismatch can be detected up front.
    if hasattr(reweighted_sparse_vectors, "__len__") and len(
        reweighted_sparse_vectors
    ) != len(ds):
        raise ValueError(
            f"Got {len(reweighted_sparse_vectors)} sparse vectors "
            f"for {len(ds)} dataset rows"
        )

    # zip() has no length, so give tqdm the total explicitly.
    paired_rows = tqdm(zip(reweighted_sparse_vectors, ds), total=len(ds))
    return [
        models.PointStruct(
            id=int(element["_id"]),
            # NOTE(review): convert_sparse_vector is defined outside this
            # notebook; presumably it builds the Qdrant sparse vector.
            vector={col_name: convert_sparse_vector(sv)},
            payload={
                "text": element["text"],
                "title": element["title"],
                "id": element["_id"],
            },
        )
        for sv, element in paired_rows
    ]


# Rebuild the collection and upload every point in batches.
# NOTE: reset_collection() deletes any existing collection with this name, so
# re-running this cell always re-uploads the full corpus from scratch.
reset_collection(client, collection_name)
points = make_points(id_reweighted_sparse_vectors, ds)

upload_batch_size = 100
# Ceiling division so the progress bar knows how many batches to expect.
n_batches = (len(points) + upload_batch_size - 1) // upload_batch_size

failed_batches = 0
for batch in tqdm(batched(points, upload_batch_size), total=n_batches):
    try:
        client.upload_points(collection_name=collection_name, points=batch)
    except Exception as e:
        # Best effort: keep uploading the remaining batches, but record the
        # failure instead of letting it scroll past unnoticed.
        failed_batches += 1
        logger.error("Failed to upload a batch of {} points: {}", len(batch), e)

if failed_batches:
    logger.warning(
        "{} of {} batches failed to upload; the collection is incomplete",
        failed_batches,
        n_batches,
    )

0it [00:00, ?it/s]

0it [00:00, ?it/s]

## Queries

In [None]:
# NOTE(review): these paths are built from dataset_name, while
# canonical_dataset_name is defined in the config cell but never used —
# confirm which directory actually holds the qrels and queries.
test = pd.read_csv(f"../data/{dataset_name}/qrels/test.tsv", sep="\t")
test["query-id"] = test["query-id"].astype(int)

with open(f"../data/{dataset_name}/queries.jsonl", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
    queries = [json.loads(line) for line in f if line.strip()]

# Only keep the test set queries.
# Build the id set once; rebuilding a list inside the filter made every
# membership test a linear scan.
test_query_ids = set(test["query-id"].tolist())
queries = [q for q in queries if int(q["_id"]) in test_query_ids]
len(queries)

In [None]:
# NOTE(review): `Tokenizer` is not imported anywhere in the visible notebook
# (only `AutoTokenizer` is), so this cell raises NameError on a fresh kernel.
# It presumably refers to `tokenizers.Tokenizer` — confirm and add the import.
# This also rebinds `tokenizer`, which earlier held the AutoTokenizer for model_name.
tokenizer = Tokenizer.from_pretrained("nirantk/splade-v3-lexical")
# Token strings for every query text
tokens = [tokenizer.encode(q["text"]).tokens for q in queries]
# De-duplicate the tokens of each query; going through set() does not preserve order.
tokens = [list(set(t)) for t in tokens]
# tokens = [list(set(t.ids)) for t in tokens]

In [None]:
# Index of the query used for the spot checks in this and later cells.
idx = 50
tokens[idx]

In [None]:
# Give every query token a weight of 1 and store tokens and weights side by side.
query_vectors = [
    {"tokens": query_tokens, "weights": [1] * len(query_tokens)}
    for query_tokens in tokens
]

In [None]:
# Retokenize all the query tokens
# NOTE(review): `retokenize_sparse_vector` is neither defined nor imported in
# the visible notebook, so this cell raises NameError on a fresh kernel —
# confirm where it is supposed to come from.
reweighted_query_tokens = []
# Walk the query vectors together with the raw text of the matching query.
for qv, text in tqdm(zip(query_vectors, [q["text"] for q in queries])):
    # print(text)
    # print(qv)
    reweighted_query_tokens.append(
        retokenize_sparse_vector(
            source_sparse_vector=qv, text=text, tokenizer=tokenizer
        )
    )

In [None]:
# Spot-check one retokenized query.
# NOTE(review): this looks at idx + 1 while the other spot checks use idx — confirm intended.
reweighted_query_tokens[idx + 1]

In [None]:
# 10th / 50th / 90th percentile of the number of entries per retokenized query
np.percentile(list(map(len, reweighted_query_tokens)), q=[10, 50, 90])

In [None]:
# Preview a handful of word -> id entries; displaying the whole vocabulary
# would dump tens of thousands of lines into the notebook output.
dict(list(vocab.items())[:10])

In [None]:
# Re-key each query vector with the integer ids of the corpus vocabulary.
# Words the corpus never produced have no id: they are printed and left out.
id_reweighted_query_tokens = []
for query_weights in tqdm(reweighted_query_tokens):
    id_weights = {}
    for term, term_weight in query_weights.items():
        if term not in vocab:
            print(term)
            continue
        id_weights[vocab[term]] = term_weight
    id_reweighted_query_tokens.append(id_weights)

In [None]:
# Build one Qdrant sparse vector per query.
# dict.keys()/dict.values() are views, not lists; materialize them so the
# model receives plain lists rather than relying on the validation layer to
# coerce view objects (presumably pydantic — behaviour there varies by version).
qdrant_query_vectors = [
    models.SparseVector(
        indices=list(qv.keys()),
        values=list(qv.values()),
    )
    for qv in id_reweighted_query_tokens
]

In [None]:
# Spot-check the sparse vector built for query `idx`.
qdrant_query_vectors[idx]

In [None]:
# Run every query against the collection and keep the top `limit` hits.
# A failed search is reported and recorded as None so that `results` stays
# aligned with the queries by position.
limit = 10
results = []
for sparse_query in tqdm(qdrant_query_vectors):
    try:
        hits = client.search(
            collection_name=collection_name,
            query_vector=models.NamedSparseVector(name=col_name, vector=sparse_query),
            with_payload=True,
            limit=limit,
        )
    except Exception as e:
        print(e)
        print(sparse_query)
        results.append(None)
    else:
        results.append(hits)

In [None]:
# Flatten the per-query hits into parallel columns for the run file.
query_ids, doc_ids, ranks, scores = [], [], [], []
skipped_query_ids = []
for query, result in zip(queries, results):
    query_id = query["_id"]
    # The search cell stores None for a query whose request failed; iterating
    # over it would raise TypeError, so record the query and move on.
    if result is None:
        skipped_query_ids.append(query_id)
        continue
    query_ids.extend(len(result) * [query_id])
    doc_ids.extend(str(r.id) for r in result)
    # Ranks are 0-based positions within each result list.
    # NOTE(review): confirm the downstream evaluation expects 0-based ranks.
    ranks.extend(range(len(result)))
    scores.extend(r.score for r in result)

if skipped_query_ids:
    logger.warning(
        "{} queries had no search result and are missing from the run: {}",
        len(skipped_query_ids),
        skipped_query_ids,
    )

# Column-oriented run: one entry per (query, retrieved document) pair.
run = {
    "query": [int(q) for q in query_ids],
    "q0": len(query_ids) * ["q0"],
    "docid": doc_ids,
    "rank": ranks,
    "score": scores,
    "system": len(query_ids) * ["splade"],
}

with open("lexical-retokenize-rescore.run.json", "w") as f:
    json.dump(run, f, indent=2)