In [1]:
from fastembed import SparseTextEmbedding, SparseEmbedding
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show every sparse model fastembed reports as supported; each entry in the
# output is a dict of metadata (model id, sources, license, size, ...).
SparseTextEmbedding.list_supported_models()

[{'model': 'prithivida/Splade_PP_en_v1',
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'additional_files': [],
  'requires_idf': None,
  'vocab_size': 30522},
 {'model': 'prithvida/Splade_PP_en_v1',
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'additional_files': [],
  'requires_idf': None,
  'vocab_size': 30522},
 {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',
  'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',
  'license': 'apache-2.0',
  'size_in_GB': 0.09,
  'additional_f

In [3]:
# Use the canonical model id ("prithivida", the first entry returned by
# list_supported_models()). The previous "prithvida" spelling is only listed
# as a duplicate entry pointing at the same Qdrant/SPLADE_PP_en_v1 weights.
model_name = "prithivida/Splade_PP_en_v1"
# This triggers the model download
model = SparseTextEmbedding(model_name=model_name)

  model = SparseTextEmbedding(model_name=model_name)
Fetching 5 files: 100%|██████████| 5/5 [01:28<00:00, 17.66s/it]


In [4]:
# Sample corpus: twelve short English sentences about the Chandrayaan-3
# mission. These are the texts passed to model.embed() in the next cell.
documents: List[str] = [
    "Chandrayaan-3 is India's third lunar mission",
    "It aimed to land a rover on the Moon's surface - joining the US, China and Russia",
    "The mission is a follow-up to Chandrayaan-2, which had partial success",
    "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)",
    "The estimated cost of the mission is around $35 million",
    "It will carry instruments to study the lunar surface and atmosphere",
    "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023",
    "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.",
    "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit",
    "The mission used GSLV Mk III rocket for its launch",
    "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota",
    "Chandrayaan-3 was launched earlier in the year 2023",
]

In [5]:
# Embed the corpus in batches of 6. The result of embed() is materialised with
# list() so individual embeddings can be indexed in the cells below.
embedding_iter = model.embed(documents, batch_size=6)
sparse_embeddings_list: List[SparseEmbedding] = list(embedding_iter)

In [6]:
# Position (in `documents` order) of the embedding to inspect; `index` is
# read again by the token-decoding cell further down.
index = 0
# Bare last expression: the notebook displays the SparseEmbedding, i.e. its
# `values` (weights) and `indices` (token ids) arrays.
sparse_embeddings_list[index]

SparseEmbedding(values=array([0.0529714 , 0.0196357 , 0.36459896, 1.38508725, 0.71776628,
       0.12668033, 0.46230948, 0.4467729 , 0.26897642, 1.01519895,
       1.565534  , 0.29412517, 1.53102434, 0.59785426, 1.10018313,
       0.02079809, 0.09955856, 0.44249222, 0.09747887, 1.53519821,
       1.36765611, 0.1574067 , 0.49882376, 0.38629326, 0.7661289 ,
       1.25804996, 0.39058331, 0.27236342, 0.45152119, 0.48261827,
       0.26084986, 1.35912728, 0.70710504, 1.71639669]), indices=array([ 1010,  1011,  1016,  1017,  2001,  2018,  2034,  2093,  2117,
        2319,  2353,  2509,  2634,  2686,  2796,  2817,  2922,  2959,
        3003,  3148,  3260,  3390,  3462,  3523,  3822,  4231,  4316,
        4774,  5590,  5871,  6416, 11926, 12076, 16469]))

In [8]:
# Preview the first few (token id, weight) pairs of the selected embedding.
# Slicing before zip() means an embedding with fewer than `preview_count`
# non-zero entries is simply printed in full instead of raising IndexError.
# `index` (set in an earlier cell) replaces the hard-coded 0 so this cell
# inspects the same embedding as the rest of the notebook.
preview_count = 5
embedding = sparse_embeddings_list[index]
for token_id, weight in zip(embedding.indices[:preview_count], embedding.values[:preview_count]):
    print(f"Token at index {token_id} has weight {weight}")

Token at index 1010 has weight 0.05297140032052994
Token at index 1011 has weight 0.019635701552033424
Token at index 1016 has weight 0.36459895968437195
Token at index 1017 has weight 1.385087251663208
Token at index 2001 has weight 0.717766284942627


In [14]:
import json
from tokenizers import Tokenizer

# Resolve the Hugging Face repo of the model that was actually loaded
# (`model_name`) instead of assuming it is the first entry returned by
# list_supported_models(); the list order is not something to rely on.
hf_repo = next(
    entry["sources"]["hf"]
    for entry in SparseTextEmbedding.list_supported_models()
    if entry["model"] == model_name
)
tokenizer = Tokenizer.from_pretrained(hf_repo)

In [15]:
def get_tokens_and_weights(sparse_embedding, tokenizer):
    """Map each token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing two parallel sequences: ``indices``
            (token ids) and ``values`` (the weight of each id), e.g. a
            fastembed ``SparseEmbedding``.
        tokenizer: Object whose ``decode(list_of_ids)`` returns the text for
            the given token ids.

    Returns:
        dict: Decoded token text -> weight as a built-in ``float``, ordered
        from highest to lowest weight. If two ids decode to the same text,
        the weight of the later one is kept.

    Raises:
        ValueError: If ``indices`` and ``values`` differ in length.
    """
    token_ids = sparse_embedding.indices
    weights = sparse_embedding.values
    if len(token_ids) != len(weights):
        raise ValueError(
            f"indices and values must have the same length, got {len(token_ids)} and {len(weights)}"
        )

    # Cast NumPy scalars to built-in types: decode() then receives plain ints,
    # and the weights stay JSON-serialisable whatever the array dtype is
    # (json.dumps rejects e.g. numpy.float32).
    token_weight_dict = {
        tokenizer.decode([int(token_id)]): float(weight)
        for token_id, weight in zip(token_ids, weights)
    }

    # Sort the dictionary by weights, highest first
    return dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True))

# Try the helper on the embedding selected by `index` and pretty-print the
# resulting token -> weight mapping as JSON.
tokens_and_weights = get_tokens_and_weights(sparse_embeddings_list[index], tokenizer)
print(json.dumps(tokens_and_weights, indent=4))

{
    "chandra": 1.716396689414978,
    "third": 1.565533995628357,
    "##ya": 1.5351982116699219,
    "india": 1.5310243368148804,
    "3": 1.385087251663208,
    "mission": 1.3676561117172241,
    "lunar": 1.3591272830963135,
    "moon": 1.2580499649047852,
    "indian": 1.100183129310608,
    "##an": 1.0151989459991455,
    "3rd": 0.7661288976669312,
    "was": 0.717766284942627,
    "spacecraft": 0.7071050405502319,
    "space": 0.5978542566299438,
    "flight": 0.49882376194000244,
    "satellite": 0.4826182723045349,
    "first": 0.46230947971343994,
    "expedition": 0.4515211880207062,
    "three": 0.44677290320396423,
    "fourth": 0.44249221682548523,
    "vehicle": 0.3905833065509796,
    "iii": 0.3862932622432709,
    "2": 0.36459895968437195,
    "##3": 0.29412516951560974,
    "planet": 0.27236342430114746,
    "second": 0.2689764201641083,
    "missions": 0.26084986329078674,
    "launched": 0.15740670263767242,
    "had": 0.12668032944202423,
    "largest": 0.099558562