Let us use OpenAIClient with the `text-embedding-3-small` model as an example of an embedder.

In [10]:
from lightrag.core.embedder import Embedder
from lightrag.components.model_client import OpenAIClient
from lightrag.utils import setup_env # ensure you setup OPENAI_API_KEY in your project .env file

# Options handed to the Embedder as model_kwargs: model name, 256 dimensions, float encoding.
model_kwargs = dict(
    model="text-embedding-3-small",
    dimensions=256,
    encoding_format="float",
)

# One sample question, plus a list of 100 copies of it for the batched calls.
query = "What is the capital of China?"
queries = [query for _ in range(100)]

embedder = Embedder(
    model_client=OpenAIClient(),
    model_kwargs=model_kwargs,
)

In [11]:
# Print the embedder's repr to review its model_kwargs and model client.
print(embedder)

Embedder(
  model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
  (model_client): OpenAIClient()
)


In [12]:
# Call the embedder with a single query string (takes around 0.3 seconds).
# The printed result is an EmbedderOutput whose `data` list holds the Embedding.
response = embedder(query)
print(response)

EmbedderOutput(data=[Embedding(embedding=[0.13630725, -0.06240097, 0.1181271, 0.14570473, -0.047997367, -0.059941817, -0.043342546, 0.104513936, -0.00540684, -0.13902989, -0.011669992, 0.10284523, -0.11768796, 0.012581195, -0.0035267966, 0.026523706, -0.10460176, -0.029663514, 0.06472838, -0.017576346, -0.0500613, 0.00921084, -0.0726328, 0.03102483, 0.093623415, -0.004448978, -0.05199349, -0.019179186, 0.016709056, 0.021199204, 0.01762026, -0.045933437, -0.017225038, 0.042596016, -0.0376338, -0.016731013, -0.055682216, -0.05945877, 0.007322563, -0.08923207, 0.06125922, 0.030234389, 0.005434286, 0.058712244, -0.07070061, -0.10548004, -0.033747464, 0.008129472, -0.018805923, 0.0500613, 0.025513697, 0.006411359, 0.12313323, 0.07289628, 0.07781458, 0.016160138, 0.03431834, 0.018904727, 0.12445063, -0.0002830357, 0.0319031, 0.011461402, -0.0036612814, 0.07311584, -0.01955245, -0.0028873074, 0.04786563, 0.057570495, 0.063191414, -0.04116883, -0.0181582, 0.013152069, 0.022395844, 0.043364502,

In [13]:
# Embedding dimension, number of embeddings returned, and the normalization flag.
print(response.embedding_dim, response.length, response.is_normalized)

256 1 True


Use batch processing. An embedder can typically handle 10-100 embeddings per call, depending on the model. When we use a cloud API provider such as OpenAI, we have no visibility into how requests are processed on the backend or how batching affects the cost. Each API provider may also have a different pricing model.

In [14]:
# Call the embedder with a list of queries: takes around 0.9 seconds for 100 queries, 2.5s for 1000 queries.
response = embedder(queries)
print(response)

EmbedderOutput(data=[Embedding(embedding=[0.13626286, -0.062420864, 0.11798905, 0.14583904, -0.04796874, -0.059960928, -0.04344422, 0.104459405, -0.0053564, -0.1390742, -0.011739603, 0.10287802, -0.117637634, 0.012508333, -0.0035361573, 0.026510198, -0.10463511, -0.0297169, 0.064792946, -0.017559987, -0.05007726, 0.009197304, -0.072568096, 0.031166505, 0.093565404, -0.0043845056, -0.05192221, -0.019174319, 0.01669242, 0.021184, 0.017592931, -0.04594808, -0.017219549, 0.042653523, -0.037601873, -0.0167034, -0.05569997, -0.059477728, 0.0072919517, -0.08917266, 0.061322678, 0.030178137, 0.0054168, 0.05877489, -0.07067922, -0.105425805, -0.033890005, 0.008126573, -0.01876799, 0.05007726, 0.025477903, 0.006446349, 0.123084635, 0.07274381, 0.07783939, 0.0162202, 0.03432928, 0.018954681, 0.124402456, -0.00026871226, 0.03191327, 0.01136622, -0.003593812, 0.073051296, -0.019602612, -0.0028799914, 0.047880888, 0.05754492, 0.06321156, -0.04113803, -0.018262824, 0.013189208, 0.022402985, 0.0433783

In [15]:
# Number of embeddings, embedding dimension, and normalization flag for the batched response.
response.length, response.embedding_dim, response.is_normalized

(100, 256, True)

Use a local model; we use ``TransformersClient`` as an example. You can optionally enable library logging (uncomment the two logging lines in the cell below) to see the process better.

In [16]:
from lightrag.core.embedder import Embedder
from lightrag.components.model_client import TransformersClient
# from lightrag.utils import enable_library_logging

# enable_library_logging(level="DEBUG")

# Note: this rebinds `model_kwargs`, replacing the OpenAI settings defined earlier.
model_kwargs = dict(model="thenlper/gte-base")
local_embedder = Embedder(
    model_client=TransformersClient(),
    model_kwargs=model_kwargs,
)

In [17]:
# Print the local embedder's repr to review its model_kwargs and model client.
print(local_embedder)

Embedder(
  model_kwargs={'model': 'thenlper/gte-base'}, 
  (model_client): TransformersClient()
)


In [18]:
# Single query: takes around 0.1 seconds, though this may differ depending on the hardware you use.
response = local_embedder(query)
print(response.length)
print(response)

1
EmbedderOutput(data=[Embedding(embedding=[-0.005133533850312233, 0.0028610897716134787, -0.005318048410117626, 0.003371045459061861, 0.02472378872334957, 0.03112250752747059, 0.015265258029103279, 0.07056713849306107, 0.003583855228498578, -0.04136492684483528, 0.0050127278082072735, -0.05716497078537941, -0.055627889931201935, 0.015120620839297771, -0.018120605498552322, 0.03411967307329178, 0.04766490310430527, -0.00480597373098135, -0.006379017140716314, 0.020822355523705482, -0.02577640861272812, 0.006425780709832907, 0.02500789798796177, 0.05046386271715164, 0.032914094626903534, -0.009098694659769535, -0.01514158770442009, 0.009841611608862877, -0.09457813203334808, -0.013661234639585018, -0.0033976587001234293, 0.026431478559970856, -0.005364100448787212, 0.011934318579733372, -0.004608132876455784, -0.008419801481068134, -0.023735282942652702, -0.05674963817000389, -0.006277650129050016, 0.01179672870784998, -0.022550148889422417, -0.031651075929403305, -0.027354741469025612,

In [19]:
# Multiple queries in one call: takes around 0.7s for 100 queries.
response = local_embedder(queries)

In [20]:
# Number of embeddings returned and their dimension (100 and 768 in the recorded run).
print(response.length, response.embedding_dim)

100 768


In [21]:
# Print the full EmbedderOutput of the batched call.
print(response)

EmbedderOutput(data=[Embedding(embedding=[-0.005133533850312233, 0.0028610897716134787, -0.005318048410117626, 0.003371045459061861, 0.02472378872334957, 0.03112250752747059, 0.015265258029103279, 0.07056713849306107, 0.003583855228498578, -0.04136492684483528, 0.0050127278082072735, -0.05716497078537941, -0.055627889931201935, 0.015120620839297771, -0.018120605498552322, 0.03411967307329178, 0.04766490310430527, -0.00480597373098135, -0.006379017140716314, 0.020822355523705482, -0.02577640861272812, 0.006425780709832907, 0.02500789798796177, 0.05046386271715164, 0.032914094626903534, -0.009098694659769535, -0.01514158770442009, 0.009841611608862877, -0.09457813203334808, -0.013661234639585018, -0.0033976587001234293, 0.026431478559970856, -0.005364100448787212, 0.011934318579733372, -0.004608132876455784, -0.008419801481068134, -0.023735282942652702, -0.05674963817000389, -0.006277650129050016, 0.01179672870784998, -0.022550148889422417, -0.031651075929403305, -0.027354741469025612, 0

In [22]:
# Inspect the second Embedding in the batch (index 1).
response.data[1]

Embedding(embedding=[-0.005133533850312233, 0.0028610897716134787, -0.005318048410117626, 0.003371045459061861, 0.02472378872334957, 0.03112250752747059, 0.015265258029103279, 0.07056713849306107, 0.003583855228498578, -0.04136492684483528, 0.0050127278082072735, -0.05716497078537941, -0.055627889931201935, 0.015120620839297771, -0.018120605498552322, 0.03411967307329178, 0.04766490310430527, -0.00480597373098135, -0.006379017140716314, 0.020822355523705482, -0.02577640861272812, 0.006425780709832907, 0.02500789798796177, 0.05046386271715164, 0.032914094626903534, -0.009098694659769535, -0.01514158770442009, 0.009841611608862877, -0.09457813203334808, -0.013661234639585018, -0.0033976587001234293, 0.026431478559970856, -0.005364100448787212, 0.011934318579733372, -0.004608132876455784, -0.008419801481068134, -0.023735282942652702, -0.05674963817000389, -0.006277650129050016, 0.01179672870784998, -0.022550148889422417, -0.031651075929403305, -0.027354741469025612, 0.0013332983944565058,

It is good practice to set a maximum ``batch_size`` before calling the ``Embedder``.

In [23]:
from tqdm import tqdm

batch_size = 100
all_queries = [query] * 1024

# Walk the queries in consecutive windows of at most `batch_size` items;
# the final window is shorter when the total is not a multiple of `batch_size`.
batch_starts = range(0, len(all_queries), batch_size)
for batch_number, start in enumerate(tqdm(batch_starts)):
    print(f"Processing batch {batch_number}")
    stop = start + batch_size
    response = local_embedder(all_queries[start:stop])
    print(response.length)

  0%|          | 0/11 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing batch 0


  9%|▉         | 1/11 [00:00<00:07,  1.35it/s]

100
Processing batch 1


 18%|█▊        | 2/11 [00:01<00:04,  1.90it/s]

100
Processing batch 2


 27%|██▋       | 3/11 [00:01<00:03,  2.28it/s]

100
Processing batch 3


 36%|███▋      | 4/11 [00:01<00:02,  2.40it/s]

100
Processing batch 4


 45%|████▌     | 5/11 [00:02<00:02,  2.49it/s]

100
Processing batch 5


 55%|█████▍    | 6/11 [00:02<00:01,  2.58it/s]

100
Processing batch 6


 64%|██████▎   | 7/11 [00:02<00:01,  2.64it/s]

100
Processing batch 7


 73%|███████▎  | 8/11 [00:03<00:01,  2.70it/s]

100
Processing batch 8


 82%|████████▏ | 9/11 [00:03<00:00,  2.72it/s]

100
Processing batch 9


100%|██████████| 11/11 [00:04<00:00,  2.67it/s]

100
Processing batch 10
24





Use our ``BatchEmbedder`` to handle the batch processing.

In [24]:
from lightrag.core.embedder import BatchEmbedder

# Wrap the local embedder so that batching (100 inputs per batch) is handled for us.
batch_embedder = BatchEmbedder(embedder=local_embedder, batch_size=100)

# A single call covers all of `all_queries`; the recorded progress bar shows 11 batches.
response = batch_embedder(all_queries)

100%|██████████| 11/11 [00:04<00:00,  2.59it/s]


Check whether an embedding is normalized, and normalize it if it is not.
Use a post-processor to shrink the dimension of an embedding.

In [29]:
from lightrag.core.types import Embedding
from lightrag.core.functional import normalize_vector
from typing import List
from lightrag.core.component import Component
from copy import deepcopy
class DecreaseEmbeddingDim(Component):
    """Shorten embeddings to their first ``new_dim`` values, optionally re-normalizing.

    Meant to be passed to an ``Embedder`` through ``output_processors``.

    Args:
        old_dim: Dimension of the incoming embeddings. It is only used to
            validate ``new_dim`` and in the repr; it is not checked against
            the actual vectors.
        new_dim: Number of leading values to keep. Must be smaller than
            ``old_dim``, otherwise an ``AssertionError`` is raised.
        normalize: When true, each shortened vector is passed through
            ``normalize_vector`` before being stored.
    """

    def __init__(self, old_dim: int, new_dim: int, normalize: bool = True):
        super().__init__()
        self.old_dim, self.new_dim, self.normalize = old_dim, new_dim, normalize
        assert self.new_dim < self.old_dim, "new_dim should be less than old_dim"

    def call(self, input: List[Embedding]) -> List[Embedding]:
        """Return a deep copy of ``input`` whose vectors are cut to ``new_dim`` values.

        The caller's embeddings are left untouched because all changes are
        applied to the copy.
        """
        shortened: List[Embedding] = deepcopy(input)
        for item in shortened:
            head = item.embedding[: self.new_dim]
            item.embedding = normalize_vector(head) if self.normalize else head
        return shortened

    def _extra_repr(self) -> str:
        """Describe the configuration; this text shows up in the printed component."""
        return f"old_dim={self.old_dim}, new_dim={self.new_dim}, normalize={self.normalize}"

Let us decrease the dimension of local embeddings using output_processors in Embedder.

In [30]:
# Same local model settings, plus a processor that cuts each 768-value vector down to 256.
dim_reducer = DecreaseEmbeddingDim(old_dim=768, new_dim=256)
local_embedder_256 = Embedder(
    model_client=TransformersClient(),
    model_kwargs=model_kwargs,
    output_processors=dim_reducer,
)

In [31]:
# The repr now also lists the attached output processor and its settings.
print(local_embedder_256)

Embedder(
  model_kwargs={'model': 'thenlper/gte-base'}, 
  (model_client): TransformersClient()
  (output_processors): DecreaseEmbeddingDim(old_dim=768, new_dim=256, normalize=True)
)


In [28]:
# Embed one query through the dimension-reducing embedder;
# embedding_dim reports the reduced size (256 in the recorded run).
response = local_embedder_256(query)
print(response.length, response.embedding_dim)

1 256
