Skip to content

Commit

Permalink
fix: Loading of embeddings (#22260)
Browse files Browse the repository at this point in the history
  • Loading branch information
benjackwhite committed May 20, 2024
1 parent f7f65dc commit 33a0757
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions ee/session_recordings/ai/embeddings_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datetime
import pytz

from typing import Any
from typing import Any, Optional

from abc import ABC, abstractmethod
from prometheus_client import Histogram, Counter
Expand All @@ -23,9 +23,18 @@
only_pageview_urls,
)

# tiktoken.encoding_for_model(model_name) specifies encoder
# model_name = "text-embedding-3-small" for this use case
# NOTE(review): this call runs at import time, while get_encoding() exists to defer the same load — confirm whether this eager assignment is still wanted
encoding = tiktoken.get_encoding("cl100k_base")
# Cache slot for get_encoding(); stays None until the encoding has been loaded
_encoding: Optional[tiktoken.Encoding] = None


def get_encoding() -> tiktoken.Encoding:
    """Return the shared ``cl100k_base`` tiktoken encoding, loading it on first use.

    The loaded encoding is stored in the module-level ``_encoding`` and
    reused by later calls instead of being fetched again.
    """
    global _encoding
    # Explicit None check: "not loaded yet" is the only state that should trigger a load
    if _encoding is None:
        # NOTE: This does an API request so we want to ensure we load it lazily and not at startup
        # tiktoken.encoding_for_model(model_name) specifies encoder
        # model_name = "text-embedding-3-small" for this use case
        _encoding = tiktoken.get_encoding("cl100k_base")
    return _encoding


MAX_TOKENS_FOR_MODEL = 8191

Expand Down Expand Up @@ -194,7 +203,7 @@ def _embed(self, input: str, source_type: str):

def _num_tokens_for_input(self, string: str) -> int:
    """Returns the number of tokens in a text string."""
    # Go through get_encoding() so the encoding is loaded on first use rather than at import
    return len(get_encoding().encode(string))

def _flush_embeddings_to_clickhouse(self, embeddings: list[dict[str, Any]], source_type: str) -> None:
try:
Expand Down

0 comments on commit 33a0757

Please sign in to comment.