In [2]:
from tongyi.embeddings import TongyiEmbeddings

# Build the Tongyi embeddings client with no arguments, i.e. with whatever
# defaults the class defines.
# NOTE(review): presumably API credentials are read from the environment — confirm.
embeddings_model = TongyiEmbeddings()

In [3]:
# Sample corpus: five short Chinese sentences reused by the cells below.
texts = [
    "你好吗",
    "你的名字是什么",
    "我的肚子好痛啊",
    "肠胃不舒服",
    "我在吃东西",
]

# Embed the whole corpus in a single call.
embeddings = embeddings_model.embed_documents(texts)

# Display (number of vectors returned, length of the first vector).
(len(embeddings), len(embeddings[0]))

(5, 1536)

In [4]:
# Peek at the first 10 components of the first sentence's embedding
# instead of displaying the full vector.
embeddings[0][:10]

[0.3016113340854645,
 1.5886719226837158,
 0.846020519733429,
 -0.800488293170929,
 -1.6886475086212158,
 0.27827149629592896,
 1.8710448741912842,
 -1.1935546398162842,
 -1.3988037109375,
 0.968017578125]

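# Cosine similarity

Normalize each embedding to unit length and take dot products to compare the third sentence with every sentence in `texts`.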

In [22]:
import numpy as np

def normalize(x):
    """Scale ``x`` to unit Euclidean (L2) length.

    The norm is taken over *all* elements of ``x`` (for a 2-D input this is
    the Frobenius norm of the whole array, not a per-row norm).

    Args:
        x: Array-like of numbers, e.g. one embedding vector.

    Returns:
        ``numpy.ndarray`` with the same shape as ``x``, divided by its norm.
        An all-zero (or empty) input is returned as zeros of the same shape
        rather than NaN, because a zero vector has no direction to preserve.
    """
    x = np.asarray(x)
    # np.linalg.norm converts integer input to float before squaring, which
    # avoids the integer overflow a manual sum of squares can hit.
    norm = np.linalg.norm(x)
    if norm == 0:
        # Dividing by zero would yield NaN plus a RuntimeWarning; dividing by
        # one keeps the zeros and the usual floating-point result dtype.
        norm = 1.0
    return x / norm

# Compare one reference sentence against every sentence (itself included).
# Both vectors are normalized to unit length first, so their dot product is
# the cosine similarity.
query_index = 2
# Loop-invariant: normalize the reference vector once, not on every iteration.
query_vector = normalize(embeddings[query_index])

# Iterate over the data itself rather than a hard-coded range(5), so the loop
# keeps working if sentences are added to or removed from `texts`.
for text, vector in zip(texts, embeddings):
    similarity = np.dot(query_vector, normalize(vector))
    print(f'"{texts[query_index]}"与"{text}"的语义相似度为：{similarity}')

"我的肚子好痛啊"与"你好吗"的语义相似度为：0.3540708666322656
"我的肚子好痛啊"与"你的名字是什么"的语义相似度为：0.3079039808785484
"我的肚子好痛啊"与"我的肚子好痛啊"的语义相似度为：1.0
"我的肚子好痛啊"与"肠胃不舒服"的语义相似度为：0.418081827009795
"我的肚子好痛啊"与"我在吃东西"的语义相似度为：0.3523671162523911


# Caching embeddings
## LocalFileStore

In [5]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

# Byte store rooted at ./cache/ (a path relative to the working directory).
store = LocalFileStore("./cache/")

# Wrap the embeddings model with the store; the model's name is passed as the
# cache namespace.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model,
    store,
    namespace=embeddings_model.model_name,
)


In [6]:
%%time
# First timed run of the cache-backed embedder on `texts`.
# NOTE(review): presumably nothing is cached yet, so the underlying model is called — confirm.
# (The %%time magic must remain the first line of the cell.)
cached_embedder.embed_documents(texts)

CPU times: user 75.1 ms, sys: 6.25 ms, total: 81.4 ms
Wall time: 357 ms


[[0.3016113340854645,
  1.5886719226837158,
  0.846020519733429,
  -0.800488293170929,
  -1.6886475086212158,
  0.27827149629592896,
  1.8710448741912842,
  -1.1935546398162842,
  -1.3988037109375,
  0.968017578125,
  -0.14565429091453552,
  4.130761623382568,
  1.725976586341858,
  -0.2990478575229645,
  -0.4421142637729645,
  -1.0256836414337158,
  -1.7417113780975342,
  2.149707078933716,
  -0.9922240972518921,
  0.7191528081893921,
  -2.818652391433716,
  -2.554882764816284,
  0.3016113340854645,
  -0.23046875,
  2.06585693359375,
  -0.8822571039199829,
  1.17608642578125,
  0.6553955078125,
  -0.052978515625,
  0.16975097358226776,
  -0.915631115436554,
  -4.248022556304932,
  -2.135546922683716,
  0.38872069120407104,
  3.1720213890075684,
  -0.8030151128768921,
  0.27705079317092896,
  2.853131055831909,
  -0.36101073026657104,
  0.42310792207717896,
  1.32318115234375,
  0.8155273199081421,
  -0.4817962646484375,
  -1.1077148914337158,
  1.385009765625,
  -1.5549194812774658,
 

In [7]:
%%time
# Identical call repeated so its timing can be compared with the previous cell.
# NOTE(review): presumably the vectors are now read back from the cache — confirm via the wall time.
# (The %%time magic must remain the first line of the cell.)
cached_embedder.embed_documents(texts)

CPU times: user 5.27 ms, sys: 2.35 ms, total: 7.62 ms
Wall time: 6.12 ms


[[0.3016113340854645,
  1.5886719226837158,
  0.846020519733429,
  -0.800488293170929,
  -1.6886475086212158,
  0.27827149629592896,
  1.8710448741912842,
  -1.1935546398162842,
  -1.3988037109375,
  0.968017578125,
  -0.14565429091453552,
  4.130761623382568,
  1.725976586341858,
  -0.2990478575229645,
  -0.4421142637729645,
  -1.0256836414337158,
  -1.7417113780975342,
  2.149707078933716,
  -0.9922240972518921,
  0.7191528081893921,
  -2.818652391433716,
  -2.554882764816284,
  0.3016113340854645,
  -0.23046875,
  2.06585693359375,
  -0.8822571039199829,
  1.17608642578125,
  0.6553955078125,
  -0.052978515625,
  0.16975097358226776,
  -0.915631115436554,
  -4.248022556304932,
  -2.135546922683716,
  0.38872069120407104,
  3.1720213890075684,
  -0.8030151128768921,
  0.27705079317092896,
  2.853131055831909,
  -0.36101073026657104,
  0.42310792207717896,
  1.32318115234375,
  0.8155273199081421,
  -0.4817962646484375,
  -1.1077148914337158,
  1.385009765625,
  -1.5549194812774658,
 

In [8]:
# Materialize and display every key currently held by the store.
list(store.yield_keys())

['text_embedding_v1f33a10ff-859a-5463-b3ff-f49f9fa5f6fa',
 'text_embedding_v1046ba0f1-f46d-50cb-a4a2-d42b4b0a372b',
 'text_embedding_v1fdcb1804-6409-5e76-89ff-9684747fff9d',
 'text_embedding_v17cd8ea1f-6312-57bd-b0c4-46b1e007af6a',
 'text_embedding_v1c8a6a73e-11f0-59e7-84f4-cb126b59694f']

## InMemoryByteStore

In [None]:
from langchain.storage import InMemoryByteStore

# Same cache wrapper as before, backed by an InMemoryByteStore instead of files.
# Note: this rebinds `store` and `cached_embedder` from the earlier cells.
store = InMemoryByteStore()

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model, store, namespace=embeddings_model.model_name
)

# The original cell ended with a bare `cached_embedder.embed_documents`, which
# only references the method and embeds nothing. Call it on the sample corpus
# and show just the dimensions instead of dumping every vector.
in_memory_embeddings = cached_embedder.embed_documents(texts)
len(in_memory_embeddings), len(in_memory_embeddings[0])

## RedisStore

In [None]:
from langchain.storage import RedisStore

# Same cache wrapper as before, backed by Redis.
# RedisStore needs a connection target (a client or a redis_url); the original
# no-argument call supplied neither.
# NOTE(review): the URL assumes a local Redis server on the default port — adjust as needed.
# Note: this rebinds `store` and `cached_embedder` from the earlier cells.
store = RedisStore(redis_url="redis://localhost:6379")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model, store, namespace=embeddings_model.model_name
)

# The original cell ended with a bare `cached_embedder.embed_documents`, which
# only references the method and embeds nothing. Call it on the sample corpus
# and show just the dimensions instead of dumping every vector.
redis_embeddings = cached_embedder.embed_documents(texts)
len(redis_embeddings), len(redis_embeddings[0])