In [34]:
# Fetch the Chroma server image; no tag is given, so Docker resolves it to `latest`.
!docker pull chromadb/chroma

Using default tag: latest
latest: Pulling from chromadb/chroma
Digest: sha256:2b2ecbef838da70b96da28f4d9464597a4312784c8f2aab6e1f7c43c87d4808b
Status: Image is up to date for chromadb/chroma:latest
docker.io/chromadb/chroma:latest


In [35]:
# Start the Chroma server detached (-d) so this cell returns immediately instead of
# blocking the kernel on the container's foreground process.
# If Docker reports that port 8000 is already allocated, something is already bound
# to it - most likely a Chroma container from an earlier run (check `docker ps`).
!docker run -d -p 8000:8000 chromadb/chroma

docker: Error response from daemon: driver failed programming external connectivity on endpoint charming_brattain (56775680fce7f4c9b2c90f48093ca6ea503d7a3be0142de4c4b52fc17f0fd236): Bind for 0.0.0.0:8000 failed: port is already allocated.


In [1]:
# Enable the dotenv IPython extension, then invoke its %dotenv magic
# (python-dotenv loads variables from a `.env` file into the environment;
# the os.getenv(...) lookups further down presumably rely on this).
%load_ext dotenv
%dotenv

In [25]:
import chromadb
from chromadb.config import Settings
import pandas as pd
import yandexgpt
import os
from time import sleep
from tqdm import tqdm
import ast

# Get embeddings

In [10]:
def get_embeddings(text, delay=0.5):
    """Return the embedding for ``text`` using the module-level ``embeddings`` client.

    Args:
        text: The string to embed.
        delay: Seconds to pause before issuing the request. Defaults to 0.5,
            the previously hard-coded value (presumably a rate-limit throttle -
            confirm against the API quota). Pass 0 to skip the pause.

    Returns:
        The value stored under the ``"embedding"`` key of the client's response.
    """
    if delay > 0:
        sleep(delay)
    return embeddings.text_embedding(text)["embedding"]

# Embeddings client; the API key and endpoint URI are read from environment variables.
embeddings = yandexgpt.Embeddings(
    os.getenv("YANDEX_GPT_KEY"),
    os.getenv("YANDEX_GPT_EMBEDDINGS_URI"),
)

# Embed every intent text (one get_embeddings call per row, with a progress bar)
# and write the table back out with the vectors in a new column.
df = pd.read_csv("data/intents.csv")
text_embeddings = [get_embeddings(txt) for txt in tqdm(df["text"])]
df["text_embeddings"] = text_embeddings
df.to_csv("data/intents_with_embeddings.csv", index=False)

100%|██████████| 3815/3815 [39:11<00:00,  1.62it/s]  


In [69]:
# Load the messages table, discarding rows that have no answer.
df = pd.read_csv("data/messages.csv").dropna(subset=['answer'])

# Embed every remaining message text (one get_embeddings call per row) and
# save the table with the vectors in a new column.
text_embeddings = [get_embeddings(txt) for txt in tqdm(df["text"])]
df["text_embeddings"] = text_embeddings
df.to_csv("data/messages_with_embeddings.csv", index=False)

100%|██████████| 8414/8414 [1:27:00<00:00,  1.61it/s]


# Store in ChromaDB

In [70]:
# HTTP client for the Chroma server on localhost:8000 (the port published by the
# `docker run -p 8000:8000` cell); anonymized telemetry is switched off.
chroma_client = chromadb.HttpClient(
    host='localhost',
    port="8000",
    settings=Settings(anonymized_telemetry=False),
)


In [None]:
# Drop the collection if required
# chroma_client.delete_collection("intents")

In [71]:
# Fetch the "intents" collection from the server, creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection("intents")

In [17]:
# Reload the intents table, including its stored embeddings, from disk.
df = pd.read_csv('data/intents_with_embeddings.csv')

In [34]:
# Build the upsert payload from the intents table.
ids = df["id"].astype(str).tolist()  # ids are sent to Chroma as strings
answers = df["answer"].tolist()
texts = df["text"].tolist()
# The CSV holds each embedding as text; literal_eval parses it back into a Python object.
text_embeddings = [
    ast.literal_eval(raw_vector)
    for raw_vector in df["text_embeddings"].tolist()
]

# The answer is stored as the document; the original text travels in the metadata
# together with a "source" tag.
collection.upsert(
    ids=ids,
    embeddings=text_embeddings,
    documents=answers,
    metadatas=[{"source": "intents", "text": txt} for txt in texts],
)


In [72]:
# Reload the messages table, including its stored embeddings, from disk.
df = pd.read_csv('data/messages_with_embeddings.csv')

In [74]:
# Build the upsert payload from the messages table.
# NOTE(review): this writes into the same `collection` as the intents upsert;
# confirm that messageId values cannot collide with ids already stored there,
# since upsert would overwrite them.
ids = df["messageId"].astype(str).tolist()  # ids are sent to Chroma as strings
answers = df["answer"].tolist()
texts = df["text"].tolist()
# The CSV holds each embedding as text; literal_eval parses it back into a Python object.
text_embeddings = [
    ast.literal_eval(raw_vector)
    for raw_vector in df["text_embeddings"].tolist()
]

# The answer is stored as the document; the original text travels in the metadata
# together with a "source" tag.
collection.upsert(
    ids=ids,
    embeddings=text_embeddings,
    documents=answers,
    metadatas=[{"source": "messages", "text": txt} for txt in texts],
)


# Test

In [76]:
# Smoke test: embed a sample question, ask the collection for its 5 nearest
# entries, and keep only those whose reported distance is below 1.
result = collection.query(
    n_results=5,
    query_embeddings=[
        embeddings.text_embedding("Какие фичи у вас есть")["embedding"]
    ],
)
# Each result field is indexed per query embedding; [0] selects our single query.
result_documents = [
    {"text": metadata["text"], "answer": document}
    for distance, metadata, document in zip(
        result["distances"][0], result["metadatas"][0], result["documents"][0]
    )
    if distance < 1
]
print(result_documents)

[{'text': 'Меня интересуют много вопросов', 'answer': 'Очень хорошо, будет рады ответить на все ваши вопросы'}, {'text': 'Что мне нужно знать, чтобы', 'answer': 'игнорируемые части вопроса'}, {'text': 'У меня очень много вопросов', 'answer': 'игнор'}, {'text': 'У меня несколько много вопросов', 'answer': 'игнор'}, {'text': 'К вашим специалистам?', 'answer': 'Я могу вам помочь, напишите какая у вас проблема?'}]
