In [8]:
import json, os, chromadb, torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel

from crudChroma import CRUD

# Fetch the pre-trained tokenizer and encoder from Hugging Face; both must
# come from the same checkpoint, so the identifier is spelled out only once.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Load variables from a .env file into the process environment, then read the
# database path from it. DB_PATH is None when the variable is not set.
load_dotenv()
DB_PATH = os.environ.get("DB_PATH")


In [9]:
# Shared instance of the project's CRUD helper (imported from crudChroma).
# Cells below use it to retrieve collections and to run embedding queries.
crud = CRUD()

def generate_embedding(text, model, tokenizer):
    """Embed text by mean-pooling the model's final hidden states.

    Pooling is weighted by the tokenizer's attention mask so that padding
    positions (introduced by ``padding=True`` when a batch contains texts of
    different lengths) do not dilute the average. For a single string the mask
    is all ones and the result equals a plain mean over the tokens.

    Args:
        text: A string, or a list of strings, to embed.
        model: Callable encoder; called as ``model(**inputs)`` and expected to
            return an object exposing ``last_hidden_state``.
        tokenizer: Callable tokenizer; called with ``return_tensors='pt'``,
            padding and truncation to at most 512 tokens.

    Returns:
        A list of floats for a single text, or a list of such lists for a
        batch of several texts (size-1 dimensions are squeezed away).
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask supplied by the tokenizer: fall back to an unweighted mean.
        pooled = token_embeddings.mean(dim=1)
    else:
        # Broadcast the mask over the hidden dimension, zero out padding
        # positions, and divide by the number of real tokens per sequence.
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        pooled = summed / token_counts
    return pooled.squeeze().tolist()

def collections_info(name_of_collection):
    """Summarise one collection fetched through the shared ``crud`` helper.

    Args:
        name_of_collection: Name passed to ``crud.retrieve_collection``.

    Returns:
        A dict with three keys:
        ``'name'`` - the collection's ``name`` attribute,
        ``'first_10'`` - whatever ``peek()`` returns with its default
        arguments (the key name suggests up to ten items),
        ``'num_of_items'`` - the value of ``count()``.
    """
    collection = crud.retrieve_collection(name_of_collection)
    # peek() and count() are called on the collection object itself.
    return {
        'name': collection.name,
        'first_10': collection.peek(),
        'num_of_items': collection.count(),
    }

def query_collection(name_of_collection, query):
    """Run a similarity query against a collection in the persistent Chroma DB.

    The original code passed ``query`` positionally, which Chroma binds to
    ``query_embeddings``; a plain text query was therefore treated as an
    embedding. The argument is now routed explicitly by type.

    Args:
        name_of_collection: Name of the collection to open from ``DB_PATH``.
        query: Either text (a ``str`` or a non-empty list/tuple of ``str``),
            sent as ``query_texts``, or a pre-computed embedding (or list of
            embeddings), sent as ``query_embeddings``.
            NOTE(review): text queries are embedded by the collection's own
            embedding function, which may not match the model used to build
            the stored vectors - confirm before relying on text queries.

    Returns:
        A dict with ``'name'`` (collection name), ``'query'`` (the argument as
        given) and ``'result'`` (the value returned by ``Collection.query``).
    """
    collection = chromadb.PersistentClient(path=DB_PATH).get_collection(name=name_of_collection)

    is_text_batch = (
        isinstance(query, (list, tuple))
        and len(query) > 0
        and all(isinstance(item, str) for item in query)
    )
    if isinstance(query, str):
        result = collection.query(query_texts=[query])
    elif is_text_batch:
        result = collection.query(query_texts=list(query))
    else:
        # Anything else is assumed to be an embedding (or a list of embeddings).
        result = collection.query(query_embeddings=query)

    return {
        'name': collection.name,
        'query': query,
        'result': result,
    }

In [10]:
# Collection named after the Slack workspace ID T074T5Q38RW.
WORKSPACE_ID = "T074T5Q38RW"
workspace_info = collections_info(WORKSPACE_ID)
for field, content in workspace_info.items():
    print(f"{field}: {content}")

name: T074T5Q38RW
first_10: {'ids': ['C073U7920TZ', 'C073XPDMR5L', 'C07425HL99C', 'C0746RT7336', 'C074B8HCVGU', 'C074ME7T38R', 'C074T5Q3T1N', 'C074XHL21UG', 'T074T5Q38RW'], 'embeddings': [[-0.2597765028476715, 0.17624498903751373, -0.36243361234664917, 0.2442445307970047, -0.016016297042369843, 0.013316541910171509, 0.18758851289749146, 0.2953212261199951, -0.3930635452270508, -0.1487136334180832, 0.260926216840744, -0.4235202670097351, -0.1470637172460556, 0.10293516516685486, -0.3600861430168152, 0.013980940915644169, 0.0747465267777443, -0.390112042427063, -0.3036209046840668, -0.12796956300735474, -0.02420898526906967, -0.018468176946043968, -0.26666906476020813, -0.0413568876683712, 0.15774105489253998, 0.14362874627113342, 0.02675849199295044, 0.3607291579246521, 0.13255952298641205, -0.28266993165016174, -0.16785262525081635, 0.14836497604846954, 0.45006826519966125, 0.1852218210697174, 0.007202755194157362, -0.07537223398685455, 0.27502042055130005, -0.4269167184829712, 0.05559

In [14]:
# Channel collection: embed a free-text question locally, then hand the
# resulting vector to the CRUD helper's query against that channel.
CHANNEL_ID = "C073U7920TZ"
query = "What is this about?"
embedded_query = generate_embedding(text=query, model=model, tokenizer=tokenizer)
result = crud.query_collection(CHANNEL_ID, embedded_query)

# Print each field of the query result on its own line.
for field, content in result.items():
    print(f"{field}: {content}")



ids: [['1717165845.443969']]
distances: [[14.918123955610763]]
metadatas: [[{'\n            reactions is not added since it is a list; metadata has to be either\n            type str, int, float, or bool to be added to the document\n            replies': 0, 'datetime': '2024-05-31 10:30 PM', 'id': '1717165845.443969', 'person': 'Ray Li'}]]
embeddings: None
documents: [['<https://docs.google.com/document/d/1w2onKaECnFgkUlbZAyZN7q3qrmTYtCRq7xb1myh72-I/edit#heading=h.nblr0682qchk>']]
uris: None
data: None
included: ['metadatas', 'documents', 'distances']


In [13]:
# Channel collection: print the document text of the items returned by peek().
collection = crud.retrieve_collection(CHANNEL_ID)
preview = collection.peek()
for message in preview["documents"]:
    print(message)

Ray Li has joined the channel
Jack Yang has joined the channel
Nihar Shah has joined the channel
Shayan Ahmad has joined the channel
Wei Kuo has joined the channel
Yuzhou Wang has joined the channel
Aiden Lee has joined the channel
Mariam Rukhaia has joined the channel
Kaiwen Guo has joined the channel
Yuzhou Wang <https://www.mdpi.com/2079-9292/13/1/178>
