In [1]:
import chromadb
# Create the Chroma client for the document-filter examples.
client = chromadb.Client()

collection_name = "filter_example_collection"
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with this name already exists.
collection = client.get_or_create_collection(name=collection_name)

# Eight toy records. Only two distinct 3-d embeddings are used, alternating in
# step with the "read"/"unread" status, so the effect of each filter is easy to
# spot in the query results.
# upsert (rather than add) lets a re-run overwrite the same ids.
collection.upsert(
    embeddings=[
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
    ],
    metadatas=[
        {"status": "read"},
        {"status": "unread"},
        {"status": "read"},
        {"status": "unread"},
        {"status": "read"},
        {"status": "unread"},
        {"status": "read"},
        {"status": "unread"},
    ],
    documents=[
        "A document that discusses domestic policy",
        "A document that discusses international affairs",
        "A document that discusses kittens",
        "A document that discusses dogs",
        "A document that discusses chocolate",
        "A document that is sixth that discusses government",
        "A document that discusses international affairs",
        "A document that discusses global affairs",
    ],
    ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
)

ChromaDB도 DB이다 보니 여러 필터링 기능을 제공하는데, 문서 검색용 DB답게 `$contains`, `$not_contains`, `$or`, `$and` 등 단어 포함 여부 및 논리 연산과 관련된 필터를 API로 제공한다

In [2]:
# Combine a metadata filter with a full-text filter: a record is returned only
# if its status is "read" AND its document text contains "affairs".
collection.get(
    where={"status": "read"},
    where_document={"$contains": "affairs"},
)

{'ids': ['id7'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}],
 'documents': ['A document that discusses international affairs'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

metadata의 status가 read이면서 본문에 affairs가 포함된 문서를 찾아서 반환한다.

In [3]:
# $or on where_document: keep records whose text contains either phrase.
collection.get(
    where_document={
        "$or": [
            {"$contains": "global affairs"},
            {"$contains": "domestic policy"},
        ]
    }
)

{'ids': ['id1', 'id8'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}, {'status': 'unread'}],
 'documents': ['A document that discusses domestic policy',
  'A document that discusses global affairs'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

global affairs가 포함되어 있거나 domestic policy가 포함된 문서를 찾아서 제공한다

In [6]:
# Similarity search restricted to documents whose text contains "affairs".
# Up to 5 hits are requested; fewer come back when the filter leaves fewer
# candidates.
collection.query(
    query_embeddings=[[1, 3, 1]],
    n_results=5,
    where_document={"$contains": "affairs"},
)

{'ids': [['id7', 'id2', 'id8']],
 'distances': [[5.340000152587891, 39.02000045776367, 39.02000045776367]],
 'metadatas': [[{'status': 'read'},
   {'status': 'unread'},
   {'status': 'unread'}]],
 'embeddings': None,
 'documents': [['A document that discusses international affairs',
   'A document that discusses international affairs',
   'A document that discusses global affairs']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Embedding vector가 [1, 3, 1]에 가까우면서 affairs가 포함된 문서들을 가까운 순서대로 추출한다

In [7]:
# Similarity search around the origin, excluding any document whose text
# contains "domestic policy".
collection.query(
    query_embeddings=[[0, 0, 0]],
    n_results=5,
    where_document={"$not_contains": "domestic policy"},
)

{'ids': [['id3', 'id5', 'id7', 'id6', 'id8']],
 'distances': [[16.740001678466797,
   16.740001678466797,
   16.740001678466797,
   87.22000122070312,
   87.22000122070312]],
 'metadatas': [[{'status': 'read'},
   {'status': 'read'},
   {'status': 'read'},
   {'status': 'unread'},
   {'status': 'unread'}]],
 'embeddings': None,
 'documents': [['A document that discusses kittens',
   'A document that discusses chocolate',
   'A document that discusses international affairs',
   'A document that is sixth that discusses government',
   'A document that discusses global affairs']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Embedding vector가 [0, 0, 0]에 가까우면서 domestic policy가 포함되지 않은 문서들을 가까운 순서대로 추출한다

## 논리 연산자 기반 필터링

In [6]:
# Separate collection for the metadata (`where`) filter examples.
# NOTE: this rebinds `collection`, so the cells above would now query this
# collection if re-run without re-running the first cell.
client = chromadb.Client()
collection = client.get_or_create_collection("test-where-list")

# upsert instead of add: get_or_create_collection may return a collection that
# already holds ids "1"-"3" (e.g. when this cell is re-run), and the following
# cells upsert the same ids as well.
collection.upsert(
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[{"author": "john"}, {"author": "jack"}, {"author": "jill"}],
    ids=["1", "2", "3"],
)

# $or on metadata: keep records whose author is "john" or "jack".
collection.get(where={"$or": [{"author": "john"}, {"author": "jack"}]})

*************** EP Error ***************
EP Error D:\a\_work\1\s\onnxruntime\python\onnxruntime_pybind_state.cc:456 onnxruntime::python::RegisterTensorRTPluginsAsCustomOps Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************


{'ids': ['1', '2'],
 'embeddings': None,
 'metadatas': [{'author': 'john'}, {'author': 'jack'}],
 'documents': ['Article by john', 'Article by Jack'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

metadata에서 author가 john 또는 jack 인 2개의 문서를 추출한다

In [7]:
# And Logical Operator Filtering
# Fetch the collection again and overwrite the three records so that each one
# also carries a "category" metadata field.
collection = client.get_or_create_collection("test-where-list")
collection.upsert(
    ids=["1", "2", "3"],
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[
        {"author": "john", "category": "chroma"},
        {"author": "jack", "category": "ml"},
        {"author": "jill", "category": "lifestyle"},
    ],
)

# $and: every listed condition must hold for a record to be returned.
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {"author": "john"},
        ]
    }
)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

chromadb는 기존 데이터에 대한 upsert(update + insert)도 지원한다. 여기서는 각 문서에 category metadata를 추가한 뒤, metadata를 바탕으로 `$and` 연산자를 적용하여 category가 chroma이고 author가 john인 문서를 추출한다

In [8]:
# Same upsert + $and lookup as the cell above. Repeating the upsert with the
# same ids overwrites the records instead of duplicating them, so the result
# is unchanged.
collection = client.get_or_create_collection("test-where-list")
collection.upsert(
    ids=["1", "2", "3"],
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[
        {"author": "john", "category": "chroma"},
        {"author": "jack", "category": "ml"},
        {"author": "jill", "category": "lifestyle"},
    ],
)
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {"author": "john"},
        ]
    }
)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [9]:
# No record has category "chroma" together with author "jill", so this
# demonstrates what a filter with zero matches returns.
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {"author": "jill"},
        ]
    }
)

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

조건에 맞는 문서가 없는 경우에는 None이 아니라 빈 리스트(ids, metadatas, documents가 모두 `[]`)를 반환한다

In [10]:
# Logical operators nest: category must be "chroma" AND the author must be
# either "john" OR "jack".
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {
                "$or": [
                    {"author": "john"},
                    {"author": "jack"},
                ]
            },
        ]
    }
)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [11]:
# Full-text and metadata filters used together: the document text must contain
# "Article", and the metadata must satisfy the nested $and/$or condition.
collection.get(
    where_document={"$contains": "Article"},
    where={
        "$and": [
            {"category": "chroma"},
            {
                "$or": [
                    {"author": "john"},
                    {"author": "jack"},
                ]
            },
        ]
    },
)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}