# Chroma database

### Imports

In [34]:
import chromadb

### Connect to database

In [40]:
# Open an HTTP client against the Chroma server at localhost:8000.
client = chromadb.HttpClient(
    host='localhost',
    port=8000,
)

### Create collection

In [41]:
from chromadb.utils import embedding_functions

# get_or_create_collection keeps this cell re-runnable: create_collection fails
# when 'my_collection' already exists on the server (e.g. after a partial run
# that never reached the final "Drop collection" cell).
collection = client.get_or_create_collection(
    name='my_collection',
    # Sentence-Transformers model used to embed stored documents and query texts.
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/paraphrase-albert-small-v2"
    ),
    # Use cosine distance for the HNSW index of this collection.
    metadata={'hnsw:space': 'cosine'},
)

### Generate embeddings

(No separate step is needed: Chroma computes the embeddings itself, using the embedding function configured on the collection.)

In [43]:
# Small sample corpus (history of AI) that the searches below run against.
history_sentences = (
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
)
docs = list(history_sentences)
del history_sentences  # keep the notebook namespace unchanged: only `docs` remains

### Insert embeddings

In [44]:
# String ids "0", "1", ... mirroring each document's position in `docs`.
ids = list(map(str, range(len(docs))))

# One metadata record per document: a shared subject tag plus the raw text.
metadatas = [dict(subject='history', text=entry) for entry in docs]

In [45]:
# Chroma computes the embeddings of `docs` itself, using the embedding function
# specified when the collection was created. One can also insert the embeddings
# manually, but for exploratory purposes we are using the integrated embedding
# procedure of Chroma.
# (This note is a comment rather than a bare string literal: as the last
# expression of the cell, a string would be echoed back as cell output.)
collection.add(
    ids=ids,
    documents=docs,
    metadatas=metadatas,
)

### Perform search

In [46]:
# Semantic search: Chroma embeds the question and returns the closest stored
# documents. n_results is an upper bound; the recorded output holds only the
# three documents inserted so far.
collection.query(query_texts=["Who is Alan Turing?"], n_results=10)

{'ids': [['2', '1', '0']],
 'distances': [[0.4140054889208209, 0.48817410840373643, 0.871046350397331]],
 'embeddings': None,
 'metadatas': [[{'subject': 'history',
    'text': 'Born in Maida Vale, London, Turing was raised in southern England.'},
   {'subject': 'history',
    'text': 'Alan Turing was the first person to conduct substantial research in AI.'},
   {'subject': 'history',
    'text': 'Artificial intelligence was founded as an academic discipline in 1956.'}]],
 'documents': [['Born in Maida Vale, London, Turing was raised in southern England.',
   'Alan Turing was the first person to conduct substantial research in AI.',
   'Artificial intelligence was founded as an academic discipline in 1956.']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

### Perform filtering

In [47]:
# Second batch of documents, tagged with a different subject so that the
# metadata filters below have something to discriminate on.
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]

# Ids continue after the three history documents ("0".."2"), i.e. "3".."5".
ids = [str(position) for position, _ in enumerate(docs, start=3)]
metadatas = [dict(subject='biology', text=entry) for entry in docs]

# As before, the engine embeds these documents with the collection's embedding function.
collection.add(ids=ids, documents=docs, metadatas=metadatas)

Perform filtering inside vector search (a metadata `where` filter combined with a similarity query)

In [48]:
# Vector search restricted to records whose metadata has subject == 'biology'.
# NOTE(review): the query text contains the typo 'realted'; it is kept verbatim
# because changing it would change the embedding and the recorded distances.
collection.query(
    query_texts=['tell me AI realted information'],
    where=dict(subject='biology'),
    n_results=10,
)

{'ids': [['4', '3', '5']],
 'distances': [[0.7945149957077619, 0.8367281867910206, 0.9274291505861108]],
 'embeddings': None,
 'metadatas': [[{'subject': 'biology',
    'text': 'Computational synthesis with AI algorithms predicts molecular properties.'},
   {'subject': 'biology',
    'text': 'Machine learning has been used for drug design.'},
   {'subject': 'biology',
    'text': 'DDR1 is involved in cancers and fibrosis.'}]],
 'documents': [['Computational synthesis with AI algorithms predicts molecular properties.',
   'Machine learning has been used for drug design.',
   'DDR1 is involved in cancers and fibrosis.']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Perform filtering alone (metadata lookup only, no vector search)

In [49]:
# Metadata-only lookup: no query text is given, just the subject filter.
collection.get(where=dict(subject='biology'))

{'ids': ['3', '4', '5'],
 'embeddings': None,
 'metadatas': [{'subject': 'biology',
   'text': 'Machine learning has been used for drug design.'},
  {'subject': 'biology',
   'text': 'Computational synthesis with AI algorithms predicts molecular properties.'},
  {'subject': 'biology', 'text': 'DDR1 is involved in cancers and fibrosis.'}],
 'documents': ['Machine learning has been used for drug design.',
  'Computational synthesis with AI algorithms predicts molecular properties.',
  'DDR1 is involved in cancers and fibrosis.'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

In [50]:
# No filter: list every record currently stored in the collection
# (both the history and the biology documents).
collection.get()

{'ids': ['0', '1', '2', '3', '4', '5'],
 'embeddings': None,
 'metadatas': [{'subject': 'history',
   'text': 'Artificial intelligence was founded as an academic discipline in 1956.'},
  {'subject': 'history',
   'text': 'Alan Turing was the first person to conduct substantial research in AI.'},
  {'subject': 'history',
   'text': 'Born in Maida Vale, London, Turing was raised in southern England.'},
  {'subject': 'biology',
   'text': 'Machine learning has been used for drug design.'},
  {'subject': 'biology',
   'text': 'Computational synthesis with AI algorithms predicts molecular properties.'},
  {'subject': 'biology', 'text': 'DDR1 is involved in cancers and fibrosis.'}],
 'documents': ['Artificial intelligence was founded as an academic discipline in 1956.',
  'Alan Turing was the first person to conduct substantial research in AI.',
  'Born in Maida Vale, London, Turing was raised in southern England.',
  'Machine learning has been used for drug design.',
  'Computational synthe

### Delete data

In [51]:
# Remove two records by id (both are 'history' documents inserted earlier).
collection.delete(ids=['0', '2'])

In [52]:
# Verify the deletion: ids '0' and '2' should no longer be listed.
collection.get()

{'ids': ['1', '3', '4', '5'],
 'embeddings': None,
 'metadatas': [{'subject': 'history',
   'text': 'Alan Turing was the first person to conduct substantial research in AI.'},
  {'subject': 'biology',
   'text': 'Machine learning has been used for drug design.'},
  {'subject': 'biology',
   'text': 'Computational synthesis with AI algorithms predicts molecular properties.'},
  {'subject': 'biology', 'text': 'DDR1 is involved in cancers and fibrosis.'}],
 'documents': ['Alan Turing was the first person to conduct substantial research in AI.',
  'Machine learning has been used for drug design.',
  'Computational synthesis with AI algorithms predicts molecular properties.',
  'DDR1 is involved in cancers and fibrosis.'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

### Reconnect

In [53]:
# Unbind the client name so the next cells build a new client from scratch.
# NOTE(review): `del` only removes the name; whether the underlying HTTP
# session is closed is not visible from this notebook.
del client

In [54]:
# Create a new client for the same Chroma server (localhost:8000).
client = chromadb.HttpClient(
    host='localhost',
    port=8000,
)

In [55]:
# Re-attach to the existing collection. The embedding function is passed again
# on purpose: in Chroma versions that do not persist it with the collection, a
# handle obtained without it falls back to the default embedder, so any
# query_texts would be embedded with a different model than the stored documents.
collection = client.get_collection(
    name='my_collection',
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/paraphrase-albert-small-v2"
    ),
)

In [56]:
# List the records through the new client; the recorded output shows the same
# four records as before the reconnect.
collection.get()

{'ids': ['1', '3', '4', '5'],
 'embeddings': None,
 'metadatas': [{'subject': 'history',
   'text': 'Alan Turing was the first person to conduct substantial research in AI.'},
  {'subject': 'biology',
   'text': 'Machine learning has been used for drug design.'},
  {'subject': 'biology',
   'text': 'Computational synthesis with AI algorithms predicts molecular properties.'},
  {'subject': 'biology', 'text': 'DDR1 is involved in cancers and fibrosis.'}],
 'documents': ['Alan Turing was the first person to conduct substantial research in AI.',
  'Machine learning has been used for drug design.',
  'Computational synthesis with AI algorithms predicts molecular properties.',
  'DDR1 is involved in cancers and fibrosis.'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

### Drop collection

In [30]:
# Clean up: remove 'my_collection' from the server so that a later run of this
# notebook can create it again.
client.delete_collection(name='my_collection')