# Cell similarity example 

In [7]:
import numpy as np
import random
import tiledb
import tiledbsoma

from tiledb.vector_search.object_api import object_index
from tiledb.vector_search.object_readers import SomaAnnDataReader
from tiledb.vector_search.embeddings import SomaSCVIEmbedding

index_uri = "/tmp/soma_demo/index"
soma_uri = "tiledb://TileDB-Inc/soma-exp-tabula-sapiens-immune"
model_uri="https://cellxgene-contrib-public.s3.us-west-2.amazonaws.com/models/scvi/2024-02-12/homo_sapiens/model.pt"

config = tiledb.cloud.Config().dict()


# Create vector search index

In [2]:
# Clean up the index if it already exists
vfs = tiledb.VFS(config=config)
if vfs.is_dir(index_uri):
    vfs.remove_dir(index_uri)

# Index the tabula sapiens immune dataset
soma_reader = SomaAnnDataReader(
    uri=soma_uri,
    measurement_name="RNA",
    X_name="data",
    cells_per_partition=50000,
)

# Embed using scvi human model
cell_embedding = SomaSCVIEmbedding(
    model_uri=model_uri,
    gene_col="gene_id",
    embedding_dimensions=50,
)

# Create the index
index = object_index.create(
    uri=index_uri,
    index_type="IVF_FLAT",
    object_reader=soma_reader,
    embedding=cell_embedding,
    config=config,
)

# Embed cells and add them to the index 
index.update_index(
    max_tasks_per_stage=1,
    workers=1,
    config=config,
)

Global seed set to 0


[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               
[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               
[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               
[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               
[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               
[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               


# Query

Open the vector index

In [8]:
index = object_index.ObjectIndex(uri=index_uri, config=config)

Pick a random cell from the dataset

In [22]:
import pandas as pd
def tiledb_to_pandas(obs):
    new_obs = {}
    for column_name in column_names:
        new_obs[column_name] = obs[column_name][0]
    return pd.DataFrame(data=new_obs)

rid = random.randint(0,250000)

with tiledb.scope_ctx(ctx_or_config=config):
    query_cell=index.object_reader.read_objects_by_external_ids(np.array([rid]))
column_names=["soma_joinid", "cell_ontology_class", "tissue", "disease"]
query_cell['anndata'].obs[column_names]

Unnamed: 0,soma_joinid,cell_ontology_class,tissue,disease
0,130187,"cd4-positive, alpha-beta t cell",small intestine,normal


## Similarity query

In [23]:
distances, cells, obs = index.query(
    query_cell, 
    k=10, 
    nprobe=2, 
    return_objects=False
)
tiledb_to_pandas(obs)


[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               


Unnamed: 0,soma_joinid,cell_ontology_class,tissue,disease
0,130187,"cd4-positive, alpha-beta t cell",small intestine,normal
1,195340,t cell,bladder organ,normal
2,133937,"cd4-positive, alpha-beta t cell",small intestine,normal
3,130471,"cd4-positive, alpha-beta t cell",small intestine,normal
4,152943,"cd4-positive, alpha-beta t cell",bone marrow,normal
5,178448,"cd4-positive, alpha-beta memory t cell",spleen,normal
6,102741,"cd8-positive, alpha-beta memory t cell",spleen,normal
7,69848,t cell,skin of chest,normal
8,154410,t cell,adipose tissue,normal
9,134157,"cd4-positive, alpha-beta t cell",small intestine,normal


## Similarity query with structured restrict

Find similar cells that also pass a structured filter

In [24]:
with tiledb.scope_ctx(ctx_or_config=config):
    query_cell=index.object_reader.read_objects_by_external_ids(np.array([rid]))

In [25]:
# Only retrieve cells from the same tissue.
query_tissue=query_cell['anndata'].obs["tissue"].values[0]
def tissue_filter_fn(row):
    return row["tissue"] == query_tissue


distances, cells, obs = index.query(
    query_cell, 
    metadata_df_filter_fn=tissue_filter_fn, 
    k=10, 
    nprobe=2, 
    return_objects=False
)
tiledb_to_pandas(obs)

[34mINFO    [0m Found [1;36m98.725[0m% reference vars in query data.                                                               


Unnamed: 0,soma_joinid,cell_ontology_class,tissue,disease
0,130187,"cd4-positive, alpha-beta t cell",small intestine,normal
1,133937,"cd4-positive, alpha-beta t cell",small intestine,normal
2,130471,"cd4-positive, alpha-beta t cell",small intestine,normal
3,134157,"cd4-positive, alpha-beta t cell",small intestine,normal
4,129881,"cd4-positive, alpha-beta t cell",small intestine,normal
5,133286,"cd8-positive, alpha-beta t cell",small intestine,normal
6,135707,"cd4-positive, alpha-beta t cell",small intestine,normal
7,133879,"cd4-positive, alpha-beta t cell",small intestine,normal
8,135747,"cd4-positive, alpha-beta t cell",small intestine,normal
9,128924,"cd4-positive, alpha-beta t cell",small intestine,normal
