# Cell similarity example 

In [1]:
import numpy as np
import random
import tiledb
import tiledbsoma

from tiledb.vector_search.object_api import object_index
from tiledb.vector_search.object_readers import SomaAnnDataReader
from tiledb.vector_search.embeddings import SomaScGPTEmbedding

# `import tiledb` alone does not guarantee that the `tiledb.cloud` submodule is
# loaded; import it explicitly instead of relying on another import's side effect.
import tiledb.cloud

# Location where the vector index is created (wiped and recreated below).
index_uri = "/tmp/soma_demo/index"
# SOMA experiment that the cells are read from.
soma_uri = "tiledb://TileDB-Inc/tabula-sapiens-immune"
# Model identifier handed to SomaScGPTEmbedding.
model_uri = "soma/scGPT_human"
# TileDB configuration as a plain dict; passed to the VFS and to every index call.
config = tiledb.cloud.Config().dict()


# Create vector search index

In [None]:
# Start from an empty directory: drop any index left over from a previous run,
# then recreate the (now empty) target directory.
vfs = tiledb.VFS(config=config)
index_dir_exists = vfs.is_dir(index_uri)
if index_dir_exists:
    vfs.remove_dir(index_uri)
vfs.create_dir(index_uri)

# Reader over the SOMA experiment at `soma_uri`, restricted to cells whose obs
# column `organ_tissue` equals 'Liver'.
# NOTE(review): `cells_per_partition` and `max_size` presumably bound the batch
# size and the total number of cells read — confirm against the reader's docs.
soma_reader = SomaAnnDataReader(
    uri=soma_uri,
    X_name="data",
    measurement_name="RNA",
    obs_value_filter="organ_tissue == 'Liver'",
    max_size=1000,
    cells_per_partition=1000,
)

# Embedding backed by the scGPT human model referenced by `model_uri`.
cell_embedding = SomaScGPTEmbedding(gene_col="var_id", model_uri=model_uri)

# Create an (initially empty) IVF_FLAT object index at `index_uri`, wired to the
# reader and embedding defined above.
index = object_index.create(
    config=config,
    uri=index_uri,
    index_type="IVF_FLAT",
    embedding=cell_embedding,
    object_reader=soma_reader,
)

# Populate the index: embed the cells and add the resulting vectors.
# A single worker with one task per stage keeps the demo run small.
index.update_index(config=config, workers=1, max_tasks_per_stage=1)

# Query

Open the vector index

In [3]:
# Open the index stored at `index_uri` with the same TileDB configuration.
index = object_index.ObjectIndex(
    config=config,
    uri=index_uri,
)

Pick a random cell from the dataset to use as the query, and display its metadata.

In [10]:
import pandas as pd
def tiledb_to_pandas(obs, columns=None):
    """Build a DataFrame from the first entry of each selected column of `obs`.

    Args:
        obs: Mapping from column name to an indexable whose element ``[0]``
            holds that column's values (here, the metadata returned by
            ``index.query`` for the first, and only, query).
        columns: Names of the columns to include, in output order. When
            omitted, the notebook-level ``column_names`` list is used, which
            keeps existing ``tiledb_to_pandas(obs)`` calls working.

    Returns:
        A ``pandas.DataFrame`` with one column per requested name.

    Raises:
        KeyError: If a requested column is missing from `obs`.
    """
    if columns is None:
        # Fall back to the notebook global for backward compatibility.
        columns = column_names
    return pd.DataFrame(data={name: obs[name][0] for name in columns})

# obs columns shown for the query cell and for every result table below.
column_names = ["soma_joinid", "cell_ontology_class", "organ_tissue", "n_genes"]

# Draw a random ID in [0, 2000] and load that cell through the index's reader.
rid = random.randint(0, 2000)
query_ids = np.array([rid])
query_cell = index.object_reader.read_objects_by_external_ids(query_ids)

# Fetch the same cell's obs row so its metadata can be compared with the results.
obs_read = index.object_reader.exp.obs.read(
    coords=[rid],
    column_names=column_names,
)
query_metadata = obs_read.concat().to_pandas()
query_metadata

Unnamed: 0,soma_joinid,cell_ontology_class,organ_tissue,n_genes
0,218,macrophage,Liver,1387


## Similarity query

In [11]:
# Ask for the 10 nearest results to the query cell. With return_objects=False
# the result is unpacked as distances, cells, and a metadata mapping.
query_options = {"k": 10, "nprobe": 2, "return_objects": False}
distances, cells, obs = index.query(query_cell, **query_options)

# Show the metadata of the returned cells as a table.
tiledb_to_pandas(obs)


Embedding cells: 100%|██████████| 1/1 [00:31<00:00, 31.94s/it]


Unnamed: 0,soma_joinid,cell_ontology_class,organ_tissue,n_genes
0,218,macrophage,Liver,1387
1,183,macrophage,Liver,1883
2,141,macrophage,Liver,1484
3,359,macrophage,Liver,1701
4,585,macrophage,Liver,1548
5,999,macrophage,Liver,1560
6,204,macrophage,Liver,1377
7,815,monocyte,Liver,1525
8,652,macrophage,Liver,1737
9,237,macrophage,Liver,1584


## Similarity query with structured restrict

Find similar cells that also pass a structured filter on their metadata (here, a similar number of genes to the query cell).

In [12]:
# Only retrieve cells whose gene count is within roughly ±10% of the query cell's.
N_GENES_RATIO_MIN = 0.9
N_GENES_RATIO_MAX = 1.1
query_n_genes = query_metadata["n_genes"].iloc[0]


def n_genes_filter_fn(row):
    """Return True when `row` has a gene count close to the query cell's.

    The ratio ``row["n_genes"] / query_n_genes`` must lie strictly between
    ``N_GENES_RATIO_MIN`` and ``N_GENES_RATIO_MAX``.

    Args:
        row: Mapping-like record exposing an ``"n_genes"`` entry.
            NOTE(review): presumably one row of the index's metadata
            dataframe — confirm against ``metadata_df_filter_fn``'s contract.

    Returns:
        bool: Whether the row passes the gene-count filter.
    """
    return N_GENES_RATIO_MIN < row["n_genes"] / query_n_genes < N_GENES_RATIO_MAX


# Same similarity query as before, restricted to rows accepted by the filter.
distances, cells, obs = index.query(
    query_cell,
    metadata_df_filter_fn=n_genes_filter_fn,
    k=10,
    nprobe=2,
    return_objects=False,
)
tiledb_to_pandas(obs)

Embedding cells: 100%|██████████| 1/1 [00:31<00:00, 31.59s/it]


Unnamed: 0,soma_joinid,cell_ontology_class,organ_tissue,n_genes
0,218,macrophage,Liver,1387
1,141,macrophage,Liver,1484
2,204,macrophage,Liver,1377
3,555,macrophage,Liver,1374
4,815,monocyte,Liver,1525
5,344,macrophage,Liver,1374
6,340,macrophage,Liver,1269
7,243,macrophage,Liver,1303
8,955,macrophage,Liver,1472
9,233,macrophage,Liver,1445
