In [1]:
from utils import *

import cellxgene_census
import cellxgene_census.experimental.ml as census_ml
import somacore as soma
from somacore import AxisQuery, ExperimentAxisQuery
import tiledbsoma
from tiledbsoma import Experiment, Measurement

In [2]:
# Pin the Census release so that re-running this notebook reads the same data.
# Without `census_version`, `open_soma()` follows the moving "stable" alias, which
# resolved to 2023-12-15 when this notebook was last executed.
census = cellxgene_census.open_soma(census_version="2023-12-15")

# Collection whose datasets are benchmarked. The id is hard-coded; the commented-out
# line below shows the lookup it replaces.
#reference = ln.Collection.filter(uid="1gsdckxvOvIjQgeDVS1F").one().reference
reference = '283d65eb-dd53-496d-adb7-7570c7caa443'
query_collection_id = f"collection_id == '{reference}'"

# Ids of every dataset in `census_info/datasets` whose collection_id matches `reference`.
datasets = (
    census["census_info"]["datasets"]
    .read(column_names=["dataset_id"], value_filter=query_collection_id)
    .concat().to_pandas()
)["dataset_id"].tolist()

# Preview only the first few ids rather than dumping the whole list.
datasets[:10]

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.


['8e10f1c4-8e98-41e5-b65f-8cd89a887122',
 'b165f033-9dec-468a-9248-802fc6902a74',
 'ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded',
 'fe1a73ab-a203-45fd-84e9-0f7fd19efcbd',
 'fbf173f9-f809-4d84-9b65-ae205d35b523',
 'fa554686-fc07-44dd-b2de-b726d82d26ec',
 'f9034091-2e8f-4ac6-9874-e7b7eb566824',
 'f8dda921-5fb4-4c94-a654-c6fc346bfd6d',
 'f7d003d4-40d5-4de8-858c-a9a8b48fcc67',
 'f6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181']

In [3]:
# Number of dataset ids that matched the collection filter above.
len(datasets)

138

In [4]:
def subset_census(query: ExperimentAxisQuery, output_base_dir: str) -> None:
    """
    Materialize the result of an axis query as a new SOMA Experiment on disk.

    The experiment created at ``output_base_dir`` contains:

    - ``obs``: the queried obs rows, restricted to rows whose ``soma_joinid``
      occurs as ``soma_dim_0`` in the queried ``raw`` X data;
    - ``ms/RNA/var``: the queried var rows;
    - ``ms/RNA/X/raw``: the queried ``raw`` X layer, as a sparse ND array.

    Note: the full X, obs and var results are each concatenated into a single
    in-memory Arrow table before being written, so peak memory grows with the
    size of the query result.

    Args:
        query: Axis query to read obs, var and the ``raw`` X layer from.
        output_base_dir: URI at which the new Experiment is created.

    Returns:
        None. The subset is written to ``output_base_dir``.
    """
    with Experiment.create(uri=output_base_dir) as exp_subset:
        x_data = query.X(layer_name="raw").tables().concat()

        obs_data = query.obs().concat()
        # remove obs rows with no X data
        x_soma_dim_0_unique = pa.Table.from_arrays([x_data["soma_dim_0"].unique()], names=["soma_dim_0"])
        obs_data = obs_data.join(x_soma_dim_0_unique, keys="soma_joinid", right_keys="soma_dim_0", join_type="inner")
        # Create `obs` through the experiment, the same way `var` is created below,
        # instead of a standalone DataFrame.create() + set() whose handle this
        # function never closed. With no explicit uri the child is created under
        # the experiment's own URI with the key ("obs") as its name.
        obs = exp_subset.add_new_dataframe("obs", schema=obs_data.schema)
        obs.write(obs_data)

        ms = exp_subset.add_new_collection("ms")
        rna = ms.add_new_collection("RNA", Measurement)

        var_data = query.var().concat()
        var = rna.add_new_dataframe("var", schema=var_data.schema)
        var.write(var_data)

        # `Schema.field_by_name` is deprecated in pyarrow; `Schema.field` accepts
        # a column name directly.
        x_type = x_data.schema.field("soma_data").type
        rna.add_new_collection("X")
        rna["X"].add_new_sparse_ndarray("raw", type=x_type, shape=(None, None))
        rna.X["raw"].write(x_data)

In [5]:
# Look up the `homo_sapiens` experiment under `census_data`; `download_datasets`
# below builds its axis query against this handle.
experiment = census["census_data"]["homo_sapiens"]
# Show the experiment's repr (URI and members).
experiment

<Experiment 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'ms': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms' (unopened)
    'obs': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/obs' (unopened)>

In [6]:
def download_datasets(start=None, n=None, n_vars=20000, output_root='/mnt/nvme'):
    """
    Export a contiguous slice of `datasets` from the Census into a local SOMA experiment.

    Selects ``datasets[start:start + n]``, queries the "RNA" measurement of the
    module-level `experiment` for those datasets (restricted to the first
    `n_vars` var rows), and writes the result with `subset_census` to
    ``{output_root}/census-benchmark_{start}:{end}``. An existing directory at
    that path is removed first.

    Args:
        start: Index of the first dataset to export; ``None`` (or 0) starts at
            the beginning of `datasets`.
        n: Number of datasets to export; ``None`` exports through the end of
            `datasets`.
        n_vars: Number of var rows (counted from var coordinate 0) to include.
        output_root: Directory under which the output experiment is created.

    Raises:
        ValueError: If the slice selects no datasets, or `n_vars` is less than 1.

    Note:
        ``end`` in the output path is ``start + n`` as requested; it is not
        clamped to ``len(datasets)``.
    """
    start = start or 0
    end = len(datasets) if n is None else start + n
    ds = datasets[start:end]
    if not ds:
        # Fail before the destructive rmtree below: an empty selection has
        # nothing to export and would only wipe a previous output directory.
        raise ValueError(
            f"No datasets selected by start={start}, n={n} ({len(datasets)} available)"
        )
    if n_vars < 1:
        raise ValueError(f"n_vars must be >= 1, got {n_vars}")

    ds_lines = "\n\t".join(ds)
    err(f"Downloading {len(ds)} datasets:\n\t{ds_lines}")
    # The list's Python repr (e.g. "['a', 'b']") is interpolated directly as the
    # right-hand side of the value filter's `in` expression.
    query_datasets = f'dataset_id in {ds}'
    query = experiment.axis_query(
        "RNA",
        obs_query=AxisQuery(value_filter=query_datasets),
        # SOMA coordinate slices include both endpoints, so slice(n_vars - 1)
        # selects var coordinates 0..n_vars-1, i.e. n_vars rows.
        var_query=AxisQuery(coords=(slice(n_vars - 1),)),
    )

    output_base_dir = f'{output_root}/census-benchmark_{start}:{end}'
    if exists(output_base_dir):
        err(f"Removing {output_base_dir}")
        rmtree(output_base_dir)

    subset_census(query, output_base_dir)

Downloading just the first dataset (`datasets[0]`) runs out of memory (OOM) on a worker with 128 GB of RAM; the timed run below uses `datasets[2:7]`.

In [7]:
%%time
# Export 5 datasets starting at index 2, i.e. datasets[2:7].
download_datasets(2, n=5)

Downloading 5 datasets:
	ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded
	fe1a73ab-a203-45fd-84e9-0f7fd19efcbd
	fbf173f9-f809-4d84-9b65-ae205d35b523
	fa554686-fc07-44dd-b2de-b726d82d26ec
	f9034091-2e8f-4ac6-9874-e7b7eb566824
  subset_census(query, output_base_dir)


CPU times: user 26min 9s, sys: 1min 1s, total: 27min 11s
Wall time: 2min 1s
