In [1]:
import somacore as soma
import time
import gc
from tqdm import tqdm

import cellxgene_census
import cellxgene_census.experimental.ml as census_ml

In [2]:
census = cellxgene_census.open_soma()

#reference = ln.Collection.filter(uid="1gsdckxvOvIjQgeDVS1F").one().reference
reference = '283d65eb-dd53-496d-adb7-7570c7caa443'
query_collection_id = f"collection_id == '{reference}'"
datasets =(census["census_info"]["datasets"]
           .read(column_names=["dataset_id"], value_filter=query_collection_id)
           .concat().to_pandas())["dataset_id"].tolist()
query_datasets = "dataset_id in " + str(datasets)

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.


In [3]:
query_datasets

"dataset_id in ['8e10f1c4-8e98-41e5-b65f-8cd89a887122', 'b165f033-9dec-468a-9248-802fc6902a74', 'ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded', 'fe1a73ab-a203-45fd-84e9-0f7fd19efcbd', 'fbf173f9-f809-4d84-9b65-ae205d35b523', 'fa554686-fc07-44dd-b2de-b726d82d26ec', 'f9034091-2e8f-4ac6-9874-e7b7eb566824', 'f8dda921-5fb4-4c94-a654-c6fc346bfd6d', 'f7d003d4-40d5-4de8-858c-a9a8b48fcc67', 'f6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181', 'f5a04dff-d394-4023-8811-65494e8bb11d', 'f502c312-05dc-4fd4-a762-92a63e92b539', 'f3565fda-499a-4d20-bd92-563c09954c42', 'f32c2c13-bb1a-4ffd-a457-60b64ecfa4cb', 'f20f44ef-a0d4-4d94-87de-037fd47141f0', 'f16f4108-7873-4035-9989-3748da1a7ff1', 'ed33c203-233a-476a-a56b-28da945fdd32', 'ed11cc3e-2947-407c-883c-c53b043917c3', 'ecd9230d-c571-4dab-abd3-8b54c74833f0', 'e8681d74-ac9e-4be5-be14-1cf1bbd54dd7', 'e6b2ce27-681b-4409-a053-2681875936e5', 'e5b1115b-a486-49bb-bda3-8261822836e0', 'e4710a02-8abc-48d5-a3e8-9ae7e9d79bdb', 'e1f595f6-ba2c-495e-9bee-7056f116b1e4', 'dd03ce70-3243-4c96-9561

In [4]:
len(query_datasets)

5534

In [5]:
census["census_data"]

<Collection 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data' (open for 'r') (2 items)
    'homo_sapiens': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens' (unopened)
    'mus_musculus': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/mus_musculus' (unopened)>

In [6]:
experiment = census["census_data"]["homo_sapiens"]
experiment

<Experiment 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'ms': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms' (unopened)
    'obs': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/obs' (unopened)>

In [7]:
experiment.ms['RNA']

<Measurement 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms/RNA' (open for 'r') (4 items)
    'obsm': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms/RNA/obsm' (unopened)
    'feature_dataset_presence_matrix': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms/RNA/feature_dataset_presence_matrix' (unopened)
    'X': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms/RNA/X' (unopened)
    'var': 's3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens/ms/RNA/var' (unopened)>

In [8]:
BATCH_SIZE = 1024

In [9]:
experiment_datapipe = census_ml.ExperimentDataPipe(
    experiment,
    measurement_name="RNA",
    X_name="raw",
    obs_query=soma.AxisQuery(value_filter=query_datasets),
    var_query=soma.AxisQuery(coords=(slice(20000-1),)),
    batch_size=BATCH_SIZE,
    shuffle=True,
    soma_chunk_size=10000,
)

loader = census_ml.experiment_dataloader(experiment_datapipe)

In [10]:
n_epochs = 1
# n_epochs = 5

In [11]:
def benchmark(loader, n_samples = None):    
    loader_iter = loader.__iter__()
    # exclude first batch from benchmark as this includes the setup time
    batch = next(loader_iter)
    
    num_iter = n_samples // BATCH_SIZE if n_samples is not None else None
    
    start_time = time.time()
    
    batch_times = []
    batch_time = time.time()
    
    total = num_iter if num_iter is not None else len(loader_iter)
    for i, batch in tqdm(enumerate(loader_iter), total=total):
        X = batch["x"] if isinstance(batch, dict) else batch[0] 
        # for pytorch DataLoader
        # Merlin sends to cuda by default
        if hasattr(X, "is_cuda") and not X.is_cuda:
            X = X.cuda()
        
        if num_iter is not None and i == num_iter:
            break
        if i % 10 == 0:
            gc.collect()
        
        batch_times.append(time.time() - batch_time)
        batch_time = time.time()
    
    execution_time = time.time() - start_time
    gc.collect()
    
    time_per_sample = (1e6 * execution_time) / (total * BATCH_SIZE)
    print(f'time per sample: {time_per_sample:.2f} μs')
    samples_per_sec = total * BATCH_SIZE / execution_time
    print(f'samples per sec: {samples_per_sec:.2f} samples/sec')
    
    return samples_per_sec, time_per_sample, batch_times

In [12]:
%%time
experiment_datapipe.shape

CPU times: user 10 s, sys: 5.58 s, total: 15.6 s
Wall time: 7.12 s


(10107657, 20000)

In [13]:
print("cellxgene_census")
for epoch in range(n_epochs):
    samples_per_sec, time_per_sample, batch_times = benchmark(loader, n_samples=experiment_datapipe.shape[0])
    results["cellxgene_census"][f"epoch_{epoch}"]["time_per_sample"] = time_per_sample
    results["cellxgene_census"][f"epoch_{epoch}"]["samples_per_sec"] = samples_per_sec
    results["cellxgene_census"][f"epoch_{epoch}"]["batch_times"] = batch_times

cellxgene_census


  0%|                                                                                                       | 0/9870 [00:00<?, ?it/s]


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
census.close()