In [17]:
import h5py
import glob
import os
import numpy as np
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.stats import median_abs_deviation
import seaborn as sns
import umap
import anndata
import harmonypy as hm
import scanpy as sc

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt

In [None]:
# BICAN_help.py
def seurat_onestep_clust(adata, n_neighbors=None, resolution=0.3):
    print("Running normalization and clustering...")
    sc.pp.normalize_total(adata, target_sum=1e6)
    sc.pp.log1p(adata)
#     sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor='seurat_v3')
    adata = adata[:, adata.var.highly_variable]
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack')
    n_pcs = significant_pc_test(adata.obsm['X_pca'])
    
    if n_neighbors is None:
        n_neighbors = 15 if adata.n_obs < 200000 else (25 if adata.n_obs < 500000 else 50)
    print(f"Using {n_neighbors} neighbors and {n_pcs} PCs.")
    
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, metric='cosine')
    sc.tl.umap(adata, n_components=2)
    sc.tl.leiden(adata, resolution=resolution)
    return adata

def significant_pc_test(pcs, p_cutoff=0.1, min_pc=15):
    from scipy.stats import ks_2samp
    n_components = pcs.shape[1]
    for i in range(n_components - 1):
        pval = ks_2samp(pcs[:, i], pcs[:, i+1]).pvalue
        if pval > p_cutoff:
            break
    selected_pcs = min(i + 1, n_components)
    if selected_pcs < min_pc:
        print(f"Only {selected_pcs} PCs passed cutoff, using {min_pc}.")
        selected_pcs = min_pc
    return selected_pcs

def select_vg(adata):
    print("Perform variable features selection using Brennecke's method...")
    norm_dat = adata.X
    
    # Filter genes expressed in at least 10 cells with count > 1
    if issparse(norm_dat):
        gene_filter = np.array((norm_dat > 1).sum(axis=0)).flatten() >= 10
    else:
        gene_filter = (norm_dat > 1).sum(axis=0) >= 10
    filt_genes = adata.var_names[gene_filter]

    norm_count = adata[:, filt_genes].X
    if issparse(norm_count):
        norm_count.data = np.expm1(norm_count.data)
    else:
        norm_count = np.expm1(norm_count)

    # Placeholder for find_vg — Brennecke's method needs to be implemented here:
    vg = find_vg(norm_count, adata.var.loc[filt_genes])

    # Select significant variable genes
    sg = vg.loc[(vg['loess.padj'] < 0.5) | (vg['dispersion'] > 3), 'gene']
    print(f"Number of variable genes calculated by hicat method: {len(sg)}")

    # Keep top 3000 if more than 3000 genes
    if len(sg) > 3000:
        sg = sg.sort_values(['loess.padj', 'z'], ascending=[True, False]).head(3000)

    # Set variable genes in AnnData
    adata.var['highly_variable'] = adata.var_names.isin(sg)
    return adata

In [None]:
from scipy.stats import norm
from scipy.sparse import issparse
from statsmodels.nonparametric.smoothers_lowess import lowess

def rescale_samples(data):
    """Rescale each sample (column) based on its total counts relative to median total."""
    sample_totals = np.array(data.sum(axis=0)).flatten() if issparse(data) else data.sum(axis=0)
    scaling_factors = sample_totals / np.median(sample_totals)
    return data / scaling_factors

def gene_means(data):
    """Calculate row-wise means (genes x samples)."""
    return np.array(data.mean(axis=1)).flatten() if issparse(data) else data.mean(axis=1)

def gene_vars(data, means=None):
    """Calculate variance using mean of squares minus square of mean."""
    if means is None:
        means = gene_means(data)
    squared_data = data.multiply(data) if issparse(data) else data ** 2
    mean_squared = np.array(squared_data.mean(axis=1)).flatten() if issparse(squared_data) else squared_data.mean(axis=1)
    return mean_squared - means ** 2

def gene_dispersion(vars_, means):
    """Calculate dispersion (log10(variance / mean))."""
    return np.log10(vars_ / means)

def gene_z(dispersions):
    """Calculate z-scores based on interquartile range (25%-75%)."""
    iqr = np.percentile(dispersions[~np.isnan(dispersions)], [25, 75])
    mean_iqr = np.mean(iqr)
    delta = (iqr[1] - iqr[0]) / (norm.ppf(0.75) - norm.ppf(0.25))
    return (dispersions - mean_iqr) / delta

def gene_loess_fit(means, dispersions):
    """Fit Loess smoothing to log10(means) vs dispersions."""
    valid = (dispersions > 0) & (~np.isnan(dispersions))
    loess_fit = lowess(dispersions[valid], np.log10(means[valid]), frac=0.3, return_sorted=True)
    return loess_fit

def gene_loess_fit_z(loess_fit, means, dispersions):
    """Calculate z-scores of residuals from Loess fit."""
    predicted = np.interp(np.log10(means), loess_fit[:, 0], loess_fit[:, 1])
    residuals = dispersions - predicted
    return gene_z(residuals)

def find_vg(data, rescaled=False):
    """Main function: Brennecke's method for identifying variable genes."""
    if rescaled:
        data = rescale_samples(data)
    
    means = gene_means(data)
    vars_ = gene_vars(data, means)
    dispersions = gene_dispersion(vars_, means)
    z_scores = gene_z(dispersions)
    
    loess_fit = gene_loess_fit(means, dispersions)
    loess_z = gene_loess_fit_z(loess_fit, means, dispersions)
    
    pvals = 1 - norm.cdf(z_scores)
    padj = p_adjust_fdr(pvals)
    loess_pvals = 1 - norm.cdf(loess_z)
    loess_padj = p_adjust_fdr(loess_pvals)
    
    result = pd.DataFrame({
        'gene': np.arange(data.shape[0]),
        'g.means': means,
        'g.vars': vars_,
        'dispersion': dispersions,
        'z': z_scores,
        'pval': pvals,
        'padj': padj,
        'loess.z': loess_z,
        'loess.pval': loess_pvals,
        'loess.padj': loess_padj
    })
    
    return result

def p_adjust_fdr(pvals):
    """Benjamini-Hochberg FDR adjustment."""
    pvals = np.array(pvals)
    n = len(pvals)
    order = np.argsort(pvals)
    ranked = np.empty_like(order)
    ranked[order] = np.arange(1, n + 1)
    adj_pvals = pvals * n / ranked
    adj_pvals[adj_pvals > 1] = 1
    return adj_pvals


## Load all cells without clustering

In [18]:
mmg1 = sc.read('02.RNA/MiniAtlas_RNA_single_channel_clean_250423.h5ad')
mmg1

AnnData object with n_obs × n_vars = 528093 × 38369
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.ribo', 'doublet_score', 'dna', 'rna', 'region', 'structure', 'donor', 'target', 'rna_bc', 'atac_bc', 'region.color', 'region.name', 'structure.color'
    var: 'names'

In [19]:
mmg2 = sc.read('02.RNA/MiniAtlas_RNA_multiplex_clean_250423.h5ad')
mmg2

AnnData object with n_obs × n_vars = 406155 × 38369
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.ribo', 'doublet_score', 'dna', 'rna', 'region', 'structure', 'donor', 'target', 'rna_bc', 'atac_bc', 'index', 'region.color', 'region.name', 'structure.color'
    var: 'names'

In [21]:
import gzip
def parse_gtf(gtf_file):
    genes = []
    open_func = gzip.open if gtf_file.endswith('.gz') else open
    with open_func(gtf_file, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields[2] == 'gene':
                info = fields[8]
                gene_id = extract_field(info, 'gene_id')
                gene_name = extract_field(info, 'gene_name')
                genes.append((gene_id, gene_name))
    return pd.DataFrame(genes, columns=['ensembl_id', 'gene_symbol'])

def extract_field(info_str, key):
    for field in info_str.strip().split(';'):
        if key in field:
            return field.split('"')[1]
    return None

mdf = parse_gtf('/projects/ps-renlab2/y2xie/ps-renlab/y2xie/projects/genome_ref/refdata-cellranger-GRCh38.p13_v43/GRCh38/genes/genes.gtf.gz')
mdf.head()

Unnamed: 0,ensembl_id,gene_symbol
0,ENSG00000290825,DDX11L2
1,ENSG00000243485,MIR1302-2HG
2,ENSG00000237613,FAM138A
3,ENSG00000290826,ENSG00000290826
4,ENSG00000186092,OR4F5


In [22]:
mmg = anndata.concat([mmg1, mmg2])
mmg.var['gene_symbol'] = mmg.var_names
bgene = mmg.var.merge(mdf, left_on='gene_symbol', right_on='gene_symbol')['ensembl_id'].isna()
print(bgene.sum())

0


In [8]:
mmg.var = mmg.var.merge(mdf,left_on='gene_symbol', right_on='gene_symbol')
mmg.var_names = mmg.var['ensembl_id']
mmg

AnnData object with n_obs × n_vars = 887233 × 38369
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.ribo', 'doublet_score', 'dna', 'rna', 'region', 'structure', 'donor', 'target', 'rna_bc', 'atac_bc', 'region.color', 'region.name', 'structure.color'
    var: 'gene_symbol', 'ensembl_id'

## Prepare data for MapMyCells

In [2]:
import copy
import json
import pathlib
import tempfile

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

In [3]:
os.environ['NUMEXPR_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

abc_data_dir = pathlib.Path('./ref/abc_atlas_data/')
scratch_dir = pathlib.Path('./analysis/04.clustering/')

In [41]:
### Download siletti2023 taxonomy
!wget https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/mapmycells/WHB-10Xv3/20240831/query_markers.n10.20240221800.json -O ./ref/abc_atlas_data/human_markers_240831.json
!wget https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/mapmycells/WHB-10Xv3/20240831/precomputed_stats.siletti.training.h5 -O ./ref/abc_atlas_data/precomputed_stats_ABC_revision_siletti_240831.h5

--2025-04-26 11:37:06--  https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/mapmycells/WHB-10Xv3/20240831/query_markers.n10.20240221800.json
Resolving allen-brain-cell-atlas.s3-us-west-2.amazonaws.com (allen-brain-cell-atlas.s3-us-west-2.amazonaws.com)... 3.5.82.182, 52.218.153.161, 3.5.78.158, ...
Connecting to allen-brain-cell-atlas.s3-us-west-2.amazonaws.com (allen-brain-cell-atlas.s3-us-west-2.amazonaws.com)|3.5.82.182|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2090976 (2.0M) [application/json]
Saving to: ‘/projects/ps-renlab2/y2xie/projects/BICAN/ref/abc_atlas_data/human_markers_240831.json’


2025-04-26 11:37:06 (6.02 MB/s) - ‘/projects/ps-renlab2/y2xie/projects/BICAN/ref/abc_atlas_data/human_markers_240831.json’ saved [2090976/2090976]

--2025-04-26 11:37:08--  https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/mapmycells/WHB-10Xv3/20240831/precomputed_stats.siletti.training.h5
Resolving allen-brain-cell-atlas.s3-us-west-2.amazonaws.c

## Map to taxonomy from matched brain regions

In [4]:
from cell_type_mapper.cli.from_specified_markers import (
    FromSpecifiedMarkersRunner
)

In [25]:
abc_cache = AbcProjectCache.from_cache_dir(abc_data_dir)
abc_cache.load_latest_manifest()

In [26]:
taxonomy_df = abc_cache.get_metadata_dataframe(
    directory='WHB-taxonomy',
    file_name='cluster_to_cluster_annotation_membership')

alias_to_truth = dict()
for cell in taxonomy_df.to_dict(orient='records'):
    alias = cell['cluster_alias']
    level = cell['cluster_annotation_term_set_label']
    node = cell['cluster_annotation_term_label']
    if alias not in alias_to_truth:
        alias_to_truth[alias] = dict()
    alias_to_truth[alias][level] = node

In [27]:
len(np.unique(taxonomy_df['cluster_annotation_term_label']))

3825

In [29]:
qregion = "Human " + np.unique(mmg.obs.region)
print(len(qregion))

# abc_cache.list_metadata_files(directory='WHB-10Xv3')
cell_metadata = abc_cache.get_metadata_dataframe(
    directory='WHB-10Xv3',
    file_name='cell_metadata'
)
print(len(np.intersect1d(qregion, np.unique(cell_metadata['region_of_interest_label']))))

18
18


In [30]:
### cluster to retain:
ctk = cell_metadata.loc[cell_metadata['region_of_interest_label'].isin(qregion)]
fctk = ctk[ctk['cluster_alias'].groupby(ctk['cluster_alias']).transform('count') > 0.0005 *ctk.shape[0]] ### 0.05% is very rare
print(len(np.unique(fctk['cluster_alias'])))

451


In [36]:
# create a dict mapping cluster_alias to taxons in the cell type taxonomy
alias_to_truth = dict()
for cell in taxonomy_df.to_dict(orient='records'):
    alias = cell['cluster_alias']
    level = cell['cluster_annotation_term_set_label']
    node = cell['cluster_annotation_term_label']
    if alias not in alias_to_truth:
        alias_to_truth[alias] = dict()
    alias_to_truth[alias][level] = node

In [37]:
valid_classes = set(
    [ ### what level to drop? 
        alias_to_truth[cl]['CCN202210140_SUPC']
        for cl in np.unique(fctk['cluster_alias'])
    ]
)
classes_to_drop = list(
    set(
        [alias_to_truth[cl]['CCN202210140_SUPC']
         for cl in alias_to_truth
         if alias_to_truth[cl]['CCN202210140_SUPC'] not in valid_classes]
    )
)

nodes_to_drop = [('supercluster', cl) for cl in classes_to_drop]
len(nodes_to_drop)

14

In [38]:
json_output_path = scratch_dir / 'json_mapping_output_subset_250426.json'
csv_output_path = scratch_dir / 'csv_mapping_output_subset_250426.csv'

mapping_config = {
    'query_path': str(test_h5ad_path),
    'extended_result_path': str(json_output_path),
    'csv_result_path': str(csv_output_path),
    'tmp_dir': str(scratch_dir),
    'max_gb': 16,
    'cloud_safe': False,
    'type_assignment': {
        'normalization': 'raw',
        'n_processors': 16,
        'chunk_size': 10000,
        'bootstrap_iteration': 100,
        'bootstrap_factor': 0.5,
        'rng_seed': 233211
    },
    'precomputed_stats': {
        'path': str(precompute_combined_path)
    },
    'query_markers': {
        'serialized_lookup': str(query_marker_path)
    },
    'nodes_to_drop': nodes_to_drop,
    'drop_level': 'CCN202210140_SUPC'
}

mapping_runner = FromSpecifiedMarkersRunner(
    args=[],
    input_data=mapping_config
)

mapping_runner.run()

=== Running Hierarchical Mapping 1.5.1 with config ===
{
  "extended_result_dir": null,
  "query_path": "02.RNA/MiniAtlas_RNA_merged_clean_250423.h5ad",
  "verbose_csv": false,
  "precomputed_stats": {
    "log_level": "ERROR",
    "path": "/projects/ps-renlab2/y2xie/projects/BICAN/ref/abc_atlas_data/precomputed_stats_ABC_revision_siletti_240831.h5"
  },
  "log_path": null,
  "max_gb": 16.0,
  "obsm_clobber": false,
  "drop_level": "CCN202210140_SUPC",
  "log_level": "ERROR",
  "map_to_ensembl": false,
  "tmp_dir": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering",
  "extended_result_path": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering/json_mapping_output_subset_250426.json",
  "csv_result_path": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering/csv_mapping_output_subset_250426.csv",
  "query_markers": {
    "collapse_markers": false,
    "log_level": "ERROR",
    "serialized_lookup": "/projects/ps-renlab2/y2xie/projects/BICAN/ref/ab



BENCHMARK: spent 4.1329e-01 seconds creating query marker cache
Running CPU implementation of type assignment.
260000 of 934248 cells in 7.16e+00 min; predict 1.86e+01 min of 2.57e+01 min left
390000 of 934248 cells in 1.02e+01 min; predict 1.42e+01 min of 2.44e+01 min left
520000 of 934248 cells in 1.31e+01 min; predict 1.04e+01 min of 2.36e+01 min left
640000 of 934248 cells in 1.61e+01 min; predict 7.39e+00 min of 2.35e+01 min left
760000 of 934248 cells in 1.89e+01 min; predict 4.34e+00 min of 2.33e+01 min left
880000 of 934248 cells in 2.17e+01 min; predict 1.34e+00 min of 2.30e+01 min left
BENCHMARK: spent 1.5486e+03 seconds assigning cell types
Writing marker genes to output file
FILE TRACKER: cleaning up ../file_tracker_bkq_i0km
MAPPING FROM SPECIFIED MARKERS RAN SUCCESSFULLY
CLEANING UP


In [None]:
cell_metadata.head()

In [None]:
### l2 annotation?
ffile = os.listdir('02.RNA/02.l2_clustering/MapMyCell_l2/')
ffile = [f for f in ffile if f.endswith('.cell')]
for name in ffile:
    fname = name.removesuffix(".qs.cell")
    cells = pd.read_csv(f'02.RNA/02.l2_clustering/MapMyCell_l2/{name}', sep = '\t', names = ['bc'])
    mmg[cells.bc,:].write_h5ad(f'02.RNA/02.l2_clustering/MapMyCell_l2/{fname}.h5ad')
    
    qregion = "Human " + np.unique(mmg[cells.bc,:].obs.region)
    ctk = cell_metadata.loc[cell_metadata['region_of_interest_label'].isin(qregion)]
    fctk = ctk[ctk['cluster_alias'].groupby(ctk['cluster_alias']).transform('count') > 0.0005 *ctk.shape[0]] ### 0.05% is very rare
    valid_classes = set(
    [ ### what level to drop? 
        alias_to_truth[cl]['CCN202210140_SUPC']
        for cl in np.unique(fctk['cluster_alias'])
    ]
    )
    classes_to_drop = list(
        set(
            [alias_to_truth[cl]['CCN202210140_SUPC']
             for cl in alias_to_truth
             if alias_to_truth[cl]['CCN202210140_SUPC'] not in valid_classes]
        )
    )

    nodes_to_drop = [('supercluster', cl) for cl in classes_to_drop]

    
    json_output_path = scratch_dir / f'02.RNA/02.l2_clustering/MapMyCell_l2/prediction/{fname}_json_mapping_output_subset_250426.json'
    csv_output_path = scratch_dir / f'02.RNA/02.l2_clustering/MapMyCell_l2/prediction/{fname}_csv_mapping_output_subset_250426.csv'
    mapping_config = {
    'query_path': str(test_h5ad_path),
    'extended_result_path': str(json_output_path),
    'csv_result_path': str(csv_output_path),
    'tmp_dir': str(scratch_dir),
    'max_gb': 16,
    'cloud_safe': False,
    'type_assignment': {
        'normalization': 'raw',
        'n_processors': 16,
        'chunk_size': 10000,
        'bootstrap_iteration': 100,
        'bootstrap_factor': 0.5,
        'rng_seed': 233211
    },
    'precomputed_stats': {
        'path': str(precompute_combined_path)
    },
    'query_markers': {
        'serialized_lookup': str(query_marker_path)
    },
    'nodes_to_drop': nodes_to_drop,
    'drop_level': 'CCN202210140_SUPC'
    }

    mapping_runner = FromSpecifiedMarkersRunner(args=[],input_data=mapping_config)
    mapping_runner.run()

=== Running Hierarchical Mapping 1.5.1 with config ===
{
  "extended_result_dir": null,
  "query_path": "02.RNA/MiniAtlas_RNA_merged_clean_250423.h5ad",
  "verbose_csv": false,
  "precomputed_stats": {
    "log_level": "ERROR",
    "path": "/projects/ps-renlab2/y2xie/projects/BICAN/ref/abc_atlas_data/precomputed_stats_ABC_revision_siletti_240831.h5"
  },
  "log_path": null,
  "max_gb": 16.0,
  "obsm_clobber": false,
  "drop_level": "CCN202210140_SUPC",
  "log_level": "ERROR",
  "map_to_ensembl": false,
  "tmp_dir": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering",
  "extended_result_path": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering/02.RNA/02.l2_clustering/MapMyCell_l2/prediction/MiniAtlas_RNA_merged_clean_250426_LAMP5-LHX6_and_Chandelier_json_mapping_output_subset_250426.json",
  "csv_result_path": "/projects/ps-renlab2/y2xie/projects/BICAN/analysis/04.clustering/02.RNA/02.l2_clustering/MapMyCell_l2/prediction/MiniAtlas_RNA_merged_clean_250426_L



BENCHMARK: spent 3.0030e-01 seconds creating query marker cache
Running CPU implementation of type assignment.
250000 of 934248 cells in 6.73e+00 min; predict 1.84e+01 min of 2.51e+01 min left
380000 of 934248 cells in 9.59e+00 min; predict 1.40e+01 min of 2.36e+01 min left
510000 of 934248 cells in 1.24e+01 min; predict 1.03e+01 min of 2.27e+01 min left
630000 of 934248 cells in 1.52e+01 min; predict 7.36e+00 min of 2.26e+01 min left
750000 of 934248 cells in 1.79e+01 min; predict 4.39e+00 min of 2.22e+01 min left
870000 of 934248 cells in 2.05e+01 min; predict 1.51e+00 min of 2.20e+01 min left
BENCHMARK: spent 1.4761e+03 seconds assigning cell types
Writing marker genes to output file
FILE TRACKER: cleaning up ../file_tracker_jygwrspv
MAPPING FROM SPECIFIED MARKERS RAN SUCCESSFULLY
CLEANING UP
=== Running Hierarchical Mapping 1.5.1 with config ===
{
  "extended_result_dir": null,
  "query_path": "02.RNA/MiniAtlas_RNA_merged_clean_250423.h5ad",
  "verbose_csv": false,
  "precomputed_s



BENCHMARK: spent 3.5393e-01 seconds creating query marker cache
Running CPU implementation of type assignment.
250000 of 934248 cells in 6.66e+00 min; predict 1.82e+01 min of 2.49e+01 min left
380000 of 934248 cells in 9.52e+00 min; predict 1.39e+01 min of 2.34e+01 min left
510000 of 934248 cells in 1.24e+01 min; predict 1.03e+01 min of 2.27e+01 min left
640000 of 934248 cells in 1.52e+01 min; predict 7.01e+00 min of 2.23e+01 min left
760000 of 934248 cells in 1.81e+01 min; predict 4.16e+00 min of 2.23e+01 min left
890000 of 934248 cells in 2.07e+01 min; predict 1.03e+00 min of 2.18e+01 min left
BENCHMARK: spent 1.4798e+03 seconds assigning cell types
Writing marker genes to output file
FILE TRACKER: cleaning up ../file_tracker_2k87i2n_
MAPPING FROM SPECIFIED MARKERS RAN SUCCESSFULLY
CLEANING UP
=== Running Hierarchical Mapping 1.5.1 with config ===
{
  "extended_result_dir": null,
  "query_path": "02.RNA/MiniAtlas_RNA_merged_clean_250423.h5ad",
  "verbose_csv": false,
  "precomputed_s



BENCHMARK: spent 3.5667e-01 seconds creating query marker cache
Running CPU implementation of type assignment.
260000 of 934248 cells in 6.80e+00 min; predict 1.76e+01 min of 2.44e+01 min left
390000 of 934248 cells in 9.73e+00 min; predict 1.36e+01 min of 2.33e+01 min left
520000 of 934248 cells in 1.27e+01 min; predict 1.01e+01 min of 2.29e+01 min left
650000 of 934248 cells in 1.57e+01 min; predict 6.85e+00 min of 2.25e+01 min left
770000 of 934248 cells in 1.86e+01 min; predict 3.96e+00 min of 2.25e+01 min left
890000 of 934248 cells in 2.12e+01 min; predict 1.05e+00 min of 2.22e+01 min left
BENCHMARK: spent 1.4953e+03 seconds assigning cell types
Writing marker genes to output file
FILE TRACKER: cleaning up ../file_tracker_mjupjc64
MAPPING FROM SPECIFIED MARKERS RAN SUCCESSFULLY
CLEANING UP
=== Running Hierarchical Mapping 1.5.1 with config ===
{
  "extended_result_dir": null,
  "query_path": "02.RNA/MiniAtlas_RNA_merged_clean_250423.h5ad",
  "verbose_csv": false,
  "precomputed_s



BENCHMARK: spent 3.0993e-01 seconds creating query marker cache
