This is a pseudocode workflow for the cross-species hemocyte analysis; for a detailed tutorial, please visit https://github.com/snap-stanford/saturn

P. japonicus

In [None]:
# QC, step 1: read the filtered 10x HDF5 matrix and make both the barcode
# (obs) and gene (var) names unique.
matrix_path = "./normal/outs/filtered_feature_bc_matrix.h5"
adata = sc.read_10x_h5(filename=matrix_path)
adata.obs_names_make_unique()
adata.var_names_make_unique()
# Names treated as mitochondrial for this sample: thirteen gene symbols
# followed by annotation entries named "unassigned_transcript_<id>".
# NOTE(review): the numeric IDs are presumably specific to the genome
# annotation used for this sample -- confirm before reusing them elsewhere.
_mt_gene_symbols = [
    "ATP6", "ATP8", "COX1", "COX2", "COX3", "CYTB",
    "ND1", "ND2", "ND3", "ND4", "ND4L", "ND5", "ND6",
]
_mt_unassigned_transcript_ids = (
    1835, 1836, 1837, 1839, 1840, 1841, 1843, 1845,
    1846, 1850, 1852, 1853, 1854, 1855, 1856, 1857,
    1859, 1862, 1863, 1866, 1868, 1869, 1870, 1871,
)
mitochondrial_genes = _mt_gene_symbols + [
    f"unassigned_transcript_{transcript_id}"
    for transcript_id in _mt_unassigned_transcript_ids
]

# Boolean column: True for every gene whose name is listed in
# mitochondrial_genes, False otherwise.
adata.var["mt"] = adata.var_names.isin(mitochondrial_genes)

from scipy.sparse import issparse
# Per-cell QC metrics stored in adata.obs:
#   nUMIs          - sum of each cell's row of adata.X
#   mito_perc      - fraction (0-1) of that sum coming from genes flagged "mt"
#   detected_genes - number of genes with a value > 0 in the cell
# The sums are taken on the matrix as stored, sparse or dense, so the full
# matrix is never converted to a dense array (the earlier version densified it
# three times in the sparse case).
import numpy as np


def _row_totals(matrix):
    """Return the per-row sums of a sparse or dense 2-D matrix as a flat 1-D array."""
    # Sparse matrices return an (n, 1) matrix from sum(axis=1); ravel() flattens
    # it and is a no-op for the already 1-D dense result.
    return np.asarray(matrix.sum(axis=1)).ravel()


adata.obs['nUMIs'] = _row_totals(adata.X)
adata.obs['mito_perc'] = _row_totals(adata[:, adata.var["mt"]].X) / adata.obs['nUMIs'].values
adata.obs['detected_genes'] = _row_totals(adata.X > 0)

import matplotlib.pyplot as plt
# Visual QC: two scatter plots against nUMIs with a dashed red line at the
# threshold drawn on each panel.
mito_filter = 0.1
n_counts_filter = 4500

fig, axs = plt.subplots(ncols=2, figsize=(8, 4))

# Left panel: mitochondrial fraction; right panel: detected genes.
for panel, metric in zip(axs, ('mito_perc', 'detected_genes')):
    sc.pl.scatter(adata, x='nUMIs', y=metric, ax=panel, show=False)

# Threshold lines span from 0 to the largest nUMIs value.
# NOTE(review): n_counts_filter is drawn on the detected_genes axis -- confirm
# that this is the intended axis for a "counts" threshold.
for panel, cutoff in zip(axs, (mito_filter, n_counts_filter)):
    panel.hlines(y=cutoff, xmin=0, xmax=max(adata.obs['nUMIs']),
                 color='red', ls='dashed')

fig.tight_layout()
plt.show()

In [None]:
# Annotation, step 1: cell filtering, normalisation / HVG selection, scaling,
# PCA, neighbour graph, UMAP and Leiden clustering.
adata = ov.pp.qc(adata, tresh={'mito_perc': 0.1, 'nUMIs': 500, 'detected_genes': 250})
adata = ov.pp.preprocess(adata, mode='shiftlog|pearson', n_HVGs=8000, target_sum=1e4)
# Slicing an AnnData returns a view; take an explicit copy so the in-place
# steps below (scale, pca, neighbors, ...) write to a real object instead of
# triggering an implicit view-to-copy conversion.
adata = adata[:, adata.var.highly_variable_features].copy()
ov.pp.scale(adata)
ov.pp.pca(adata, layer='scaled', n_pcs=50)
sc.pp.neighbors(adata, n_neighbors=15,
                n_pcs=50, use_rep='scaled|original|X_pca')
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added='leiden', resolution=0.2)
# UMAP coloured by Leiden cluster; show=False leaves rendering to the notebook.
ov.utils.embedding(adata,
                   basis='X_umap',
                   color=["leiden"],
                   title=['leiden'],
                   show=False, frameon='small',  # legend_loc='on data'
                   )
# Annotation, step 2: rank genes for every Leiden cluster (t-test) and write a
# tab-separated table with one "<cluster>_score" column per cluster, joined on
# the gene name.
sc.tl.rank_genes_groups(adata, groupby="leiden", method="t-test", key_added="rank_genes_groups")
import pandas as pd
groups = adata.uns['rank_genes_groups']['names'].dtype.names
merged_df = pd.DataFrame()
for group in groups:
    # rename() without inplace=True returns a new frame, so the column
    # selection is never modified in place (which pandas warns about as a
    # possible write to a copy of a slice).
    ranked_genes_df = (
        sc.get.rank_genes_groups_df(adata, group=group)[['names', 'scores']]
        .rename(columns={'scores': f'{group}_score'})
    )
    if merged_df.empty:
        merged_df = ranked_genes_df
    else:
        # Outer join keeps genes that are missing from some clusters.
        merged_df = pd.merge(merged_df, ranked_genes_df, on='names', how='outer')
merged_df.to_csv('./pjres0.2.txt', sep='\t')

In [None]:
# Manual annotation of the clusters, written as label -> cluster IDs and then
# flattened into the cluster ID -> label lookup used below.
_label_to_clusters = {
    'INT.pj': ('0', '4'),
    'ROS.pj': ('1', '2'),
    'Mucin+.pj': ('3',),
    'Prog.pj': ('5',),
}
cluster2annotation = {
    cluster_id: label
    for label, cluster_ids in _label_to_clusters.items()
    for cluster_id in cluster_ids
}
# Translate every cell's Leiden cluster ID into its annotated label and store
# the result as a categorical column.
# NOTE(review): Series.map yields NaN for cluster IDs that are not keys of
# cluster2annotation -- confirm that every cluster is covered.
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster2annotation).astype('category')

P. clarkii

In [None]:
# Mitochondrial gene names for the P. clarkii sample: thirteen gene symbols
# followed by the annotation entries unassigned_gene_1 ... unassigned_gene_24.
mitochondrial_genes = [
    "ATP6", "ATP8", "COX1", "COX2", "COX3", "CYTB",
    "ND1", "ND2", "ND3", "ND4", "ND4L", "ND5", "ND6",
] + [f"unassigned_gene_{number}" for number in range(1, 25)]

SATURN

In [None]:
# Random drop: subsample cells per cell type.
# Work on a copy of the per-cell metadata table so adata.obs itself is left
# untouched.
obs_df = adata.obs.copy()
import numpy as np
def subsample_clusters(obs, cluster_col, fraction=1/4, random_state=None):
    """Randomly keep a fixed fraction of the rows of every cluster.

    Parameters
    ----------
    obs : pandas.DataFrame
        Table whose index supplies the labels that are returned.
    cluster_col : str
        Column of ``obs`` holding the cluster label of each row.
    fraction : float, default 1/4
        Share of each cluster to keep (rounded down). At least one row and at
        most the whole cluster is kept.
    random_state : optional
        Seed (or anything accepted by ``numpy.random.default_rng``) for a
        reproducible draw. When omitted, NumPy's global random state is used,
        exactly as before.

    Returns
    -------
    list
        Index labels of the selected rows, cluster by cluster in order of
        first appearance. Rows whose cluster label is missing are never
        selected.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    subsampled_indices = []

    # Missing labels (NaN / None) never compare equal to anything, so they
    # would yield an empty cluster and make the draw below fail; skip them.
    for cluster in obs[cluster_col].dropna().unique():
        cluster_indices = obs.index[obs[cluster_col] == cluster].tolist()
        # Keep at least one row, but never ask for more rows than exist
        # (sampling without replacement would raise for fraction > 1).
        n_sample = min(len(cluster_indices),
                       max(1, int(len(cluster_indices) * fraction)))
        subsampled_indices.extend(rng.choice(cluster_indices, n_sample, replace=False))

    return subsampled_indices

# Draw the per-cell-type subsample (default fraction of subsample_clusters).
subsampled_indices = subsample_clusters(obs_df, 'cell_type')

# Materialise the selected cells as an independent AnnData object.
adata_subsampled = adata[subsampled_indices].copy()

In [None]:
# Run SATURN: launch the train-saturn.py script through a shell escape.
# The command uses `cell_type` as both the input and the reference label
# column, reads ./bloodacross0819/bloodrun.csv as input and uses
# ./bloodacross0819 as the working directory.
# NOTE(review): bloodrun.csv presumably lists the per-species input files, and
# --device_num / --batch_size are specific to the machine and dataset used
# here -- confirm before reusing.
!python3 train-saturn.py --device_num 2 --in_label_col cell_type --ref_label_col cell_type --in_data=./bloodacross0819/bloodrun.csv --num_macrogenes=2000 --hv_genes=8000 --batch_size 6122 --work_dir=./bloodacross0819

In [None]:
# List the files in the saturn_results folder of the working directory.
!ls ./bloodacross0819/saturn_results

In [None]:
#Check UMAP
import scanpy as sc
import pickle
# Compute PCA, the neighbour graph and UMAP, then draw each embedding twice:
# coloured by species and by cell-type label.
# NOTE(review): `adata` here is presumably the SATURN output object; the
# loading step is not shown in this cell -- confirm.
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

for draw_embedding in (sc.pl.pca, sc.pl.umap):
    for obs_column, panel_title in (("species", "Species"), ("labels", "Cell Type")):
        draw_embedding(adata, color=obs_column, title=panel_title)

In [None]:
# Check macrogenes: load the gene -> macrogene weight table, wrap the
# per-cell macrogene matrix in its own AnnData and rank macrogenes per
# reference label (Wilcoxon).
# NOTE(review): pickle.load executes whatever the file contains -- only open
# pickles produced by your own SATURN run.
weights_path = "./test256_data_dmel_pc_pj_pv_org_saturn_seed_0_genes_to_macrogenes.pkl"
with open(weights_path, "rb") as weights_file:
    macrogene_weights = pickle.load(weights_file)

macrogene_adata = sc.AnnData(X=adata.obsm["macrogenes"])
macrogene_adata.obs = adata.obs
sc.tl.rank_genes_groups(macrogene_adata, method="wilcoxon", groupby="ref_labels")


# Collect the macrogene ranking scores of every group into one table with a
# "<group>_score" column per group (joined on the macrogene name) and write it
# as a tab-separated file.
import pandas as pd
groups = macrogene_adata.uns['rank_genes_groups']['names'].dtype.names
merged_df = pd.DataFrame()
for group in groups:
    # rename() without inplace=True returns a new frame, so the column
    # selection is never modified in place (which pandas warns about as a
    # possible write to a copy of a slice).
    ranked_genes_df = (
        sc.get.rank_genes_groups_df(macrogene_adata, group=group)[['names', 'scores']]
        .rename(columns={'scores': f'{group}_score'})
    )
    if merged_df.empty:
        merged_df = ranked_genes_df
    else:
        # Outer join keeps entries that are missing from some groups.
        merged_df = pd.merge(merged_df, ranked_genes_df, on='names', how='outer')
merged_df.to_csv('./macrogeneranksocre.txt', sep='\t')

In [None]:
import pandas as pd
import warnings

def get_scores(macrogene, weights=None, top_n=10):
    """Return the genes with the largest weight for one macrogene.

    Parameters
    ----------
    macrogene : int or str
        Macrogene index; anything ``int()`` accepts. Must be non-negative.
    weights : mapping, optional
        Maps gene name -> sequence of per-macrogene weights. Defaults to the
        module-level ``macrogene_weights``.
    top_n : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene`` and ``weight``, sorted by descending weight. Empty
        (same columns) when the index is invalid.

    Warns
    -----
    UserWarning
        When the index cannot be interpreted, is negative, or lies beyond the
        weight vector of some genes (those genes are skipped).
    """
    try:
        index = int(macrogene)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        index = -1
    if index < 0:
        # A negative index would silently wrap around to another macrogene.
        warnings.warn(f"Invalid macrogene index: {macrogene}")
        return pd.DataFrame(columns=["gene", "weight"])

    if weights is None:
        weights = macrogene_weights

    scores = {}
    skipped = 0
    for gene, score in weights.items():
        if index < len(score):
            scores[gene] = score[index]
        else:
            skipped += 1
    if skipped:
        # One summary warning instead of one warning per gene.
        warnings.warn(
            f"Index {index} is out of bounds for {skipped} gene(s); they were skipped."
        )

    ranked = pd.DataFrame(list(scores.items()), columns=["gene", "weight"])
    return ranked.sort_values("weight", ascending=False).head(top_n)

In [None]:
# Show the top-weighted genes for one macrogene of interest.
macrogene_id =1269 # macrogene ID (index passed to get_scores)
display(get_scores(macrogene_id))