In [14]:
import scanpy as sc
# pip install symphonypy
import symphonypy as sp
import pandas as pd


In [11]:
# Path settings: every input/output location and key used by the cells below.
atlas_path = "/bmbl_data/cankun_notebook/loss_y/GC_sample.rds.h5ad"

# Two query files exist. They used to be assigned to `query_path` back to back,
# so the first one was silently discarded. Keep both under distinct names and
# pick one explicitly; the effective value (the ".small" file) is unchanged.
# `query_path_full` is presumably the complete object -- confirm before switching.
query_path_full = "/bmbl_data/cankun_notebook/loss_y/Final_seurat_object.qs.h5ad"
query_path_small = "/bmbl_data/cankun_notebook/loss_y/Final_seurat_object.small.h5ad"
query_path = query_path_small

# Column of the reference annotation that is transferred to the query.
clusters_to_transfer = 'celltype'
# Batch column used for HVG selection, Harmony and the Symphony mapping.
# Open question from the original author: change to 'batch'?
# The reference summary displayed further down lists 'sample', 'study', 'cohort'
# and 'patient_recode' in obs, but no 'batch' column.
atlas_batch_key = 'sample'

# CSV that receives the mapped UMAP coordinates and transferred labels.
output_label = "/bmbl_data/cankun_notebook/SymphonyPy_Labels.csv"

In [3]:
# Read the reference atlas first, then the query, from the configured .h5ad paths.
adata_ref, adata_query = (
    sc.read_h5ad(h5ad_file) for h5ad_file in (atlas_path, query_path)
)

In [4]:
# Show the reference AnnData summary (dimensions and annotation keys) as the cell output.
adata_ref

AnnData object with n_obs × n_vars = 1105 × 27176
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'scrublet_scores', 'scrublet_predict', 'sample', 'study', 'percent.mt', 'manual_doublet', 'RNA_snn_res.0.5', 'seurat_clusters', 'celltype', 'cohort', 'patient_recode', 'celltype.big', 'Gender', 'Source', 'Type', 'Age', 'Lauren.s.classification', 'Primary.site', 'MSI.status', 'H..pylori', 'Signet.ring.cell.carcinoma', 'The.WHO.classification', 'Prior.treatment', 'loy_avg', 'housekeeping_avg', 'ratio_Y_housekeeping', 'is_fLOY'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'

# Step 1: Reference building

In [5]:
# Build the Symphony reference: log-transform, pick highly variable genes (HVGs)
# per batch, scale, run PCA, then integrate batches with Harmony.
#
# This cell rebinds and transforms `adata_ref`, so it is not idempotent:
# re-running it applies log1p again to the already transformed, already subset
# object. Reload `adata_ref` before re-running.

# NOTE(review): library-size normalization is left disabled, as in the original
# cell. Confirm whether adata_ref.X already holds normalized values; if it holds
# raw counts, this step is probably required before log1p.
#sc.pp.normalize_total(adata_ref, target_sum=1e5)
sc.pp.log1p(adata_ref)
sc.pp.highly_variable_genes(
    adata_ref,
    batch_key=atlas_batch_key,
    n_top_genes=2000,
)
# Keep the matrix with all genes reachable through adata_ref.raw before subsetting.
adata_ref.raw = adata_ref
# .copy() turns the HVG subset into a real AnnData object. Without it the slice
# is a view, and the in-place scaling below has to convert it implicitly, which
# is what produced the `view_to_actual(adata)` warnings in this cell's output.
adata_ref = adata_ref[:, adata_ref.var.highly_variable].copy()
sc.pp.scale(adata_ref, max_value=10)
sc.pp.pca(adata_ref, n_comps=30, zero_center=False)

# You can skip Harmony if you have only one batch in reference
sp.pp.harmony_integrate(adata_ref, key=atlas_batch_key)
# -> adata_ref.obsm["X_pca_harmony"] <- Harmony adjusted "X_pca"
# -> adata_ref.uns["harmony"] <- Harmony object for Symphony

  view_to_actual(adata)
  view_to_actual(adata)
2023-09-06 12:29:51,472 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-09-06 12:29:52,743 - harmonypy - INFO - sklearn.KMeans initialization complete.


# Step 2: Query preprocessing and Symphony

In [6]:
# Symphony
# Map the query cells onto the reference embedding built in Step 1.
# NOTE(review): the reference went through sc.pp.log1p in this notebook, but no
# transform is applied to adata_query in the cells shown before this call.
# Confirm that adata_query.X is already on the same scale as the reference.
# NOTE(review): `key` reuses the atlas batch key; presumably it is looked up in
# adata_query.obs, so the query needs a column with that name -- confirm.
sp.tl.map_embedding(adata_query, adata_ref, key=atlas_batch_key)
# -> adata_query.obsm["X_pca_harmony"] <- Symphony adjusted query's PCA
# Score how well each query cell maps onto the reference.
sp.tl.per_cell_confidence(adata_query, adata_ref)
# -> adata_query.obs["symphony_per_cell_dist"] <- Symphony mapping score per cell

82 out of 2000 genes from the reference are missing in the query dataset or have zero std in the reference, their expressions in the query will be set to zero


# Step 3: Label transfer


In [7]:
# Transfer the reference annotation named by `clusters_to_transfer` to the query
# cells (kNN-based, per the function name). The export cell at the end reads the
# result from adata_query.obs[clusters_to_transfer].
sp.tl.transfer_labels_kNN(adata_query, adata_ref, clusters_to_transfer)


# Step 4: Transfer Atlas's UMAP into Query Data

In [8]:
# Compute the reference neighbor graph on the Harmony-adjusted embedding, fit the
# reference UMAP, then place the query cells into that same UMAP space.
harmony_rep = "X_pca_harmony"
sc.pp.neighbors(adata_ref, use_rep=harmony_rep)
sc.tl.umap(adata_ref)
sp.tl.ingest(adata_query, adata_ref)
# -> adata_query.obsm["X_umap"] <- mapped to the reference's UMAP coords

# Step 5: Save output labels and send to Yuzhou/Cankun

In [15]:
# Assemble one row per query cell: the mapped UMAP coordinates plus the
# transferred label column, indexed by the query's cell names.
umap_df = pd.DataFrame(
    adata_query.obsm["X_umap"],
    index=adata_query.obs.index,
    columns=["UMAP1", "UMAP2"],
).assign(**{clusters_to_transfer: adata_query.obs[clusters_to_transfer]})

# Write the table (cell names are kept as the first CSV column).
umap_df.to_csv(output_label)
