# Notebook for Annotation of clusters after batch correction using `bbknn`

**Created by :** Srivalli Kolla

**Created on :** 02 May, 2025

**Modified on :** 02 May, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sb
import datetime
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib import rcParams

In [None]:
# Verbose scanpy logging, followed by a dump of the installed package versions.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Use 300 dpi both for inline figures and for figures written to disk.
plt.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

# Date stamp (DD_MM_YY) embedded in the output file names written below.
timestamp = f"{datetime.datetime.now():%d_%m_%y}"

# Data import

In [None]:
# Load the AnnData object saved by the bbknn batch-correction step
# (path is relative to the current working directory).
adata = sc.read_h5ad(
    './Github/ACM_sn_2025/data/acm_bbknn_batch_corrected_01_05_25.h5ad'
)
# Bare expression: shows the AnnData summary as the cell output.
adata

#### Check whether the data are raw counts or normalized

In [4]:
def X_is_raw(adata):
    """Return True when every axis-0 sum of ``adata.X`` is a whole number.

    This is a heuristic for "X still holds raw counts": the sums over
    axis 0 are compared with their integer-truncated copies. It inspects
    the sums only, not the individual matrix entries.

    Parameters
    ----------
    adata
        Object exposing a matrix-like ``X`` attribute with ``.sum(axis=0)``.

    Returns
    -------
    bool
        True if truncating the sums to ``int`` leaves them unchanged.
    """
    column_totals = adata.X.sum(axis=0)
    return np.array_equal(column_totals.astype(int), column_totals)

In [None]:
# Report whether the matrix currently stored in adata.X passes the raw-count heuristic.
print(X_is_raw(adata))

# Annotation

In [None]:
# Use the 'cpm_normalization' layer as the working matrix for everything below.
# No explicit copy is made here, so X is assigned the layer object itself.
adata.X = adata.layers["cpm_normalization"]

# Re-run the raw-count heuristic on the newly assigned matrix.
print(X_is_raw(adata))

## Data Visualization

In [None]:
# One UMAP panel per sample-metadata / QC column, coloured from the
# 'cpm_normalization' layer with the 'RdYlBu_r' colormap.
sc.pl.umap(
    adata,
    color=[
        'Sample_Name',
        'Sex',
        'Genotype',
        'Treatment',
        'Condition',
        'Sample_ID',
        'n_genes_by_counts',
        'total_counts',
        'pct_counts_mt',
        'pct_counts_ribo',
        'doublet_scores',
        'XIST-percentage',
        'gender_check_cov',
        'phase',
    ],
    frameon=False,
    layer='cpm_normalization',
    cmap='RdYlBu_r',
)

## Marker Genes

In [8]:
# Marker genes per expected cell type (three per type).
# Gene symbols must not carry surrounding whitespace: a name such as ' Myl2'
# never matches an entry of adata.var_names and would be silently ignored by
# any direct membership test.
marker_genes = {
    'Ventricular Cardiomyocytes': ['Myh7', 'Myl2', 'Fhl2'],
    'Atrial Cardiomyocytes': ['Nppa', 'Myl7', 'Myl4'],
    'Fibroblasts': ['Dcn', 'Gsn', 'Pdgfra'],
    'Endothelial Cells': ['Vwf', 'Pecam1', 'Cdh5'],
    'Pericytes': ['Rgs5', 'Abcc9', 'Kcnj8'],
    'Smooth Muscle Cells': ['Myh11', 'Tagln', 'Acta2'],
    'Myeloid Immune Cells': ['Cd14', 'C1qa', 'Cd68'],
    'Lymphoid Immune Cells': ['Cd8a', 'Il7r', 'Cd40lg'],
    'Adipocytes': ['Gpam', 'Fasn', 'Lep'],
    'Neuronal Cells': ['Plp1', 'Nrxn1', 'Nrxn3'],
    'Mesothelial Cells': ['Msln', 'Wt1', 'Bnc1'],
}

In [9]:
# For each cell type keep only the markers that exist in the dataset.
# Names are stripped before the lookup so that entries written with stray
# whitespace (e.g. ' Myl2') are still matched against adata.var.index
# instead of being dropped silently; the stored names are the stripped ones.
marker_genes_in_data = {}
for ct, markers in marker_genes.items():
    cleaned = (marker.strip() for marker in markers)
    marker_genes_in_data[ct] = [m for m in cleaned if m in adata.var.index]

## Plotting

In [None]:
# Draw one row of UMAPs per cell type, coloured by that type's marker genes.
for cell_type, genes in marker_genes.items():
    heading = cell_type.upper()

    # Strip whitespace from each symbol and keep those present in the data.
    present_genes = [
        name for name in map(str.strip, genes) if name in adata.var_names
    ]

    if not present_genes:
        print(f"{heading}:\n  ✗ No valid marker genes found in adata.var_names.\n")
    else:
        print(f"{heading}:\n  → Plotting: {', '.join(present_genes)}\n")
        # Colour scale runs from 0 to the 99th percentile of each gene.
        sc.pl.umap(
            adata,
            color=present_genes,
            layer='cpm_normalization',
            cmap="RdYlBu_r",
            vmin=0,
            vmax="p99",
            sort_order=False,
            frameon=False,
        )

    # Visual separator between cell types in the cell output.
    print("\n" + "-" * 60 + "\n")

## Leiden Clustering

In [None]:
# Leiden clustering at several resolutions. Each result is stored in
# adata.obs under 'leiden_<resolution>' (leiden_1, leiden_0.1, leiden_0.2,
# leiden_0.3, leiden_0.5), computed in that order.
for resolution in (1, 0.1, 0.2, 0.3, 0.5):
    sc.tl.leiden(adata, resolution=resolution, key_added=f"leiden_{resolution}")

In [None]:
# Compare the clusterings side by side, with cluster ids drawn on the embedding.
sc.pl.umap(
    adata,
    color=[
        "leiden_0.1",
        "leiden_0.2",
        "leiden_0.3",
        "leiden_0.5",
        "leiden_1",
    ],
    frameon=False,
    legend_loc="on data",
)

# Cluster Annotation

In [39]:
# Fine-grained manual labels for the 'leiden_0.3' clusters (keys are cluster
# ids as strings). Clusters with mixed marker expression list every candidate
# type joined by ' + '.
# Labels that are meant to be identical must match character for character,
# otherwise they become separate categories in adata.obs and in the plots.
cl_annotation = {
    "0": "Endothelial Cells",
    "1": "Ventricular Cardiomyocytes",
    "2": "Ventricular Cardiomyocytes",
    "3": "Fibroblasts",
    "4": "Myeloid Immune Cells",
    "5": "Endothelial Cells + Neuronal Cells",
    "6": "Fibroblasts + Endothelial Cells",
    "7": "Pericytes + Smooth Muscle Cells",
    "8": "Endothelial Cells",
    "9": "Endothelial Cells + Neuronal Cells",
    "10": "Lymphoid Immune Cells",
    "11": "Ventricular Cardiomyocytes + Atrial Cardiomyocytes + Fibroblasts",
    "12": "Ventricular Cardiomyocytes + Atrial Cardiomyocytes + Fibroblasts",
}

In [40]:
# Translate each cell's 'leiden_0.3' cluster id into its fine-grained label.
# Series.map yields NaN for any cluster id missing from cl_annotation.
adata.obs["manual_celltype_annotation_specific"] = (
    adata.obs['leiden_0.3'].map(cl_annotation)
)

In [None]:
# Reference plot: the resolution-0.3 clusters with ids drawn on the embedding.
sc.pl.umap(
    adata,
    color='leiden_0.3',
    frameon=False,
    legend_loc='on data',
)

In [None]:
# UMAP coloured by the fine-grained manual annotation.
sc.pl.umap(
    adata,
    color=["manual_celltype_annotation_specific"],
    frameon=False,
)

In [43]:
# Broad manual labels for the 'leiden_0.3' clusters (keys are cluster ids as
# strings). Clusters 5, 6, 9, 11 and 12 are collapsed into a single
# "Mixed Cell Types" category.
cl_annotation2 = {
    "0": "Endothelial Cells",
    "1": "Ventricular Cardiomyocytes",
    "2": "Ventricular Cardiomyocytes",
    "3": "Fibroblasts",
    "4": "Myeloid Immune Cells",
    "5": "Mixed Cell Types",
    "6": "Mixed Cell Types",
    "7": "Pericytes + Smooth Muscle Cells",
    "8": "Endothelial Cells",
    "9": "Mixed Cell Types",
    "10": "Lymphoid Immune Cells",
    "11": "Mixed Cell Types",
    "12": "Mixed Cell Types",
}

In [44]:
# Translate each cell's 'leiden_0.3' cluster id into its broad label.
# Series.map yields NaN for any cluster id missing from cl_annotation2.
adata.obs["manual_celltype_annotation_broad"] = (
    adata.obs['leiden_0.3'].map(cl_annotation2)
)

In [None]:
# UMAP coloured by the broad manual annotation.
sc.pl.umap(
    adata,
    color=["manual_celltype_annotation_broad"],
    frameon=False,
)

# Differentially Expressed Genes (DEGs)

In [None]:
# Wilcoxon rank-sum test between the 'leiden_0.3' clusters; results are
# stored in adata.uns['dea_leiden']. use_raw=False, so adata.raw is not used.
# NOTE(review): scanpy documents this function as expecting logarithmized
# data; whether 'cpm_normalization' is log-transformed is not visible here — confirm.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_0.3",
    method="wilcoxon",
    use_raw=False,
    key_added='dea_leiden',
)

In [None]:
# Compute a dendrogram over the 'leiden_0.3' clusters, then draw a dot plot of
# the top 5 ranked genes per cluster from the 'dea_leiden' results, showing
# the test scores.
sc.tl.dendrogram(adata, groupby='leiden_0.3')
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby='leiden_0.3',
    key='dea_leiden',
    n_genes=5,
    values_to_plot='scores',
    standard_scale='var',
)

In [None]:
# Table of ranked gene names from the 'dea_leiden' results, truncated to the
# first 100 rows.
deg_df = pd.DataFrame(adata.uns["dea_leiden"]["names"]).iloc[:100]
print(deg_df)

In [47]:
# Export the DEG table as a comma-separated file stamped with today's date.
deg_df.to_csv(
    f'./Github/ACM_sn_2025/data/DE_genes_bbknn_toppfun_{timestamp}.csv',
    sep=',',
)

# Cluster Annotation - based on DEGs

In [52]:
# DEG-based labels for the 'leiden_0.3' clusters (keys are cluster ids as
# strings); clusters 10-12 are left as "Unclear".
# NOTE(review): presumably derived from ToppFun enrichment of the exported
# DEG lists — confirm.
toppfun_annotation = {
    "0": "Endothelial",
    "1": "Cytoplasmic Cardiomyocyte",
    "2": "Ventricular Cardiomyocyte",
    "3": "Fibroblasts",
    "4": "Macrophages",
    "5": "Endothelial",
    "6": "Macrophages",
    "7": "Mesenchymal cells",
    "8": "Endothelial",
    "9": "Neuronal cells",
    "10": "Unclear",
    "11": "Unclear",
    "12": "Unclear",
}

In [53]:
# Translate each cell's 'leiden_0.3' cluster id into its DEG-based label.
# Series.map yields NaN for any cluster id missing from toppfun_annotation.
adata.obs["toppfun_annotation"] = (
    adata.obs['leiden_0.3'].map(toppfun_annotation)
)

In [None]:
# UMAP coloured by the DEG-based annotation.
sc.pl.umap(
    adata,
    color=["toppfun_annotation"],
    frameon=False,
)

In [55]:
# Save the annotated object (clusterings, annotation columns and DEG results
# added above) to a date-stamped .h5ad file.
adata.write_h5ad(f'./Github/ACM_sn_2025/data/acm_manual_anno_{timestamp}.h5ad')