## 07_1. Myeloid -- Cell Annotation

<div>
    <p style="text-align: left;">Updated Time: 2025-02-14</p>
</div>

##### Load libraries

In [None]:
# Standard library and numerical stack
import os
import sys
import numpy as np
import pandas as pd

# Single-cell analysis and plotting libraries
import omicverse as ov
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.pyplot import rc_context
from pyclustree import clustree
# Apply omicverse's plotting configuration (exact settings depend on the installed version)
ov.plot_set()

import warnings
# Silence all Python warnings for the rest of the session
warnings.simplefilter("ignore")

##### Set working directory for analysis

In [None]:
# Project root; every relative path used later in this notebook is resolved against it.
cwd = '/media/bio/Disk/Research Data/EBV/omicverse'
os.chdir(cwd)
# Read the directory back from the OS to confirm the change took effect.
updated_dir = os.getcwd()
print("Updated working directory: ", updated_dir)

##### Reading in annotated AnnData object

In [None]:
# Load the annotated AnnData object from disk (path is relative to the working directory).
adata = sc.read_h5ad("Processed Data/scRNA_Annotation.h5ad")
# Display the object summary as the cell output.
adata

In [None]:
# Report how many cells carry each 'Cell_type' label.
cell_type_labels = adata.obs['Cell_type']
for i in cell_type_labels.cat.categories:
    # Summing the boolean mask counts the rows that match this category.
    number = int((cell_type_labels == i).sum())
    print(f'the number of category {i} is {number}')

In [None]:
# Select myeloid cells for downstream analysis.
# Keep every cell whose 'Cell_type' label is one of these four values.
myeloid_cell_types = ['Myeloid', 'Mast', 'pDC', 'Neutrophils']
is_myeloid = adata.obs['Cell_type'].isin(myeloid_cell_types)
# .copy() detaches the subset from `adata` so it can be modified independently.
adata_myeloid = adata[is_myeloid].copy()
adata_myeloid

In [None]:
# Print the value range of X for the myeloid subset (before count recovery).
print(np.min(adata_myeloid.X), np.max(adata_myeloid.X))

In [None]:
# Fix the category order of 'EBV_status'; reorder_categories requires that these
# are exactly the categories already present.
ebv_status_order = ['Normal', 'Negative', 'Positive']
ebv_status = adata_myeloid.obs['EBV_status']
adata_myeloid.obs['EBV_status'] = ebv_status.cat.reorder_categories(ebv_status_order)

In [None]:
# Report how many myeloid cells carry each 'EBV_status' label.
ebv_status_labels = adata_myeloid.obs['EBV_status']
for i in ebv_status_labels.cat.categories:
    # Summing the boolean mask counts the rows that match this category.
    number = int((ebv_status_labels == i).sum())
    print(f'the number of category {i} is {number}')


#### Preprocessing

You can use `recover_counts` to recover the raw counts after normalization and log1p transformation.

In [None]:
# Estimate the original counts from the current contents of X and keep them in a layer.
# NOTE(review): the two positional numbers (50*1e4, 50*1e5) are passed straight to
# ov.pp.recover_counts; their meaning is defined by omicverse — confirm against its docs.
X_counts_recovered, size_factors_sub = ov.pp.recover_counts(
    adata_myeloid.X,
    50*1e4,
    50*1e5,
    log_base=None,
    chunk_size=50000,
)
adata_myeloid.layers['counts'] = X_counts_recovered

In [None]:
# Use the recovered counts as the working matrix.
# Assign a copy: otherwise X and layers['counts'] are the same object, and any later
# in-place transformation of X would silently change the stored counts layer as well.
adata_myeloid.X = adata_myeloid.layers['counts'].copy()
# Sanity check: print the value range of X after the swap.
print(np.min(adata_myeloid.X), np.max(adata_myeloid.X))

In [None]:
# Normalise and select 2000 highly variable genes with omicverse.
adata_myeloid = ov.pp.preprocess(adata_myeloid, mode='shiftlog|pearson', n_HVGs=2000)
# Keep the all-gene state in .raw before restricting to the variable genes.
adata_myeloid.raw = adata_myeloid
# Take an explicit copy of the HVG subset: the slice alone is only a view, and the
# scale/pca steps below write into the object.
adata_myeloid = adata_myeloid[:, adata_myeloid.var.highly_variable_features].copy()
ov.pp.scale(adata_myeloid)
# PCA on the 'scaled' layer, keeping 50 components.
ov.pp.pca(adata_myeloid, layer='scaled', n_pcs=50)

In [None]:
# Plot the variance ratio of the principal components to help choose how many to use.
ov.utils.plot_pca_variance_ratio(adata_myeloid)

#### Unsupervised clustering
The Leiden algorithm is an improved version of the Louvain algorithm, which outperformed other clustering methods for single-cell RNA-seq data analysis ([Du et al., 2018; Freytag et al., 2018; Weber and Robinson, 2016]). Since the Louvain algorithm is no longer maintained, using Leiden instead is preferred.

We therefore use the Leiden algorithm [Traag et al., 2019] on single-cell k-nearest-neighbour (KNN) graphs to cluster single-cell datasets.

Leiden creates clusters by taking into account the number of links between cells in a cluster versus the overall expected number of links in the dataset.

Here, we call `ov.pp.leiden` to cluster the cells with the Leiden algorithm.

In [None]:
# Build the neighbour graph from 15 neighbours and the first 10 components of 'X_pca'.
# NOTE(review): PCA above was run with layer='scaled', and later cells refer to
# 'scaled|original|X_pca' — confirm that 'X_pca' is the intended representation here.
ov.pp.neighbors(adata_myeloid, n_neighbors=15, n_pcs=10, use_rep='X_pca')

In [None]:
# Compute the UMAP embedding; later cells plot it via basis='X_umap'.
ov.pp.umap(adata_myeloid)

In [None]:
# UMAP of the myeloid subset, coloured by EBV status.
ov.utils.embedding(
    adata_myeloid,
    basis='X_umap',
    color=['EBV_status'],
    frameon='small',
    wspace=0.5,
)

In [None]:
# Plot the proportion of each original 'Cell_type' label within each EBV_status group.
ov.pl.cellproportion(
    adata=adata_myeloid,
    celltype_clusters='Cell_type',
    groupby='EBV_status',
    legend=True,
)

In [None]:
# Run Leiden clustering at several resolutions.
# Each result is stored in its own obs column, e.g. resolution 0.2 -> 'leiden_0_2'.
for resolution in (0.1, 0.2, 0.3, 0.4, 0.5):
    cluster_key = "leiden_" + str(resolution).replace('.', '_')
    ov.pp.leiden(
        adata_myeloid,
        resolution=resolution,
        flavor="igraph",
        n_iterations=2,
        key_added=cluster_key,
    )

#### Plot the clustree

In [None]:
# Plot the clustree across the Leiden resolutions computed above.
clustree_keys = [
    "leiden_" + str(resolution).replace('.', '_')
    for resolution in (0.1, 0.2, 0.3, 0.4, 0.5)
]
fig = clustree(
    adata_myeloid,
    clustree_keys,
    title="Clustree of Myeloid Clusters",
    # the minimum fraction of the parent cluster assigned to the child cluster to plot
    edge_weight_threshold=0.00,
    # show the fraction of cells in each cluster
    show_fraction=True,
)
fig.set_size_inches(10, 8)
fig.set_dpi(100)

#### Adding cluster scoring

In [None]:
# Disabled: would overwrite 'X_pca' with the 'X_harmony' embedding before scoring.
# adata_myeloid.obsm['X_pca']=adata_myeloid.obsm['X_harmony']

In [None]:
# Clustree annotated with the Silhouette score of each clustering.
# Supported are Silhouette score, Calinski and Harabasz score and Davies-Bouldin score.
silhouette_keys = [
    "leiden_" + str(resolution).replace('.', '_')
    for resolution in (0.1, 0.2, 0.3, 0.4, 0.5)
]
fig = clustree(
    adata_myeloid,
    silhouette_keys,
    title="Clustree of Myeloid Clusters with Silhouette Score",
    score_clustering="silhouette",
    score_basis="pca",
)
fig.set_size_inches(10, 8)
fig.set_dpi(100)

In [None]:
# Clustree annotated with the Calinski-Harabasz score of each clustering.
# Supported are Silhouette score, Calinski and Harabasz score and Davies-Bouldin score.
calinski_keys = [
    "leiden_" + str(resolution).replace('.', '_')
    for resolution in (0.1, 0.2, 0.3, 0.4, 0.5)
]
fig = clustree(
    adata_myeloid,
    calinski_keys,
    title="Clustree of Myeloid Clusters with Calinski Harabasz Score",
    score_clustering="calinski_harabasz",
    score_basis="pca",
)
fig.set_size_inches(10, 8)
fig.set_dpi(100)

In [None]:
# Clustree annotated with the Davies-Bouldin score of each clustering.
# Supported are Silhouette score, Calinski and Harabasz score and Davies-Bouldin score.
davies_keys = [
    "leiden_" + str(resolution).replace('.', '_')
    for resolution in (0.1, 0.2, 0.3, 0.4, 0.5)
]
fig = clustree(
    adata_myeloid,
    davies_keys,
    title="Clustree of Myeloid Clusters with Davies Bouldin Score",
    score_clustering="davies_bouldin",
    score_basis="pca",
)
fig.set_size_inches(10, 8)
fig.set_dpi(100)

Based on the cluster scoring, a resolution of 0.2 appears to be optimal. Here we visualize this clustering on the UMAP representation:

In [None]:
from matplotlib import patheffects
import matplotlib.pyplot as plt

# UMAP coloured by the resolution-0.2 Leiden clusters, with the legend replaced by
# labels drawn directly on the axes.
fig, ax = plt.subplots(figsize=(4,4))

ov.pl.embedding(
    adata_myeloid,
    basis='X_umap',
    color=['leiden_0_2'],
    show=False,
    legend_loc=None,
    add_outline=False,
    frameon='small',
    legend_fontoutline=2,
    ax=ax,
)

# Bold labels with a white stroke so they stay readable on top of the points.
cluster_label_style = dict(
    fontsize=12,
    weight='bold',
    path_effects=[patheffects.withStroke(linewidth=2, foreground='w')],
)
# Plain black connector lines between labels and their clusters.
cluster_label_arrows = dict(arrowprops=dict(arrowstyle='-', color='black'))

ov.utils.gen_mpl_labels(
    adata_myeloid,
    'leiden_0_2',
    exclude=("None",),
    basis='X_umap',
    ax=ax,
    adjust_kwargs=cluster_label_arrows,
    text_kwargs=cluster_label_style,
)

In [None]:
# Plot the proportion of each resolution-0.2 Leiden cluster within each EBV_status group.
ov.pl.cellproportion(
    adata=adata_myeloid,
    celltype_clusters='leiden_0_2',
    groupby='EBV_status',
    legend=True,
)

#### Finding marker genes

Let us compute a ranking of the most differentially expressed genes in each cluster. By default, scanpy uses the `.raw` attribute of AnnData if it has been initialized; the first call below passes `use_raw=False`, so it ranks genes on `adata_myeloid.X` instead. The simplest and fastest method to do so is the t-test.

In [None]:
# Compute a dendrogram of the 'leiden_0_2' clusters on the 'scaled|original|X_pca' representation.
sc.tl.dendrogram(adata_myeloid,'leiden_0_2',use_rep='scaled|original|X_pca')

In [None]:
# Rank genes per Leiden cluster with a t-test on adata_myeloid.X (use_raw=False) and
# store the result under its own key so later rankings do not overwrite it.
# The former `use_rep=...` argument was dropped: rank_genes_groups has no such
# parameter, so it was only absorbed by **kwds and had no effect on the t-test.
sc.tl.rank_genes_groups(
    adata_myeloid,
    'leiden_0_2',
    method='t-test',
    use_raw=False,
    key_added='leiden_0_2_ttest',
)
# Dot plot of the top 3 ranked genes per cluster, scaled per gene.
sc.pl.rank_genes_groups_dotplot(
    adata_myeloid,
    groupby='leiden_0_2',
    cmap='Spectral_r',
    key='leiden_0_2_ttest',
    standard_scale='var',
    n_genes=3,
)

Output the marker list as pandas dataframe:

In [None]:
# Build a DataFrame from the ranked gene names of the t-test result.
ttest_marker_gene=pd.DataFrame(adata_myeloid.uns['leiden_0_2_ttest']['names'])
# Preview the first rows.
ttest_marker_gene.head()

In [None]:
# Keep the first 50 rows of the t-test marker table and write them to CSV.
ttest_marker_gene = ttest_marker_gene.head(50)
# This is the first write into the results folder in this notebook; create it if it
# is missing so to_csv does not fail on a fresh checkout.
os.makedirs('Results/07.Myeloid', exist_ok=True)
ttest_marker_gene.to_csv('Results/07.Myeloid/ttest_marker_gene_leiden_0_2_ttest.csv', index=False)

COSG is also considered to be a better algorithm for finding marker genes. Here we use the COSG implementation provided by omicverse (`ov.single.cosg`).

Paper: Accurate and fast cell marker gene identification with COSG

Code: https://github.com/genecell/COSG

In [None]:
# Run a t-test ranking first, then the COSG ranking, which is stored under
# 'leiden_0_2_cosg'.
# The former `use_rep=...` argument was dropped: rank_genes_groups has no such
# parameter, so it was only absorbed by **kwds and had no effect on the t-test.
sc.tl.rank_genes_groups(
    adata_myeloid,
    groupby='leiden_0_2',
    method='t-test',
)
ov.single.cosg(adata_myeloid, key_added='leiden_0_2_cosg', groupby='leiden_0_2')
# Dot plot of the top 3 COSG genes per cluster, scaled per gene.
sc.pl.rank_genes_groups_dotplot(
    adata_myeloid,
    groupby='leiden_0_2',
    use_raw=False,
    cmap='Spectral_r',
    key='leiden_0_2_cosg',
    standard_scale='var',
    n_genes=3,
)

Output the marker list as pandas dataframe:

In [None]:
# Build a DataFrame from the ranked gene names of the COSG result.
cosg_marker_gene=pd.DataFrame(adata_myeloid.uns['leiden_0_2_cosg']['names'])
# Preview the first rows.
cosg_marker_gene.head()

In [None]:
# Keep the first 100 rows of the COSG marker table and write them to CSV.
cosg_marker_gene=cosg_marker_gene.head(100)
cosg_marker_gene.to_csv('Results/07.Myeloid/cosg_marker_gene_leiden_0_2.csv', index=False)

##### Cell type annotation from marker genes

Based on the literature and existing knowledge, a feature dictionary was constructed by integrating the marker genes of each subpopulation obtained from the previous section, defining potential cell subtypes and their corresponding marker genes.

In [None]:
# Marker panel used by the dot plots and matrix plot.
# Despite its name this is a flat list of gene symbols; the labels below only document
# which group each run of genes belongs to and are dropped when the list is flattened.
myeloid_genes_dict = [
    gene
    for _group_label, group_genes in (
        ('Mast', ('KIT', 'TPSAB1', 'CPA3')),
        ('cDC', ('LAMP3', 'CCR7', 'FSCN1')),
        ('pDC', ('IRF7', 'IL3RA', 'LILRA4')),
        ('Neutrophil', ('CSF3R', 'S100A8', 'S100A9')),
        ('unlabelled in the original panel', ('CD68', 'MRC1')),
        ('M1', ('CD86', 'CXCL9', 'TNF')),
        ('M2', ('CD163', 'AIF1', 'TGFB1')),
        ('C1QC+ Macro', ('C1QA', 'C1QB', 'C1QC')),
        ('IL1B+ Macro', ('IL1B', 'VEGFA', 'CCL20')),
        ('SPP1+ Macro', ('SPP1', 'FBP1', 'GPNMB')),
        ('Mono', ('CD14', 'FCGR3A', 'FCN1')),
        ('unlabelled in the original panel', ('IGHG1', 'IGKC', 'IGHG4')),
    )
    for gene in group_genes
]

##### Dot plots

The dotplot visualization provides a compact way of showing, per group, the fraction of cells expressing a gene (dot size) and the mean expression of the gene in those cells (color scale).

In [None]:
# Dot plot of the marker panel per 'leiden_0_2' cluster, read from .raw (use_raw=True)
# and scaled per gene (standard_scale='var'), with a dendrogram of the clusters.
sc.pl.dotplot(adata_myeloid, myeloid_genes_dict, 'leiden_0_2', 
              dendrogram=True, use_raw=True, standard_scale='var',
              show=True)

**<span style="font-size:16px;">Create a dictionary to map cluster to annotation label</span>**

In [None]:
# Manual annotation: Leiden cluster id (resolution 0.2) -> myeloid subtype label.
# Several clusters share a label, so the mapping is many-to-one.
cluster2annotation = {
    '0': 'CD14+ Mono',
    '1': 'Mast',
    '2': 'C1QC+ Macro',
    '3': 'cDC',
    '4': 'IL1B+ Macro',
    '5': 'pDC',
    '6': 'cDC',
    '7': 'CD16+ Mono',
    '8': 'cDC',
    '9': 'pDC',
    '10': 'IgM+ plasma-like',
    '11': 'Mast',
    '12': 'SPP1+ Macro',
    '13': 'Neutrophil',
    '14': 'pDC',
}

# Translate cluster ids to labels and store them as a categorical obs column.
subtype_labels = adata_myeloid.obs['leiden_0_2'].map(cluster2annotation)
adata_myeloid.obs['Myeloid_subtype'] = subtype_labels.astype('category')

# Category order for the new column; reorder_categories requires that this list
# matches the existing categories exactly.
subtype_order = [
    'C1QC+ Macro', 'SPP1+ Macro', 'IL1B+ Macro',
    'CD14+ Mono', 'CD16+ Mono',
    'Mast', 'Neutrophil', 'cDC', 'pDC', 'IgM+ plasma-like',
]
adata_myeloid.obs['Myeloid_subtype'] = (
    adata_myeloid.obs['Myeloid_subtype'].cat.reorder_categories(subtype_order)
)

In [None]:
# UMAP coloured by annotated myeloid subtype, labelled on the axes and saved as PDF.
fig, ax = plt.subplots(figsize=(4,4))

ov.pl.embedding(
    adata_myeloid,
    basis='X_umap',
    color=['Myeloid_subtype'],
    palette='Paired',
    show=False,
    legend_loc=None,
    add_outline=False,
    frameon='small',
    legend_fontoutline=2,
    ax=ax,
)

# Bold labels with a white stroke so they stay readable on top of the points.
subtype_label_style = dict(
    fontsize=9,
    weight='bold',
    path_effects=[patheffects.withStroke(linewidth=2, foreground='w')],
)
# Plain black connector lines between labels and their clusters.
subtype_label_arrows = dict(arrowprops=dict(arrowstyle='-', color='black'))

ov.utils.gen_mpl_labels(
    adata_myeloid,
    'Myeloid_subtype',
    exclude=("None",),
    basis='X_umap',
    ax=ax,
    adjust_kwargs=subtype_label_arrows,
    text_kwargs=subtype_label_style,
)

# Save before showing the figure.
plt.savefig('Results/07.Myeloid/07.Myeloid_subtype_UMAPplot.pdf', format='pdf')
plt.show()

#### Visualizing marker genes

Visualize marker genes using Dotplot

In [None]:
# Dot plot of the marker panel per annotated subtype, read from .raw and scaled per gene.
# show=False keeps the figure open so it can be saved before it is displayed.
sc.pl.dotplot(
    adata_myeloid,
    myeloid_genes_dict,
    'Myeloid_subtype',
    dendrogram=True,
    use_raw=True,
    standard_scale='var',
    show=False,
)
plt.savefig('Results/07.Myeloid/07.Myeloid_subtype_marker_Dotplot.pdf', format='pdf')
plt.show()

In [None]:
# Matrix plot of the marker panel per annotated subtype, saved as PDF.
# NOTE(review): the colour bar is titled "mean z-score" and clipped to [-5, 5], but the
# layer="scaled" argument is commented out, so the plotted values are presumably not
# z-scores — confirm which matrix scanpy reads here and align the title/limits or layer.
sc.pl.matrixplot(
    adata_myeloid,
    myeloid_genes_dict,
    "Myeloid_subtype",
    dendrogram=True,
    colorbar_title="mean z-score",
    #layer="scaled",
    vmin=-5, vmax=5,
    cmap="RdBu_r",
    show=False,
)

plt.savefig('Results/07.Myeloid/07.Myeloid_subtype_marker_matrixplot.pdf', format='pdf')
plt.show()

### Compositional data visualization
Analyzing compositional data is not straightforward, so it helps to visualize the properties of the dataset before any formal compositional analysis. Here we use `ov.pl.cellproportion` to show the proportion of each myeloid subtype within each EBV status group.

In [None]:
# Composition plot: proportion of each myeloid subtype within each EBV_status group.
fig, ax = plt.subplots(figsize=(5, 5))
ov.pl.cellproportion(
    adata=adata_myeloid,
    celltype_clusters='Myeloid_subtype',
    groupby='EBV_status',
    legend=True,
    ax=ax,
)

# Move the legend outside the axes, to the right of the plot.
legend = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)

# Pin the tick positions, then relabel them 1..n.
# NOTE(review): this replaces whatever labels cellproportion put on the x axis
# (presumably the EBV_status group names) with running numbers — confirm this is intended.
xticks = ax.get_xticks()
ax.set_xticks(xticks)
ax.set_xticklabels([str(position) for position in range(1, len(xticks) + 1)])
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=90, labelsize=6.5)

plt.tight_layout()
plt.savefig(
    "Results/07.Myeloid/07.Stacked_Barplot_of_Myeloid_Composition.pdf",
    format='pdf',
    dpi=300,
    bbox_inches='tight',
)
plt.show()

#### Save AnnData object with automated celltype annotation

In [None]:
# Rebuild an AnnData object from `.raw`, i.e. the all-gene state that was stored right
# after ov.pp.preprocess and before the highly-variable-gene subset was taken.
# NOTE(review): because `.raw` was set after preprocessing, X here is presumably the
# normalised matrix rather than raw counts — verify with the min/max printed below.
adata_myeloid = adata_myeloid.raw.to_adata()

In [None]:
# Display the summary of the object that is about to be saved.
adata_myeloid

In [None]:
# Print the value range of X to check what kind of values are being saved.
print(np.min(adata_myeloid.X), np.max(adata_myeloid.X))

In [None]:
# Write the annotated myeloid object to disk (path is relative to the working directory).
adata_myeloid.write_h5ad("Processed Data/scRNA_Myeloid.h5ad")


**<span style="font-size:16px;">Session information:</span>**

In [None]:
import sys
import platform


def _installed_distributions():
    """Return sorted ``(name, version)`` pairs for every installed distribution.

    Uses the standard-library ``importlib.metadata`` and falls back to the
    deprecated ``pkg_resources`` only where it is unavailable. Names are
    lower-cased, as ``pkg_resources`` keys were. Distributions with unreadable
    metadata are skipped instead of aborting the report.
    """
    try:
        from importlib import metadata
    except ImportError:  # interpreters without importlib.metadata
        import pkg_resources
        return sorted((package.key, package.version) for package in pkg_resources.working_set)

    pairs = set()  # a set removes duplicates found on several sys.path entries
    for dist in metadata.distributions():
        try:
            name, version = dist.metadata['Name'], dist.version
        except (KeyError, TypeError):
            continue
        if name:
            pairs.add((name.lower(), str(version)))
    return sorted(pairs)


# Get Python version information
python_version = sys.version
# Get operating system information
os_info = platform.platform()
# Get system architecture information
architecture = platform.architecture()[0]
# Get CPU information
cpu_info = platform.processor()
# Print Session information
print("Python version:", python_version)
print("Operating system:", os_info)
print("System architecture:", architecture)
print("CPU info:", cpu_info)

# Print packages and their versions.
# Note: this lists every distribution installed in the environment, not only the
# modules imported by this notebook.
print("\nImported packages and their versions:")
for package_name, package_version in _installed_distributions():
    print(package_name, package_version)