## 05_3. Cellular Distributions and Differences

<div style="text-align: left;">
    <p style="text-align: left;">Last updated: 2025-02-12</p>
</div>

##### Load libraries

In [None]:
# Libraries used across the notebook, followed by global plotting / warning setup.
import os
import sys
import numpy as np
import anndata
import scanpy as sc
import pandas as pd
import pertpy as pt
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import despine

import omicverse as ov
# presumably applies omicverse's global plotting defaults — confirm against the omicverse docs
ov.plot_set()

# NOTE(review): duplicate of the pyplot import above; harmless but redundant.
import matplotlib.pyplot as plt
from matplotlib import patheffects

# NOTE(review): wildcard import — HeatmapAnnotation, anno_simple, anno_barplot and
# use_pch_style, which this notebook calls without a prefix, appear to come from here.
from PyComplexHeatmap import *
use_pch_style() # or plt.style.use('default') to restore default style
# NOTE(review): `random` is not referenced in the cells visible in this notebook.
import random

import warnings
# Silences every warning for the rest of the session, including deprecation and
# pandas assignment warnings that could otherwise point at real problems.
warnings.simplefilter("ignore") 

##### Set working directory for analysis

In [None]:
# Project root: every relative path used below ("Processed Data/...", "Results/...")
# is resolved against it. The location can be overridden without editing the
# notebook by setting the EBV_PROJECT_DIR environment variable; when the variable
# is unset the original hard-coded path is used, so existing setups keep working.
cwd = os.environ.get('EBV_PROJECT_DIR', '/media/bio/Disk/Research Data/EBV/omicverse')
os.chdir(cwd)
updated_dir = os.getcwd()
print("Updated working directory: ", updated_dir)

##### Reading in annotated AnnData object

In [None]:
# Load the annotated AnnData object and flag all of its cells with
# 'Pass QC' = "Yes"; the flag is copied onto the unfiltered object further down.
annotated_h5ad = "Processed Data/scRNA_Annotation.h5ad"
adata = sc.read_h5ad(annotated_h5ad)
adata.obs['Pass QC'] = "Yes"
adata

In [None]:
# Mark genes whose name starts with "MT-" (mitochondrial genes) in .var["mt"],
# then let scanpy compute its QC metrics in place with "mt" as a QC variable.
mito_mask = adata.var_names.str.startswith("MT-")
adata.var["mt"] = mito_mask
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

In [None]:
# QC violin plots for the annotated object (cells that passed QC).
# No plt.figure() call is made beforehand: with multi_panel=True scanpy builds
# its own seaborn grid figure, so a pre-created figure would stay empty, be
# rendered as a blank output, and its figsize would never be applied.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0,
    multi_panel=True,
    show=False,
)

# plt.savefig() writes the current figure, i.e. the grid created just above.
plt.savefig('Results/02.data_preprocessing/02.data_preprocessing_after_QC.png', format='png')
plt.show()

In [None]:
# Load the unfiltered AnnData object and start with every cell flagged
# 'Pass QC' = "No"; cells also present in `adata` are re-flagged later.
unfiltered_h5ad = "Processed Data/scRNA_unfiltered.h5ad"
adata_unfiltered = sc.read_h5ad(unfiltered_h5ad)
adata_unfiltered.obs['Pass QC'] = "No"
adata_unfiltered

In [None]:
# Same QC bookkeeping as for the annotated object: flag "MT-" genes in
# .var["mt"] and compute scanpy's QC metrics in place.
mito_mask_unfiltered = adata_unfiltered.var_names.str.startswith("MT-")
adata_unfiltered.var["mt"] = mito_mask_unfiltered
sc.pp.calculate_qc_metrics(adata_unfiltered, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

In [None]:
# QC violin plots for the unfiltered object (before QC filtering).
# No plt.figure() call is made beforehand: with multi_panel=True scanpy builds
# its own seaborn grid figure, so a pre-created figure would stay empty, be
# rendered as a blank output, and its figsize would never be applied.
sc.pl.violin(
    adata_unfiltered,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0,
    multi_panel=True,
    show=False,
)

# plt.savefig() writes the current figure, i.e. the grid created just above.
plt.savefig('Results/02.data_preprocessing/02.data_preprocessing_before_QC.png', format='png')
plt.show()

In [None]:
# Transfer the QC flag onto the unfiltered object: take its own 'Pass QC'
# column as strings, overwrite the entries of every cell listed in adata.obs
# with that cell's flag from `adata` (label-aligned), then store the result.
pass_qc = adata_unfiltered.obs['Pass QC'].astype(str)  # cast to string type first
pass_qc.loc[adata.obs.index] = adata.obs['Pass QC']
adata_unfiltered.obs['Pass QC'] = pass_qc

In [None]:
# For each grouping column: cast it to categorical and store one colour per
# category under "<column>_colors" in .uns, using the first len(categories)
# entries of scanpy's default_102 palette.
for obs_key in ('Pass QC', 'orig.ident'):
    adata_unfiltered.obs[obs_key] = adata_unfiltered.obs[obs_key].astype('category')
    categories = adata_unfiltered.obs[obs_key].cat.categories
    colors = sc.pl.palettes.default_102[:len(categories)]
    adata_unfiltered.uns[obs_key + '_colors'] = colors

In [None]:
# Cell-proportion plot of the 'Pass QC' flag, grouped by 'orig.ident'.
fig, ax = plt.subplots(figsize=(8, 3))
qc_proportion_kwargs = dict(
    adata=adata_unfiltered,
    celltype_clusters='Pass QC',
    groupby='orig.ident',
    legend=True,
    ax=ax,
)
ov.pl.cellproportion(**qc_proportion_kwargs)

# Re-create the legend to the right of the axes.
legend = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)

# Replace the x tick labels with their 1-based position along the axis.
xticks = ax.get_xticks()
ax.set_xticks(xticks)
ax.set_xticklabels([str(position) for position in range(1, len(xticks) + 1)])
ax.set_xlabel('')
ax.set_ylabel('Cell fractions Pass QC')
ax.tick_params(axis='x', rotation=90, labelsize=6.5)

plt.tight_layout()
plt.savefig('Results/02.data_preprocessing/02.data_preprocessing_QC.pdf', format='pdf')
plt.show()

In [None]:
# Print the number of cells in each 'Cell_type' category, in category order.
cell_type_labels = adata.obs['Cell_type']
for cell_type in cell_type_labels.cat.categories:
    n_cells = int((cell_type_labels == cell_type).sum())
    print(f'the number of category {cell_type} is {n_cells}')

In [None]:
# Make 'EBV_status' categorical with a fixed category order, then print the
# number of cells per status in that order.
ebv_order = ['Normal', 'Negative', 'Positive']
ebv_status = adata.obs['EBV_status'].astype('category')
adata.obs['EBV_status'] = ebv_status.cat.reorder_categories(ebv_order)

for status in adata.obs['EBV_status'].cat.categories:
    n_cells = int((adata.obs['EBV_status'] == status).sum())
    print(f'the number of category {status} is {n_cells}')

In [None]:
# Store one colour per 'orig.ident' category in .uns, taken from the start of
# scanpy's default_102 palette.
sample_levels = adata.obs['orig.ident'].cat.categories
adata.uns['orig.ident_colors'] = sc.pl.palettes.default_102[:len(sample_levels)]

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

# UMAP coloured by 'Cell_type'; the built-in legend is disabled
# (legend_loc=None) because labels are drawn onto the embedding below.
ov.pl.embedding(
    adata,
    basis='X_umap',
    color=['Cell_type'],
    palette='Paired',
    show=False,
    legend_loc=None,
    add_outline=False,
    frameon='small',
    legend_fontoutline=2,
    ax=ax,
)

# Text styling for the on-plot labels: bold with a white stroke around the glyphs.
label_text_style = dict(
    fontsize=9,
    weight='bold',
    path_effects=[patheffects.withStroke(linewidth=2, foreground='w')],
)
label_adjust_style = dict(arrowprops=dict(arrowstyle='-', color='black'))
ov.utils.gen_mpl_labels(
    adata,
    'Cell_type',
    exclude=("None",),
    basis='X_umap',
    ax=ax,
    adjust_kwargs=label_adjust_style,
    text_kwargs=label_text_style,
)

plt.title('', fontsize=10)
plt.savefig("Results/05.celltype_annotation/05. UMAP_Major_Cell_Type.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# UMAP panels (5 per row) coloured by the expression of ten marker genes;
# the colour scale of each panel is capped at its 99.2nd percentile.
marker_genes = [
    'EPCAM', 'COL1A1', 'CD3D', 'KLRD1', 'MS4A1',
    'LYZ', 'MZB1', 'TPSAB1', 'LILRA4', 'CSF3R',
]
ov.pl.embedding(
    adata,
    basis='X_umap',
    color=marker_genes,
    frameon='small',
    vmax='p99.2',
    ncols=5,
    show=False,
)

In [None]:
# os, plt and sc are already imported at the top of the notebook; only
# matplotlib itself (for rcParams) is new here.
import matplotlib as mpl

genes = [
    'EPCAM', 'COL1A1', 'CD3D', 'KLRD1', 'MS4A1',
    'LYZ', 'MZB1', 'TPSAB1', 'LILRA4', 'CSF3R',
]

# Output path without extension; make sure its directory exists.
out = "Results/05.celltype_annotation/05.UMAP_Major_Cell_Type_MarkerGene"
os.makedirs(os.path.dirname(out), exist_ok=True)

# optional: embed editable fonts in PDF
mpl.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Fixed 2 x 5 grid; constrained_layout is used instead of tight_layout.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(18, 7), constrained_layout=True)
axes = axes.ravel()

# One UMAP panel per gene, titled with the gene name.
for panel, gene in zip(axes, genes):
    sc.pl.umap(
        adata,
        color=gene,
        vmax='p99.2',
        frameon='small',
        cmap="coolwarm",
        ax=panel,
        show=False,
    )
    panel.set_title(gene, fontsize=10)

# Switch off any grid cell that did not receive a gene.
for spare_panel in axes[len(genes):]:
    spare_panel.axis('off')

# Save, then close the figure so it is not rendered again in the notebook.
fig.savefig(out + ".pdf", bbox_inches="tight", dpi=300)
plt.close(fig)


In [None]:
# Give every 'orig.ident' category a compact label: its 1-based position in the
# category order, stored as a string in adata.obs['ID'].
# NOTE(review): because the IDs are strings, anything that sorts them
# lexicographically will place '10' before '2' — confirm this is acceptable
# for the downstream per-ID plots.
levels = adata.obs['orig.ident'].cat.categories

id_map = {level: i+1 for i, level in enumerate(levels)}
adata.obs['ID'] = adata.obs['orig.ident'].map(id_map)
adata.obs['ID'] = adata.obs['ID'].astype(str)

# Draw on a freshly created axes. Passing the `ax` left over from an earlier
# cell would paint onto a figure that has already been shown, so nothing would
# be rendered here and the result would depend on cell execution history.
fig, ax = plt.subplots(figsize=(5, 5))
ov.pl.embedding(adata,
                basis='X_umap',
                color=['ID'],
                show=False, legend_loc=None, add_outline=False,
                frameon='small', legend_fontoutline=2, ax=ax
                )

In [None]:
# Cell-proportion plot of 'Cell_type', grouped by the per-sample 'ID' label.
fig, ax = plt.subplots(figsize=(8, 3))
id_proportion_kwargs = dict(
    adata=adata,
    celltype_clusters='Cell_type',
    groupby='ID',
    legend=True,
    ax=ax,
)
ov.pl.cellproportion(**id_proportion_kwargs)

# Re-create the legend to the right of the axes; the tick labels produced by
# cellproportion are kept and only rotated / resized.
legend = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
xticks = ax.get_xticks()
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=90, labelsize=6.5)
plt.tight_layout()

plt.savefig('Results/05.celltype_annotation/05.major_celltype_by_ID.pdf', format='pdf')
plt.show()

In [None]:
df = adata.obs.copy()

# One row per ID (the first row encountered for each), ordered by EBV_status.
df_unique = df.drop_duplicates(subset='ID', keep='first').sort_values(by='EBV_status')

# Bin 'Age' into '≤45' / '>45'; missing ages, and anything matching neither
# comparison, are labelled 'Missing'.
age = df_unique['Age']
df_unique['Age2'] = np.where(
    age.isna(),
    'Missing',
    np.where(age <= 45, '≤45', np.where(age > 45, '>45', 'Missing')),
)

# Index the table by ID while keeping 'ID' available as a column.
df_unique = df_unique.set_index('ID', drop=False)
df_unique.head()

In [None]:
# Count cells per (ID, Cell_type) and convert the counts to fractions of each ID's total.
count_df = df.groupby(['ID', 'Cell_type']).size().reset_index(name='count')
cells_per_id = count_df.groupby('ID')['count'].transform('sum')
count_df['percentage'] = count_df['count'] / cells_per_id

# Wide layout: one row per ID, one column per cell type, missing values as 0.
# NOTE(review): the reset_index()/set_index('ID') round trip is kept on purpose —
# it looks redundant but may change the type of the columns index when
# 'Cell_type' is categorical; confirm before simplifying.
wide_df = (
    count_df
    .pivot(index='ID', columns='Cell_type', values='percentage')
    .fillna(0)
    .reset_index()
    .set_index('ID')
)
wide_df.columns.name = None

# Arrange the rows in the same ID order as df_unique.
ordered_ids = df_unique['ID'].tolist()
wide_df = wide_df.loc[ordered_ids]
wide_df.head()

In [None]:
# Total number of cells per ID, indexed by ID (the 'ID' column is kept) and
# arranged in the same ID order as df_unique. This rebinds `count_df`.
count_df = (
    df.groupby(['ID'])
    .size()
    .reset_index(name='count')
    .set_index('ID', drop=False)
)
ordered_ids = df_unique['ID'].tolist()
count_df = count_df.loc[ordered_ids]
count_df.head()

In [None]:
# Pair each 'Cell_type' category with the colour stored for it in adata.uns
# (the stored value is a list, matched to the categories by position).
cell_types = adata.obs['Cell_type'].cat.categories.tolist()
cell_type_colors = adata.uns['Cell_type_colors']
cell_type_color_dict = {name: colour for name, colour in zip(cell_types, cell_type_colors)}

# Fixed colour lookups for the stage and age-group annotation tracks.
stage_levels = ['Normal', 'I', 'II', 'III', 'IV']
stage_palette = sns.color_palette('Set3', n_colors=len(stage_levels)).as_hex()
stage_color_dict = dict(zip(stage_levels, stage_palette))

age_levels = ['≤45', '>45', 'Missing']
age_palette = sns.color_palette('tab20', n_colors=len(age_levels)).as_hex()
Age_color_dict = dict(zip(age_levels, age_palette))
Age_color_dict

In [None]:
plt.figure(figsize=(15, 5))

# Annotation tracks, in the order they are handed to HeatmapAnnotation.
ebv_text_style = {'color': 'white', 'fontsize': '14'}
annotation_tracks = dict(
    EBV_status=anno_simple(df_unique['EBV_status'], add_text=True, text_kws=ebv_text_style, cmap='Set1', legend=False),
    No_of_cells=anno_barplot(count_df['count'], height=3.5, cmap='magma'),
    Sex=anno_simple(df_unique['Sex'], height=1.6, cmap='Set2'),
    Age=anno_simple(df_unique['Age2'], height=1.6, colors=Age_color_dict),
    Stage=anno_simple(df_unique['Stage'], height=1.6, colors=stage_color_dict),
    Cell_fractions=anno_barplot(wide_df, legend=True, height=20, ylim=(0, 1.03), colors=cell_type_color_dict),
)
col_ha = HeatmapAnnotation(
    **annotation_tracks,
    plot=True,
    legend=True,
    legend_gap=3,
    wgap=0.2,
    hgap=1,
)

# Label the columns with the IDs from df_unique's index.
col_ha.show_ticklabels(df_unique.index.tolist(), rotation=270)

plt.savefig('Results/05.celltype_annotation/05.major_celltype_by_Characteristics.pdf', format='pdf')
plt.show()

In [None]:
# UMAP coloured by 'EBV_status', with one Set2 colour per distinct status value.
fig, ax = plt.subplots(figsize=(5, 5))
n_status_values = len(adata.obs['EBV_status'].unique())
colors = sns.color_palette("Set2", n_colors=n_status_values)
ov.pl.embedding(
    adata,
    basis='X_umap',
    color='EBV_status',
    frameon='small',
    palette=colors,
    show=False,
    ax=ax,
)
plt.title('', fontsize=10)

plt.savefig("Results/05.celltype_annotation/05. UMAP_EBV_status.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Cell-proportion plot of 'Cell_type', grouped by 'EBV_status' (no legend).
fig, ax = plt.subplots(figsize=(3, 3))
ebv_proportion_kwargs = dict(
    adata=adata,
    celltype_clusters='Cell_type',
    groupby='EBV_status',
    legend=False,
    ax=ax,
)
ov.pl.cellproportion(**ebv_proportion_kwargs)

ax.set_xlabel('')
ax.tick_params(axis='x', rotation=0, labelsize=9)
plt.tight_layout()

plt.savefig('Results/05.celltype_annotation/05.major_celltype_by_EBV.pdf', format='pdf')
plt.show()

#### Distance metrics

In [None]:

# Keep only the cells whose EBV_status is 'Negative' or 'Positive'.
ebv_mask = adata.obs['EBV_status'].isin(['Negative', 'Positive'])
adata_ebv = adata[ebv_mask].copy()
# Snapshot the subset into .raw.
# NOTE(review): the original comment described this as raw count data; whether
# the matrix still holds raw counts at this point is not visible here — confirm.
adata_ebv.raw = adata_ebv.copy()

In [None]:
# Fix the category order of EBV_status in the subset: 'Negative' first, then 'Positive'.
ebv_levels = ['Negative', 'Positive']
adata_ebv.obs['EBV_status'] = adata_ebv.obs['EBV_status'].cat.reorder_categories(ebv_levels)

Distances between groups of single cells are computed with `pertpy.tl.Distance`.

In [None]:
# Distance calculator shared by the per-cell-type comparisons below.
# It is configured with the "euclidean" metric and reads its input from the
# obsm entry "scaled|original|X_pca" (presumably a PCA embedding stored by an
# upstream step — verify that this key exists in adata.obsm).
# NOTE(review): the summary bar plot further down labels these values
# "E-distance"; pertpy exposes a separate "edistance" metric, so confirm which
# one is intended and align either the metric or the axis label.
distance = pt.tl.Distance("euclidean", obsm_key="scaled|original|X_pca")

In [None]:
# Cell types to compare, in the order they appear in the summary plot.
categories = ['Epithelial', 'Fibroblasts', 'T','NK','B','Myeloid','Plasma','Mast','pDC','Neutrophils']


def ebv_pairwise_distance(cell_type):
    """Compute the pairwise distance table between EBV_status groups for one cell type.

    Restricts `adata_ebv` to the cells whose 'Cell_type' equals `cell_type`,
    prints how many cells each EBV_status group contributes, and runs
    `distance.pairwise` grouped by 'EBV_status'.

    Parameters
    ----------
    cell_type : str
        A value of adata_ebv.obs['Cell_type'].

    Returns
    -------
    The table returned by `distance.pairwise`; it is indexed below by the
    EBV_status labels on both axes.
    """
    subset = adata_ebv[adata_ebv.obs["Cell_type"] == cell_type]
    print('--- {} ---'.format(cell_type))
    # Group sizes give context for the distance (previously computed but never shown).
    print(subset.obs["EBV_status"].value_counts())
    pairwise_table = distance.pairwise(subset, groupby="EBV_status", show_progressbar=False)
    print(pairwise_table)
    return pairwise_table


# One pairwise table per cell type, replacing ten copy-pasted cells.
pairwise_by_cell_type = {cell_type: ebv_pairwise_distance(cell_type) for cell_type in categories}

# Read the Negative-vs-Positive distance straight from each table instead of
# transcribing printed numbers by hand, so the plot always reflects the current data.
# For reference, the values previously typed in were:
# [32.406464, 18.446907, 5.777791, 7.285064, 4.803761,
#  10.269726, 6.19246, 7.852163, 7.921268, 8.944743]
values = [
    float(pairwise_by_cell_type[cell_type].loc['Negative', 'Positive'])
    for cell_type in categories
]

# NOTE(review): the column is named 'E-distance' (the plotting cell relies on
# that name) although the Distance object is built with the "euclidean"
# metric — confirm the intended metric.
data = pd.DataFrame({'Cell type': categories, 'E-distance': values})

In [None]:
# Optimizing the plot with the legend moved into the plot area
plt.figure(figsize=(4.5, 3.5))

# Generate a color palette for the categories using 'Paired'
colors = plt.cm.Paired(np.linspace(0, 1, len(data)))

# Create the bar chart
bars = plt.bar(data['Cell type'], data['E-distance'], color=colors)

# Adding labels
plt.yticks(fontsize=11)
plt.ylabel('E-distance', fontsize=12)
plt.title('Distance metrics between EBV- and EBV+ groups', fontsize=11)

# Removing x-axis ticks and labels
plt.xticks([])
plt.xlabel('')  # Ensure no x-axis label

# Adding a legend inside the plot area
plt.legend(
    bars, 
    data['Cell type'], 
    loc='upper right', 
    title="", 
    fontsize=8,
    frameon=True
)

# Removing grid
plt.grid(False)

plt.tight_layout()

plt.savefig('Results/05.celltype_annotation/05.E-distance_of_celltype_by_EBV_status.pdf', format='pdf')
plt.show()


**<span style="font-size:16px;">Session information:</span>**

In [None]:
import sys
import platform
from importlib import metadata

# Interpreter and host details.
python_version = sys.version
os_info = platform.platform()
architecture = platform.architecture()[0]
cpu_info = platform.processor()

print("Python version:", python_version)
print("Operating system:", os_info)
print("System architecture:", architecture)
print("CPU info:", cpu_info)

# List every distribution installed in the environment with its version.
# importlib.metadata (standard library) replaces the deprecated pkg_resources API.
# Entries are de-duplicated and sorted by name so the listing is stable between
# runs; distributions with no readable name are skipped.
installed_packages = sorted(
    {
        (dist.metadata["Name"], dist.version)
        for dist in metadata.distributions()
        if dist.metadata["Name"]
    },
    key=lambda entry: entry[0].lower(),
)

# The header says "Installed": the list covers the whole environment, not only
# the packages imported by this notebook.
print("\nInstalled packages and their versions:")
for package_name, package_version in installed_packages:
    print(package_name, package_version)