### Find Markers

In [None]:
import scanpy as sc
import pandas as pd
import os

# ----------------------------------
# Step: Run DE analysis for clusters
# ----------------------------------
# Rank genes between the groups of the 'leiden' column with the Wilcoxon
# method; the results are stored under the 'rank_genes' key.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon', key_added='rank_genes')

# ---------------------
# Step: Preview results
# ---------------------
# group=None requests the results for all groups in one table
# (the 'group' column of this table is used later when selecting top markers).
df_markers = sc.get.rank_genes_groups_df(adata, key='rank_genes', group=None)
print(df_markers.head(n=20))

In [None]:
# --------------------------------
# Step: Visualize top marker genes
# --------------------------------
# One ranking panel per group, 30 genes each; sharey=False lets every panel
# use its own y-axis range.
sc.pl.rank_genes_groups(adata, n_genes=30, sharey=False, key='rank_genes')

# ------------------------
# Step: Cluster dendrogram
# ------------------------
# Computed before the dotplot below
# (presumably so the dotplot can order clusters by it - confirm).
sc.tl.dendrogram(adata, groupby='leiden')

# ---------------------------------
# Step: Dotplot of top marker genes
# ---------------------------------
# Dot colour encodes log fold change on a symmetric scale (-3 .. 3);
# 5 genes per group, restricted by min_logfoldchange=1, values read from adata.raw.
_dotplot_colors = dict(cmap='coolwarm', vmin=-3, vmax=3)
sc.pl.rank_genes_groups_dotplot(
    adata,
    n_genes=5,
    key='rank_genes',
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    use_raw=True,
    **_dotplot_colors,
)

In [None]:
# --------------------------------------
# Step: Filter and save top marker genes
# --------------------------------------
# Keep only genes whose adjusted p-value is below 0.05
df_filtered = df_markers[df_markers['pvals_adj'] < 0.05]

# Select top n markers per cluster (ranked by scores, then logfoldchanges).
#
# Sorting followed by groupby().head(n) is used instead of
# groupby().apply(nlargest): recent pandas releases no longer pass the
# grouping column to apply(), so together with reset_index(drop=True) the
# 'group' (cluster) column would disappear from the exported table.
# observed=True skips empty categories when 'group' is categorical.
n = 30
df_top = (
    df_filtered
    .sort_values(['group', 'scores', 'logfoldchanges'], ascending=[True, False, False])
    .groupby('group', observed=True)
    .head(n)
    .reset_index(drop=True)
)

# Ensure the output directory exists
output_dir = "./sc_result"
os.makedirs(output_dir, exist_ok=True)

# Save selected top marker genes to an Excel file (row index is not written)
output_file = os.path.join(output_dir, f"marker_genes_top{n}.xlsx")
df_top.to_excel(output_file, index=False)

print(f"Top {n} marker genes per cluster have been saved to: {output_file}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# ----------------------------------------
# Plot: Show selected marker genes on UMAP
# ----------------------------------------
genes = ['CD3D', 'CD4', 'FOXP3', 'CD8A', 'NKG7', 'EPCAM']  # Example gene list
n_per_row = 3  # Number of subplots per row

# Enough rows to hold every gene
n_rows = int(np.ceil(len(genes) / n_per_row))
# squeeze=False always yields a 2D axes array, even for a single row or column
fig, axes = plt.subplots(
    n_rows, n_per_row, figsize=(6*n_per_row, 6*n_rows), squeeze=False
)
flat_axes = axes.ravel()

for ax, gene in zip(flat_axes, genes):
    # Cap the colour scale at the 99.5th percentile of the gene's values in
    # adata.raw so that a few extreme cells do not wash out the plot.
    # NOTE(review): assumes adata.raw is set, as the original code did.
    expr = adata.raw[:, gene].X
    # raw.X may be a sparse matrix or a dense array; only sparse has .toarray()
    expr = expr.toarray() if hasattr(expr, "toarray") else np.asarray(expr)
    cap = float(np.percentile(expr, 99.5))
    # For rarely expressed genes the percentile is 0, which would collapse the
    # colour scale to a single value; fall back to automatic scaling instead.
    vmax = cap if cap > 0 else None

    sc.pl.umap(
        adata,
        color=gene,
        ax=ax,
        show=False,  # keep drawing into the shared figure
        size=10,
        frameon=False,
        vmax=vmax,
    )

    ax.set_title(f"{gene}", fontsize=14)

# Hide unused subplots if total genes not multiple of n_per_row
for ax in flat_axes[len(genes):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### Basic Annotation

In [None]:
# -----------------------------------------
# Step: Define cluster-to-cell type mapping
# -----------------------------------------
# Keys are Leiden cluster labels (as strings); values are the names to assign.
cluster_annotation = {
    "0": "Cell_type_0",
    "1": "Cell_type_1",
    "2": "Cell_type_2",
    "3": "Cell_type_3",
    # Further clusters can be added here
}

# -------------------------------
# Step: Apply annotation to adata
# -------------------------------
# Each cell receives the name mapped to its Leiden cluster label.
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_annotation)

# ------------------------------
# Step: Summarize cluster counts
# ------------------------------
print("Cluster annotation summary:")
for cluster in cluster_annotation:
    cell_type = cluster_annotation[cluster]
    # Number of cells whose Leiden label equals this cluster
    count = (adata.obs['leiden'] == cluster).sum()
    print(f"Cluster {cluster}: {cell_type} ({count} cells)")

# -------------------------
# Step: Save annotated data
# -------------------------
# Written gzip-compressed to the working directory.
adata.write("adata_after_annotation.h5ad", compression='gzip')
print("Save annotated adata. *adata.raw contains log1p normalized counts")

### Major Type Annotation

In [None]:
# ------------------------------------
# Step: Define major cell type mapping
# ------------------------------------
# Each major type lists the (integer) cluster ids that belong to it.
major_type_map = {
    "cell_type_major_0": [0, 1, 2, 3, 7, 8, 13],
    "cell_type_major_1": [5, 6],
    "cell_type_major_2": [4, 10, 11],
    "cell_type_major_3": [9, 12]
}

# -------------------------------------------
# Step: Invert mapping (cluster → major type)
# -------------------------------------------
# Cluster ids are converted to strings because Leiden clusters are stored as strings.
cluster_to_major = {
    str(cl): major_name
    for major_name, clusters in major_type_map.items()
    for cl in clusters
}

# --------------------------------------------
# Step: Add major type annotation to adata.obs
# --------------------------------------------
adata.obs["cell_type_major"] = adata.obs["leiden"].map(cluster_to_major)

# ------------------------------
# Step: Print annotation summary
# ------------------------------
# Clusters are listed in numeric order rather than string order.
for cl, major in sorted(cluster_to_major.items(), key=lambda item: int(item[0])):
    count = (adata.obs["leiden"] == cl).sum()
    print(f"Cluster {cl}: {major} ({count} cells)")

# --------------------------
# Step: Save annotated adata
# --------------------------
# Uses the same file name as the basic-annotation step.
adata.write("adata_after_annotation.h5ad", compression='gzip')
print("Save annotated adata. *adata.raw contains log1p normalized counts")

### Visualization

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt

# -------------------------------
# Option: choose annotation level
# -------------------------------
category_key = "cell_type"
# category_key = "cell_type_major"

# -----------------------------------
# Step: Build color map automatically
# -----------------------------------
if category_key not in adata.obs.columns:
    raise ValueError(f"Column '{category_key}' not found in adata.obs")

# Ensure a categorical column so that the category list is available
adata.obs[category_key] = adata.obs[category_key].astype("category")
categories = adata.obs[category_key].cat.categories

# Use the 20-colour palette and switch to the 102-colour one when it is too small
palette = sc.pl.palettes.default_20
if len(categories) > len(palette):
    palette = sc.pl.palettes.default_102

# One fixed colour per category, assigned in category order
color_map = dict(zip(categories, palette[:len(categories)]))

# ------------------------------------
# Plot: UMAP with cell type annotation
# ------------------------------------
sc.set_figure_params(figsize=(8, 8), dpi=100, facecolor="white")

sc.pl.umap(
    adata,
    color=category_key,
    title=f"UMAP: {category_key.replace('_', ' ').title()}",
    palette=color_map,
    legend_fontsize=12,
    size=10,
    frameon=False,
    # Without show=False scanpy displays the figure itself, and the
    # tight_layout()/show() calls below would act on a new, empty figure.
    show=False,
)

plt.tight_layout()
plt.show()

### Subtype Annotation

In [None]:
import scanpy as sc
import scanpy.external as sce
import matplotlib.pyplot as plt

# ------------------------------------------
# Step: Select clusters for subtype analysis
# ------------------------------------------
clusters_to_keep = ['0', '1', '2', '3']

# Boolean mask over cells whose Leiden label is in the list; .copy() makes the
# subset an independent object instead of a view of adata.
_keep_mask = adata.obs['leiden'].isin(clusters_to_keep)
adata_subtype = adata[_keep_mask].copy()

print(f"Original shape: {adata.shape}")
print(f"Filtered shape (subtype): {adata_subtype.shape}")

# ---------------------------------
# Step: Visualize selected clusters
# ---------------------------------
sc.pl.umap(
    adata_subtype,
    color='leiden',
    size=10,
    frameon=False,
    title='UMAP of Selected Clusters',
)

# The downstream subtype workflow (PCA → Harmony → Leiden → Annotation) would
# follow here; it is omitted for clarity.