# environment

In [None]:
import math
import matplotlib
import os
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib import rcParams
from matplotlib.pyplot import rc_context

# Root directory holding the single-cell inputs; later cells chdir into it.
sc_path = '/data1/CSJ/single_cell/'

# Figures written through Scanpy's `save=` argument go to `figdir`.
# "." is a relative path, so it follows the current working directory.
sc.settings.figdir = "."

# Record which Scanpy release produced the outputs.
print(f"Scanpy version: {sc.__version__}")

In [None]:
# Global figure defaults: 300 dpi for display and saved files, PNG as the
# default save format, "magma" as the default colormap, font size 10.
sc.set_figure_params(
    dpi=300,
    color_map="magma",
    fontsize=10,
    dpi_save=300,
    format='png',
    vector_friendly=True,
)

from matplotlib.colors import LinearSegmentedColormap

# Two-stop gradient for expression feature plots: light grey for the low end,
# a soft red (RGB 229, 127, 123) for the high end. An earlier pure-red
# endpoint (1, 0, 0) was assigned and immediately overwritten; only the value
# actually in effect is kept here.
colors = [(0.83, 0.83, 0.83), (229/255, 127/255, 123/255)]

# Build the continuous colormap from the two stops.
cmap_name = 'lightgrey_to_red'
custom_cmap = LinearSegmentedColormap.from_list(cmap_name, colors)

from matplotlib.colors import ListedColormap

# Colour sequences of the 'tab20' and 'tab20c' qualitative colormaps.
tab20 = plt.get_cmap('tab20').colors
tab20c = plt.get_cmap('tab20c').colors

# Concatenate both sequences into one longer categorical palette.
combined_color = tab20 + tab20c

# Switch matplotlib interactive mode off.
plt.ioff()
import mygene

In [None]:
# Extended categorical palette assembled from matplotlib qualitative colormaps.

# Colour sequence of every source colormap. tab10 is loaded as well, although
# it is not part of the concatenation below.
tab10, tab20, tab20b, tab20c, Set3, Pastel1, Pastel2 = (
    plt.get_cmap(cmap_id).colors
    for cmap_id in ('tab10', 'tab20', 'tab20b', 'tab20c', 'Set3', 'Pastel1', 'Pastel2')
)

# Join the sequences; earlier entries end up earlier in the final palette.
combined_colors3 = tab20 + tab20b + Set3 + Pastel1 + Pastel2 + tab20c

# Drop repeated colours while keeping first-occurrence order
# (dict keys are unique and preserve insertion order).
unique_colors = list(dict.fromkeys(map(tuple, combined_colors3)))
combined_colors = unique_colors

# Differential expression function

In [None]:
def perform_targeted_de_analysis_corrected(
    adata, 
    genes_of_interest, 
    cell_type_col='cell_type',
    gene_name_column='feature_name',  # Column containing gene names
    method='wilcoxon',
    use_raw=False,
    output_dir=None
):
    """
    Run differential expression for a chosen set of genes across cell groups.

    Gene names are looked up in ``adata.var[gene_name_column]`` and translated
    to the matching ``adata.var`` index entries (printed as ENSEMBL IDs). The
    object is subset to those genes, ``sc.tl.rank_genes_groups`` is run on the
    subset grouped by ``cell_type_col``, and the per-group tables are stacked
    into one DataFrame.

    NOTE(review): the test runs on the subset only, so adjusted p-values
    presumably reflect correction over the selected genes rather than the
    whole transcriptome - confirm against the scanpy documentation.
    NOTE(review): with ``use_raw=True`` scanpy reads ``adata.raw``, which may
    not be restricted to the selected genes - confirm before relying on it.

    Parameters
    ----------
    adata
        Object exposing ``obs``, ``var`` and ``var_names`` and supporting
        ``adata[:, ids].copy()`` (an AnnData object in this notebook).
    genes_of_interest
        Gene names to analyse. A single string is treated as one gene; any
        other iterable is converted to a list.
    cell_type_col
        Column of ``adata.obs`` used as the grouping variable.
    gene_name_column
        Column of ``adata.var`` holding the gene names to match against.
    method
        Passed through as ``method`` to ``sc.tl.rank_genes_groups``.
    use_raw
        Passed through as ``use_raw`` to ``sc.tl.rank_genes_groups``.
    output_dir
        If given and results exist, the table is written to
        ``<output_dir>/targeted_differential_expression.csv`` (the directory
        is created when missing).

    Returns
    -------
    pandas.DataFrame
        One row per (gene, group). Columns, when present in the scanpy
        output: ``gene_name``, ``names``, ``scores``, ``pvals``,
        ``pvals_adj``, ``logfoldchanges``, ``cell_type``, followed by
        ``pct_nz_group`` and ``pct_nz_reference``. The ``cell_type`` column
        always carries the group label, whatever ``cell_type_col`` is called.
        Empty if no group produced results.

    Raises
    ------
    ValueError
        If ``cell_type_col`` or ``gene_name_column`` is missing, if
        ``genes_of_interest`` is empty, or if no requested gene is found.
    """
    # Parameter validation
    if cell_type_col not in adata.obs.columns:
        raise ValueError(f"Column '{cell_type_col}' not found in adata.obs")
    
    if gene_name_column not in adata.var.columns:
        raise ValueError(f"Gene name column '{gene_name_column}' not found in adata.var")
    
    # Normalise the gene list: a bare string would be rejected by Series.isin
    # and an array has no unambiguous truth value.
    if isinstance(genes_of_interest, str):
        genes_of_interest = [genes_of_interest] if genes_of_interest else []
    elif genes_of_interest is not None:
        genes_of_interest = list(genes_of_interest)
    
    if not genes_of_interest:
        raise ValueError("genes_of_interest cannot be empty")
    
    # Report the column actually used for the lookup (identical to the former
    # message when the default 'feature_name' is used).
    print(f"Mapping gene names to ENSEMBL IDs using {gene_name_column} column...")
    
    # Find matching genes by name
    matching_genes = adata.var[adata.var[gene_name_column].isin(genes_of_interest)]
    
    if matching_genes.empty:
        raise ValueError("No matching genes found")
    
    # Get ENSEMBL IDs (var index) and corresponding gene names
    ensembl_ids = matching_genes.index.tolist()
    gene_names = matching_genes[gene_name_column].tolist()
    
    print(f"Found {len(ensembl_ids)} matching genes:")
    for ensembl_id, gene_name in zip(ensembl_ids, gene_names):
        print(f"  {gene_name} -> {ensembl_id}")
    
    # Make partially failed lookups visible instead of silently dropping genes
    found_names = set(gene_names)
    not_found = [g for g in dict.fromkeys(genes_of_interest) if g not in found_names]
    if not_found:
        print(
            f"Warning: The following requested genes were not found in "
            f"'{gene_name_column}': {not_found}"
        )
    
    # Check which ENSEMBL IDs exist in the expression matrix
    existing_ids = [idx for idx in ensembl_ids if idx in adata.var_names]
    
    if len(existing_ids) != len(ensembl_ids):
        missing_ids = set(ensembl_ids) - set(existing_ids)
        print(f"Warning: The following ENSEMBL IDs are not present in the matrix: {missing_ids}")
    
    if not existing_ids:
        raise ValueError("No genes are present in the expression matrix")
    
    # Mapping used to translate scanpy's result names back to gene names
    ensembl_to_gene_name = dict(zip(ensembl_ids, gene_names))
    
    print(f"Performing differential expression analysis on {len(existing_ids)} genes...")
    
    # Subset adata to the matching genes (copy so the caller's object is untouched)
    adata_subset = adata[:, existing_ids].copy()
    
    # Run rank_genes_groups; pts=True additionally requests the fraction of
    # expressing cells per group
    sc.tl.rank_genes_groups(
        adata_subset, 
        groupby=cell_type_col, 
        method=method,
        use_raw=use_raw,
        pts=True
    )
    
    # Collect results
    group_labels = adata_subset.obs[cell_type_col].unique()
    results_list = []
    
    for group_label in group_labels:
        # Best effort per group: a failing group is reported and skipped so
        # the remaining groups are still returned.
        try:
            group_df = sc.get.rank_genes_groups_df(adata_subset, group=group_label)
            if not group_df.empty:
                group_df = group_df.copy()
                group_df['cell_type'] = group_label
                # Map back to gene names
                group_df['gene_name'] = group_df['names'].map(ensembl_to_gene_name)
                results_list.append(group_df)
        except Exception as e:
            print(f"Error processing cell type {group_label}: {e}")
    
    # Combine results
    if results_list:
        all_results = pd.concat(results_list, ignore_index=True)
        # Reorder columns: gene_name first. The expression-fraction columns
        # produced by pts=True are appended at the end so that existing column
        # positions stay unchanged.
        cols = [
            'gene_name', 'names', 'scores', 'pvals', 'pvals_adj',
            'logfoldchanges', 'cell_type', 'pct_nz_group', 'pct_nz_reference',
        ]
        cols = [col for col in cols if col in all_results.columns]
        all_results = all_results[cols]
    else:
        all_results = pd.DataFrame()
        print("Warning: No differential expression results found")
    
    # Save results if output directory is provided
    if output_dir and not all_results.empty:
        from pathlib import Path
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        
        output_file = output_path / 'targeted_differential_expression.csv'
        all_results.to_csv(output_file, index=False)
        print(f"Results saved to: {output_file}")
    
    # Print summary statistics
    print("\nAnalysis completed!")
    print(f"Number of genes analyzed: {len(existing_ids)}")
    
    if not all_results.empty:
        print(f"Total differential expression results: {len(all_results)}")
        print(f"Cell types involved: {all_results['cell_type'].nunique()}")
        
        # Show significant results (adjusted p < 0.05)
        significant_results = all_results[all_results['pvals_adj'] < 0.05]
        if not significant_results.empty:
            print(f"\nSignificant differentially expressed genes (p_adj < 0.05): {len(significant_results)}")
            # Sort by absolute log fold change (largest effect first)
            significant_results = significant_results.reindex(
                significant_results['logfoldchanges'].abs().sort_values(ascending=False).index
            )
            print(significant_results.head(10))
        else:
            print("No significantly differentially expressed genes found (p_adj < 0.05)")
    else:
        print("No differential expression results found")
    
    return all_results

# 01. Brain (non-neuronal)

In [None]:
# Work from the shared single-cell root so the relative paths below resolve.
os.chdir(sc_path)

# Load the AnnData object for this section.
# NOTE(review): relative to sc_path this resolves to
# '<sc_path>/single_cell/brain-888263.h5ad' - confirm the nested
# 'single_cell' folder is intended. The variable is named `adata_neu`
# although the section heading says non-neuronal - confirm which dataset
# this file holds.
file1 = "./single_cell/brain-888263.h5ad"
adata_neu = sc.read_h5ad(file1)

# Output folder for this analysis; exist_ok avoids the separate existence
# check (and its race) and matches how the violin output folder is created.
newpath = './brain_non/'
os.makedirs(newpath, exist_ok=True)

# All files written from here on land in the analysis folder.
os.chdir(newpath)

# Keep an untouched copy of the loaded object as a backup before later cells
# modify `adata_neu`. This duplicates the whole object in memory.
adata_raw = adata_neu.copy()
adata_neu

In [None]:
# Inspect the annotation columns and alias the UMAP coordinates.

# Print the distinct labels found in each annotation column of interest.
for header, obs_column in (
    ("Unique values in cell_type:", 'cell_type'),
    ("\nUnique values in supercluster_term:", 'supercluster_term'),
):
    print(header)
    print(adata_neu.obs[obs_column].unique())

# `adata_neu.obsm.keys()` lists the embeddings stored in the object.
# This file stores the UMAP coordinates under 'X_UMAP'; expose them under the
# key 'umap' as well (swap in 'X_umap' if that is the key in your object).
adata_neu.obsm['umap'] = adata_neu.obsm['X_UMAP']

In [None]:
# Genes examined throughout this section, given as gene symbols.
genes_of_interest = ['LGALS4','GDF15','APOM','GHRL']

# adata.var column holding the gene symbols, and the adata.obs column used
# to group cells in every plot below.
gene_symbol_col='feature_name'
cell_type_col='supercluster_term'

# Save the same dot plot twice, once as PDF and once as PNG.
for filename in ('dot_plot.pdf', 'dot_plot.png'):
    sc.pl.dotplot(
        adata_neu,
        var_names=genes_of_interest,
        groupby=cell_type_col,
        gene_symbols=gene_symbol_col,
        cmap='Blues',
        save=filename,
        show=False,
    )

In [None]:
# Build the dot plot object without drawing it, so that the numbers behind
# the plot can be exported. return_fig=True hands back the plot object
# rather than rendering.
dot_obj = sc.pl.dotplot(
    adata_neu,
    var_names=genes_of_interest,
    groupby=cell_type_col,
    gene_symbols=gene_symbol_col,
    return_fig=True,
)

# Tables backing the dot colour and the dot size.
# NOTE(review): presumably mean expression and fraction of expressing cells
# (0-1 rather than percent) - confirm against the scanpy DotPlot docs.
dot_data = dot_obj.dot_color_df
dot_size = dot_obj.dot_size_df

# Put both tables side by side under a two-level column header.
dot_table = pd.concat(
    [dot_data, dot_size],
    axis=1,
    keys=["mean_expression", "percentage_expressed"],
)

# Write the combined table next to the figures.
dot_table.to_csv("dotplot_data.csv")

In [None]:
# Targeted differential expression for the genes of interest, grouped by the
# same annotation column as the plots. The CSV is written into the current
# working directory.
results = perform_targeted_de_analysis_corrected(
    adata=adata_neu,
    genes_of_interest=genes_of_interest,
    gene_name_column=gene_symbol_col,  # var column that holds the gene symbols
    cell_type_col=cell_type_col,
    output_dir='.',
)

In [None]:
# Order cell types by abundance (largest first). This single ordering defines
# both the categorical levels and the exported colour table. Recomputing the
# counts after the conversion can order ties differently, which would make
# the table disagree with the category order.
cell_type_counts = adata_neu.obs[cell_type_col].value_counts()
sorted_cell_types = cell_type_counts.index.tolist()

# Convert the grouping column to an ordered categorical following that order.
adata_neu.obs[cell_type_col] = pd.Categorical(
    adata_neu.obs[cell_type_col],
    categories=sorted_cell_types,  # ordered by descending count
    ordered=True
)

# Palette defined earlier in the notebook; colour i goes to the i-th most
# abundant cell type.
# NOTE(review): presumably scanpy assigns a list palette in category order,
# so this table mirrors the plot colours - confirm.
palette = combined_colors

# Cell types beyond the palette length get no colour; say so explicitly
# instead of dropping them from the table without notice.
if len(sorted_cell_types) > len(palette):
    print(
        f"Warning: {len(sorted_cell_types)} cell types but only {len(palette)} colors; "
        f"no color exported for: {sorted_cell_types[len(palette):]}"
    )

# Map each cell type to its RGB components and hex string.
# zip stops at the shorter of the two sequences.
color_mapping = {
    cell_type: {
        'R': rgb[0],
        'G': rgb[1],
        'B': rgb[2],
        'Hex': mcolors.to_hex(rgb),
    }
    for cell_type, rgb in zip(sorted_cell_types, palette)
}

# One row per cell type, written as a tab-separated file.
color_df = pd.DataFrame.from_dict(color_mapping, orient='index')
color_df.index.name = 'Cell_Type'
color_df.to_csv('cell_type_color_mapping.tsv', sep='\t')

In [None]:
## PDF

# UMAP coloured by cell type, saved twice: first with the labels drawn on the
# data, then with scanpy's default legend placement.
for plot_name, legend_kwargs in (
    ('cluster_plot.pdf', {'legend_loc': "on data"}),
    ('cluster_plot2.pdf', {}),
):
    with rc_context({"figure.figsize": (10, 10)}):
        sc.pl.umap(
            adata_neu,
            color=cell_type_col,
            gene_symbols=gene_symbol_col,
            palette=combined_colors,
            title=' ',
            frameon=False,
            s=10,
            legend_fontsize=8,
            legend_fontweight='bold',
            legend_fontoutline=1,
            save=plot_name,
            show=False,
            **legend_kwargs,
        )

# Stacked violin plot of the genes of interest, with axes swapped.
filename='violin_plot.pdf'
with rc_context({"figure.figsize": (20, 12)}):
    sc.pl.stacked_violin(
        adata_neu,
        var_names=genes_of_interest,
        groupby=cell_type_col,
        gene_symbols=gene_symbol_col,
        swap_axes=True,
        save=filename,
        show=False,
    )

In [None]:
## PNG

# UMAP coloured by cell type, saved twice: first with the labels drawn on the
# data, then with scanpy's default legend placement.
for plot_name, legend_kwargs in (
    ('cluster_plot.png', {'legend_loc': "on data"}),
    ('cluster_plot2.png', {}),
):
    with rc_context({"figure.figsize": (10, 10)}):
        sc.pl.umap(
            adata_neu,
            color=cell_type_col,
            gene_symbols=gene_symbol_col,
            palette=combined_colors,
            title=' ',
            frameon=False,
            s=10,
            legend_fontsize=8,
            legend_fontweight='bold',
            legend_fontoutline=1,
            save=plot_name,
            show=False,
            **legend_kwargs,
        )

# Stacked violin plot of the genes of interest, with axes swapped.
filename='violin_plot.png'
with rc_context({"figure.figsize": (20, 12)}):
    sc.pl.stacked_violin(
        adata_neu,
        var_names=genes_of_interest,
        groupby=cell_type_col,
        gene_symbols=gene_symbol_col,
        swap_axes=True,
        save=filename,
        show=False,
    )

In [None]:
# Grid of per-gene UMAP feature plots: 4 panels per row, 5x5 inches each.
ncol = 4
nrow = math.ceil(len(genes_of_interest) / ncol)  # rows needed to fit every gene

# squeeze=False always returns a 2-D array of Axes, so one ravel() gives a
# flat sequence for any grid shape.
fig, axs = plt.subplots(nrow, ncol, figsize=(5 * ncol, 5 * nrow), squeeze=False)
axs = axs.ravel()

# Plot UMAP for each gene of interest
for i, gene in enumerate(genes_of_interest):
    ax = axs[i]
    with rc_context({"figure.figsize": (5, 5)}):
        sc.pl.umap(
            adata_neu,
            color=[gene],
            ax=ax,
            gene_symbols=gene_symbol_col,
            frameon=False,
            show=False,
            legend_fontweight='bold',
            s=10,
            color_map=custom_cmap,
            colorbar_loc=None    # scanpy's own colorbar is disabled ...
        )
        # ... and a smaller one is attached by hand. Passing ax=ax ties the
        # colorbar to this panel instead of whichever axes is current.
        cbar = fig.colorbar(
            ax.collections[0], ax=ax, orientation='vertical', shrink=0.5, pad=0.01
        )

# Remove unused subplots (if any)
for unused_ax in axs[len(genes_of_interest):]:
    fig.delaxes(unused_ax)

# Turn off interactive mode and finalize layout
plt.ioff()
fig.tight_layout()

# Save high-resolution versions of this specific figure
fig.savefig('feature_plot_all.pdf', dpi=300)
fig.savefig('feature_plot_all.png', dpi=300)

In [None]:
# Grid of per-gene UMAP feature plots with the colour scale capped at the
# 99th percentile: 4 panels per row, 5x5 inches each.
ncol = 4
nrow = math.ceil(len(genes_of_interest) / ncol)  # rows needed to fit every gene

# squeeze=False always returns a 2-D array of Axes, so one ravel() gives a
# flat sequence for any grid shape.
fig, axs = plt.subplots(nrow, ncol, figsize=(5 * ncol, 5 * nrow), squeeze=False)
axs = axs.ravel()

# Plot UMAP for each gene of interest
for i, gene in enumerate(genes_of_interest):
    ax = axs[i]
    with rc_context({"figure.figsize": (5, 5)}):
        sc.pl.umap(
            adata_neu,
            color=[gene],
            ax=ax,
            gene_symbols=gene_symbol_col,
            frameon=False,
            show=False,
            legend_fontweight='bold',
            s=10,
            color_map=custom_cmap,
            vmax='p99',           # Use 99th percentile for color scaling
            colorbar_loc=None     # scanpy's own colorbar is disabled ...
        )
        # ... and a smaller one is attached by hand. Passing ax=ax ties the
        # colorbar to this panel instead of whichever axes is current.
        cbar = fig.colorbar(
            ax.collections[0], ax=ax, orientation='vertical', shrink=0.4, pad=0.01
        )

# Remove unused subplots (if any)
for unused_ax in axs[len(genes_of_interest):]:
    fig.delaxes(unused_ax)

# Turn off interactive mode and finalize layout
plt.ioff()
fig.tight_layout()

# Save high-resolution versions of this specific figure
fig.savefig('feature_plot_allp99.pdf', dpi=300)
fig.savefig('feature_plot_allp99.png', dpi=300)

In [None]:
# Per-gene violin plots grouped by cell type, one PDF and one PNG per gene.

# Output folder for the per-gene files.
vol_dir = "violin"
os.makedirs(vol_dir, exist_ok=True)

# Figure width grows with the number of distinct cell types (0.5 inch each).
n_ct = adata_neu.obs[cell_type_col].nunique()
fig_wid = 0.5 * n_ct

for g in genes_of_interest:
    # Carriage return keeps the progress message on a single console line.
    print(f"\rProcessing gene: {g}", end="", flush=True)

    with rc_context({"figure.figsize": (fig_wid, 9)}):
        sc.pl.stacked_violin(
            adata_neu,
            var_names=[g],                  # one gene symbol per figure
            groupby=cell_type_col,
            gene_symbols=gene_symbol_col,
            use_raw=False,
            swap_axes=True,
            figsize=(fig_wid, 1),           # explicit size passed to the plot call
            show=False,
        )

        # Write the current figure in both formats (PNG at 300 dpi), then
        # close it so figures do not pile up across iterations.
        for suffix, save_kwargs in (("pdf", {}), ("png", {"dpi": 300})):
            plt.savefig(
                os.path.join(vol_dir, f"{g}.{suffix}"),
                bbox_inches="tight",
                **save_kwargs,
            )
        plt.close()