In [5]:
import scanpy as sc, matplotlib.pyplot as plt, seaborn as sns, numpy as np, pandas as pd, os
from collections import defaultdict
from joblib import Parallel, delayed
from multiprocessing import Pool, cpu_count

In [2]:
# Read the AnnData object stored in bipolar_raw.h5ad.
adata = sc.read_h5ad(filename="bipolar_raw.h5ad")

In [3]:
# Show the AnnData summary: matrix dimensions plus the obs / var / uns / obsm keys.
adata

AnnData object with n_obs × n_vars = 147523 × 32034
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'pANN', 'sampleid', 'age', 'gender', 'reference', 'accession', 'sample_note', 'subclass_label', 'author_cell_type', 'development_stage_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'donor_id', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'library_platform', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_pca', 'X_scVI', 'X_umap'

### Cell-type marker identification (just in case)

In [5]:
# Rank genes for every 'cell_type' group with the Wilcoxon method.
# use_raw=False asks scanpy to test on adata.X instead of adata.raw; the
# ranking is read back later from adata.uns['rank_genes_groups'].
sc.tl.rank_genes_groups(
    adata,
    groupby="cell_type",
    method="wilcoxon",
    use_raw=False,
)

In [11]:
# Export the marker-gene ranking of each cell type to its own CSV file.
result = adata.uns['rank_genes_groups']
# 'names' is a structured array; its field names are the group labels.
groups = result['names'].dtype.names

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before the first write (needed on a fresh checkout).
os.makedirs('results/cell_type_deg_rank_gene', exist_ok=True)

for group in groups:
    # One row per ranked gene; the column order below is the CSV column order.
    df = pd.DataFrame({
        'gene': result['names'][group],
        'logfoldchanges': result['logfoldchanges'][group],
        'pvals': result['pvals'][group],
        'pvals_adj': result['pvals_adj'][group],
        'scores': result['scores'][group],
    })
    df.to_csv(f'results/cell_type_deg_rank_gene/deg_{group}.csv', index=False)

### Age DEG oldest vs youngest

Mice (*Mus musculus*) are typically fully mature by week 8 (W8). Here, W8 is compared with an early developmental stage (P17, postnatal day 17).

In [None]:
# Serial pseudobulk export for the W8-vs-P17 comparison: for every cell type,
# sum the expression matrix over the cells of each (age, donor_id) pair and
# write the genes x samples matrix plus the sample metadata as CSV files.
counts_dir = "results/age_deg_edgeR/counts"
meta_dir = "results/age_deg_edgeR/meta_data"
# DataFrame.to_csv does not create missing folders, so create them up front.
os.makedirs(counts_dir, exist_ok=True)
os.makedirs(meta_dir, exist_ok=True)

cell_types = adata.obs["cell_type"].unique().tolist()
for cell_type in cell_types:
    adata_sub = adata[
        (adata.obs['cell_type'] == cell_type) &
        (adata.obs['age'].isin(['W8', 'P17']))
    ].copy()

    # Use raw counts if available; otherwise .X is left untouched.
    # NOTE(review): in the fallback case .X is assumed to already hold raw
    # counts - confirm before feeding the output to a count-based DE tool.
    if adata_sub.raw is not None:
        adata_sub.X = adata_sub.raw.X

    # Create pseudobulk by donor within each age
    grouped_counts = {}
    meta = []

    # observed=True: iterate only over (age, donor) pairs that actually occur,
    # consistent with the groupby call in process_cell_type.
    for (age, donor), idx in adata_sub.obs.groupby(['age', 'donor_id'], observed=True).groups.items():
        # idx holds the obs labels of this group; summing over axis 0 gives
        # one total per gene, flattened to a 1-D vector.
        pseudobulk = np.asarray(adata_sub[idx].X.sum(axis=0)).flatten()
        sample_name = f"{donor}_{age}"
        grouped_counts[sample_name] = pseudobulk
        meta.append({'sample': sample_name, 'age': age, 'donor': donor})

    # A cell type without any W8/P17 cells yields no samples; skip it instead
    # of failing on an empty frame.
    if not grouped_counts:
        print(f"Skipping {cell_type}: no W8/P17 cells")
        continue

    # genes x samples: rows are labelled by gene, columns by sample name.
    counts_df = pd.DataFrame(grouped_counts, index=adata_sub.var_names)
    meta_df = pd.DataFrame(meta)

    # Save
    counts_df.to_csv(f"{counts_dir}/counts_{cell_type}.csv")
    meta_df.to_csv(f"{meta_dir}/meta_{cell_type}.csv", index=False)

In [8]:
cell_types = adata.obs["cell_type"].unique().tolist()
def process_cell_type(cell_type):
    """Export per-donor pseudobulk counts for one cell type (W8 vs. P17).

    Reads the notebook-level ``adata``, keeps the cells of ``cell_type`` whose
    ``age`` is 'W8' or 'P17', sums the expression matrix over the cells of
    each (age, donor_id) pair, and writes two CSV files:

    * ``results/age_deg_edgeR/counts/counts_<cell_type>.csv`` - genes x samples
    * ``results/age_deg_edgeR/meta_data/meta_<cell_type>.csv`` - one row per
      sample with the columns ``sample``, ``age`` and ``donor``

    Parameters
    ----------
    cell_type : str
        A value of ``adata.obs['cell_type']``.

    Returns
    -------
    str
        A one-line status message: finished, or skipped when the cell type
        has no W8/P17 cells.
    """
    counts_dir = "results/age_deg_edgeR/counts"
    meta_dir = "results/age_deg_edgeR/meta_data"
    # DataFrame.to_csv does not create missing folders; exist_ok=True keeps
    # this safe when several workers reach this point at the same time.
    os.makedirs(counts_dir, exist_ok=True)
    os.makedirs(meta_dir, exist_ok=True)

    adata_sub = adata[
        (adata.obs['cell_type'] == cell_type) &
        (adata.obs['age'].isin(['W8', 'P17']))
    ].copy()

    # Use raw counts if available; otherwise .X is left untouched.
    # NOTE(review): in the fallback case .X is assumed to already hold raw
    # counts - confirm before feeding the output to a count-based DE tool.
    if adata_sub.raw is not None:
        adata_sub.X = adata_sub.raw.X

    grouped_counts = {}
    meta = []

    # observed=True: iterate only over (age, donor) pairs that actually occur.
    for (age, donor), idx in adata_sub.obs.groupby(['age', 'donor_id'], observed = True).groups.items():
        # idx holds the obs labels of this group; summing over axis 0 gives
        # one total per gene, flattened to a 1-D vector.
        pseudobulk = np.asarray(adata_sub[idx].X.sum(axis=0)).flatten()
        sample_name = f"{donor}_{age}"
        grouped_counts[sample_name] = pseudobulk
        meta.append({'sample': sample_name, 'age': age, 'donor': donor})

    # Without any W8/P17 cells there is nothing to export; report it instead
    # of raising, so one empty cell type cannot abort a whole batch of calls.
    if not grouped_counts:
        return f"⚠️ Skipped {cell_type}: no W8/P17 cells"

    # genes x samples: rows are labelled by gene, columns by sample name.
    counts_df = pd.DataFrame(grouped_counts, index=adata_sub.var_names)
    meta_df = pd.DataFrame(meta)

    # Save
    counts_df.to_csv(f"{counts_dir}/counts_{cell_type}.csv")
    meta_df.to_csv(f"{meta_dir}/meta_{cell_type}.csv", index=False)

    return f"✅ Finished {cell_type}"

# Run process_cell_type for every cell type in a worker pool sized to the
# smaller of the CPU count and the number of cell types.
# NOTE(review): the workers need both process_cell_type and the notebook-level
# adata; presumably this relies on the "fork" start method - confirm before
# running on a platform that spawns worker processes instead.
with Pool(
    processes=min(cpu_count(), len(cell_types)),
) as pool:
    results = pool.map(process_cell_type, cell_types)

# One status line per cell type, in the order of cell_types.
print("\n".join(results))

✅ Finished rod bipolar cell
✅ Finished type 6 cone bipolar cell (sensu Mus)
✅ Finished type 5a cone bipolar cell
✅ Finished type 3b cone bipolar cell
✅ Finished type 5 cone bipolar cell (sensu Mus)
✅ Finished type 5b cone bipolar cell
✅ Finished type 9 cone bipolar cell (sensu Mus)
✅ Finished type 3a cone bipolar cell
✅ Finished type 8 cone bipolar cell (sensu Mus)
✅ Finished type 7 cone bipolar cell (sensu Mus)
✅ Finished type 4 cone bipolar cell (sensu Mus)
✅ Finished type 1 cone bipolar cell (sensu Mus)
✅ Finished type 2 cone bipolar cell (sensu Mus)


In [9]:
# Distinct labels found in adata.obs['cell_type'], as a plain Python list.
cell_types = (
    adata.obs["cell_type"]
    .unique()
    .tolist()
)

In [10]:
# Display the list of cell-type labels.
cell_types

['rod bipolar cell',
 'type 6 cone bipolar cell (sensu Mus)',
 'type 5a cone bipolar cell',
 'type 3b cone bipolar cell',
 'type 5 cone bipolar cell (sensu Mus)',
 'type 5b cone bipolar cell',
 'type 9 cone bipolar cell (sensu Mus)',
 'type 3a cone bipolar cell',
 'type 8 cone bipolar cell (sensu Mus)',
 'type 7 cone bipolar cell (sensu Mus)',
 'type 4 cone bipolar cell (sensu Mus)',
 'type 1 cone bipolar cell (sensu Mus)',
 'type 2 cone bipolar cell (sensu Mus)']

### Save all expressed genes as background

In [16]:
# Background gene set: every gene whose total over all cells in adata.raw is
# greater than zero, written to results/expressed_genes.csv.
raw_counts = adata.raw.X
gene_names = adata.raw.var_names

# Per-gene totals (sum along axis 0). scipy sparse matrices and numpy arrays
# both provide .sum(), so no sparse/dense branching is needed:
# np.asarray(...).ravel() yields a 1-D vector in either case.
gene_sums = np.asarray(raw_counts.sum(axis=0)).ravel()

# Background genes = expressed in at least 1 cell
bkgnd_genes = gene_names[gene_sums > 0].tolist()
bkgnd_df = pd.DataFrame({"expressed_genes":bkgnd_genes})

# DataFrame.to_csv does not create missing folders, so make sure results/ exists.
os.makedirs("results", exist_ok=True)
bkgnd_df.to_csv("results/expressed_genes.csv", index = False)