# Thymus ageing atlas: Differential gene expression analysis

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import hdf5plugin

import warnings
warnings.filterwarnings('ignore', category=ad.ImplicitModificationWarning)

# Make the repository root and its scripts folder importable (gives access to custom scripts and metadata)
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis'
for _position, _import_dir in enumerate((repo_path, f'{repo_path}/scripts'), start=1):
    sys.path.insert(_position, _import_dir)

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Locations for plots, data and models, all relative to the repository root
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the main directories so the notebook output records where files are written
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

# Formatting
# Register the Arial font file with matplotlib's font manager, then apply the project style sheet
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize

## Load data

In [None]:
# Load adata
# NOTE(review): the file is read with read_h5ad although its name ends in `.zarr` —
# presumably an HDF5 file with a misleading extension; confirm
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata (left join on the obs index keeps every row of adata.obs)
# NOTE(review): the annotation table is tagged v4_2025-02-04 while the object is v5_2025-04-03 — confirm they belong together
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno, how = 'left')

# Filter data (only include annotated cells)
adata = adata[adata.obs['anno_status'] == 'include',:]

# Update metadata
# Locate the latest metadata spreadsheet and merge it into the object with the project helper `update_obs`
# (presumably modifies adata.obs in place, since the return value is not used — confirm)
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Display the object summary
adata

In [None]:
# Per age group: number of cells, number of distinct donors and number of distinct samples
adata.obs.groupby('age_group').agg(n_cells = ('age_group', 'count'),
                                    n_donors = ('donor', 'nunique'),
                                    n_samples = ('sample', 'nunique'))  

In [None]:
# Sanity check: count duplicated entries in obs_names
adata.obs_names.duplicated().sum()

In [None]:
# Sanity check: count cells without a taa_l4 annotation
adata.obs['taa_l4'].isna().sum()

In [None]:
# Define columns
col_age_group = 'age_group'
col_cell_type = 'taa_l4'
# Level list for the cell types present in the data, as returned by the project helper `get_ct_levels`
# (presumably an ordering used for plotting — confirm)
col_cell_type_fine_levels = get_ct_levels(col_cell_type, include_ct = adata.obs[col_cell_type].unique().tolist())

# Display the levels
np.array(col_cell_type_fine_levels)

## Differential gene expression analysis (DESeq2)

### Age effect across ages

In [None]:
from utils import aggClusters, add_batch_pca

# Metadata columns selected for the DESeq2 design (see where `factors` is used in the DESeq2 cells)
factors = [col_age_group, 'sex']
#contrast = [col_age_group, 'adult', 'paed']

# Remove cells with no age group 
adata_dea = adata[~adata.obs[col_age_group].isna()].copy()

# Add metadata columns to adata: one aggregation key per cell type x donor, joined with "__"
adata_dea.obs['agg_key'] = [c + "__" + d for c,d in zip(adata_dea.obs[col_cell_type].astype(str), adata_dea.obs['donor'].astype(str))]

# Aggregate across clusters and complete metadata
# (aggClusters is a project helper; presumably aggregates counts in X per agg_key — confirm)
agg_adata = aggClusters(adata_dea, lognorm=None,cluster_key='agg_key',raw = 'X', preserve_meta=['donor', col_age_group, 'sex', 'age', 'age_months', 'study'])
# Recover the cell type label from the part of the pseudobulk name before "__"
agg_adata.obs[col_cell_type] = [s.split("__")[0] for s in agg_adata.obs_names]

# Remove pseudobulks with too few cells (n < 10)
agg_adata = agg_adata[agg_adata.obs.n_cells >= 10,:]
# Drop pseudobulks without an age group (cells lacking one were already removed above)
agg_adata = agg_adata[~pd.isna(agg_adata.obs[col_age_group]),:]

# Add PCs significantly associated with the batch variable for batch correction
# NOTE(review): the original comment said "study", but batch_col is 'sex' here — confirm which variable is intended
add_batch_pca(agg_adata, batch_col='sex', interest_col=col_age_group, variance_explained=0.7, n_pcs = None)

In [None]:
# Save anndata
from scipy.sparse import csr_matrix
# Store the pseudobulk matrix in CSR sparse format
agg_adata.X = csr_matrix(agg_adata.X)

# Cast age in months to float before writing
agg_adata.obs['age_months'] = agg_adata.obs['age_months'].astype(float)

# Written with write_h5ad and zstd compression (level 5) from hdf5plugin;
# note the file name ends in `.zarr` although the HDF5 writer is used
agg_adata.write_h5ad(
            f'{general_data_path}/analyses/dea/thyAgeing_all_{object_version}_{col_cell_type}_aggAdata.zarr',
            compression=hdf5plugin.FILTERS["zstd"],
            compression_opts=hdf5plugin.Zstd(clevel=5).filter_options
            )

In [None]:
import os
import subprocess

# Define columns
col_age_group = 'age_group'
col_cell_type = 'taa_l4'

object_version = 'v5_2025-04-03'
agg_data_path = f'{general_data_path}/analyses/dea/thyAgeing_all_{object_version}_{col_cell_type}_aggAdata.zarr'
# Each contrast is [design factor, test level, reference level]
contrasts_list = [[col_age_group, 'adult', 'paed'],
                  [col_age_group, 'paed', 'infant'],
                  [col_age_group, 'adult', 'infant'],
                  [col_age_group, 'aged', 'adult']]

# Folder receiving the LSF stdout/stderr files; create it up front so the -o/-e targets exist
log_dir = f'{general_data_path}/analyses/dea/logs'
os.makedirs(log_dir, exist_ok=True)

for contrast in contrasts_list:
    group, test_level, ref_level = contrast

    # Shell command executed by the job: activate the conda env, then run the DESeq2 script for this contrast
    command = (
        f'source /nfs/users/nfs_l/lm25/.bashrc ; conda activate /nfs/team205/lm25/condaEnvs/thymusAgeing ;'
        f'python /nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/analyses/deseq2.py --agg_adata_path {agg_data_path} --contrast {group},{test_level},{ref_level} --col_cell_type {col_cell_type} --col_age_group {col_age_group} '
    )

    # Submit the command as an LSF job (50 GB memory, single host, 3 h wall time)
    submission = subprocess.run([
        'bsub',
        '-q', 'normal',
        '-G', 'team361',
        '-J', f'thyAgeing_dea_{group}_{test_level}_vs_{ref_level}',
        '-o', f'{log_dir}/{group}_{test_level}_vs_{ref_level}_%J.out',
        '-e', f'{log_dir}/{group}_{test_level}_vs_{ref_level}_%J.err',
        "-M50000",
        "-R", "span[hosts=1] select[mem>50000] rusage[mem=50000]",
        '-W', '03:00', 
        f" eval {command}"
    ])

    # Report failed submissions instead of letting them pass unnoticed
    if submission.returncode != 0:
        print(f'Submission failed for {group}: {test_level} vs {ref_level} (bsub exit code {submission.returncode}).')

The code chunk below was used for setting up `/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/analyses/deseq2.py`:

In [None]:
# import pickle as pkl

# from pydeseq2.dds import DeseqDataSet
# from pydeseq2.default_inference import DefaultInference
# from pydeseq2.ds import DeseqStats

# from adjustText import adjust_text
# from matplotlib.backends.backend_pdf import PdfPages

# contrast = [col_age_group, 'adult', 'paed']

# # Exclude cell types with too few replicates
# cells_per_group = agg_adata.obs.groupby([col_cell_type, col_age_group]).size().to_frame('n_cells').reset_index().pivot(index=col_cell_type, columns=col_age_group, values='n_cells').fillna(0)

# min_rep = 3
# excluded_ct = cells_per_group.loc[(cells_per_group[contrast[1]] < min_rep) | (cells_per_group[contrast[2]] < min_rep)].index

# if len(excluded_ct) > 0:
#     print(f'Excluding {len(excluded_ct)} cell types with less than {min_rep} replicates: {excluded_ct.tolist()}.')
#     agg_adata = agg_adata[~agg_adata.obs[col_cell_type].isin(excluded_ct),:]

# contrast_mod = [c.replace('-', '_') for c in contrast]
# dea_res_dict_age = {}
# dea_res_dict_sex = {}
# for ct in agg_adata.obs[col_cell_type].unique():
    
#     print(f'Running DE analysis for {ct}...')
    
#     # Preparing metadata and counts
#     metadata = agg_adata[agg_adata.obs[col_cell_type] == ct].obs[factors]
#     counts = pd.DataFrame(agg_adata[agg_adata.obs[col_cell_type] == ct].X.toarray(), index=agg_adata[agg_adata.obs[col_cell_type] == ct].obs_names, columns=agg_adata[agg_adata.obs[col_cell_type] == ct].var_names)
#     genes_to_keep = counts.columns[counts.sum(axis=0) >= 10]
#     counts = counts[genes_to_keep]
    
#     # Set up DeSeq dataset
#     dds = DeseqDataSet(
#         counts=counts,
#         metadata=metadata,
#         design = f'sex + {contrast_mod[0]}',
#         min_replicates = min_rep,
#         refit_cooks=True,
#         n_cpus = 8,
#     )
    
#     # Estimate size factors and dispersion and fitting LFC
#     dds.deseq2()
    
#     # Run DE analysis for age effect
#     dea_res = DeseqStats(dds, contrast=contrast_mod, inference=DefaultInference(n_cpus=8))
#     dea_res.summary()
#     dea_res_dict_age[ct] = dea_res.results_df
    
#     # Run DE analysis for sex effect
#     dea_res = DeseqStats(dds, contrast=['sex', 'F', 'M'], inference=DefaultInference(n_cpus=8))
#     dea_res.summary()
#     dea_res_dict_sex[ct] = dea_res.results_df
    
# # Save as pickle
# with open(f'{data_path}/analyses/dea/thyAgeing_dea_{contrast[1]}_vs_{contrast[2]}_ageEffect.pkl', 'wb') as f:
#     pkl.dump(dea_res_dict_age, f)
    
# with open(f'{data_path}/analyses/dea/thyAgeing_dea_{contrast[1]}_vs_{contrast[2]}_sexEffect.pkl', 'wb') as f:
#     pkl.dump(dea_res_dict_sex, f)
    
# # deg_df = pd.concat([dea_res.results_df for k,dea_res in dea_res_dict.items()], keys=dea_res_dict.keys(), names=['cell_type']).reset_index().rename(columns = {'level_1':'gene_name'})

# # deg_df.to_csv(f'{data_path}/analyses/dea/thyAgeing_dea_{contrast[1]}_vs_{contrast[2]}.csv', index = False)

# # # Create volcano plots
# # with PdfPages(f'{plots_path}/phenoAnalysis/dea/thyAgeing_dea_{contrast[1]}_vs_{contrast[2]}_volcano.pdf') as pdf:
# #     for ct in deg_df['cell_type'].unique():
# #         sub_df = deg_df[deg_df['cell_type'] == ct].copy()
        
# #         # Create a column to indicate significant genes
# #         sub_df['is_signif'] = (sub_df['padj'] < 0.05) & (abs(sub_df['log2FoldChange']) >= 1.3)

# #         # Create the volcano plot
# #         plt.figure(figsize=(10, 8))
# #         sns.scatterplot(data=sub_df, x='log2FoldChange', y=-np.log10(sub_df['pvalue']), hue='is_signif', legend=False)

# #         texts = []
# #         for i in range(sub_df.shape[0]):
# #             if sub_df.iloc[i]['is_signif']:
# #                 texts.append(plt.text(sub_df.iloc[i]['log2FoldChange'], -np.log10(sub_df.iloc[i]['pvalue']), sub_df.iloc[i]['gene_name'], fontsize=8))
# #         adjust_text(texts, force_points=0.2, force_text=1)

# #         plt.axhline(y=-np.log10(0.05), color='grey', linestyle='--')
# #         plt.axvline(x=0, color='grey', linestyle='--')
# #         plt.xlabel('Log2 Fold Change')
# #         plt.ylabel('-Log10 p-value')
# #         plt.title(f'{ct}')
        
# #         # Save the current figure to the PDF
# #         pdf.savefig()
# #         plt.close()

#### DEG plots

In [None]:
import pickle

# Flatten the pickled per-cell-type DEA result dictionaries into one CSV per annotation level and contrast
contrasts = ['adult_vs_infant','adult_vs_paed', 'paed_vs_infant', 'aged_vs_adult']
for c in contrasts:
    for anno_level in ('taa_l3', 'taa_l4'):
        result_stem = f'{data_path}/analyses/dea/thyAgeing_dea_{anno_level}_{c}_ageEffect'

        # Dictionary keyed by cell type, one result table per key
        with open(f'{result_stem}.pkl', 'rb') as f:
            deg_df = pickle.load(f)

        # Stack the tables; the dictionary key and the gene index become regular columns
        deg_df = pd.concat(deg_df).reset_index(names=['cell_type', 'gene_name'])
        deg_df.to_csv(f'{result_stem}.csv', index=False)

In [None]:
import pickle

# Load the taa_l4 adult-vs-infant DEA results (pickled dictionary keyed by cell type)
with open(f'{data_path}/analyses/dea/thyAgeing_dea_taa_l4_adult_vs_infant_ageEffect.pkl', 'rb') as f:
    deg = pickle.load(f)

# Stack into one long table with cell type and gene name as columns
deg_df = pd.concat(deg).reset_index(names=['cell_type','gene_name'])

In [None]:
# Inspect CEBPA across cell types, largest log2 fold change first
deg_df.loc[deg_df['gene_name'] == 'CEBPA'].sort_values(by='log2FoldChange', ascending=False)

In [None]:
# Read the taa_l3 DEA tables of all contrasts (in reversed list order) and stack them
contrasts = ['adult_vs_infant','adult_vs_paed', 'paed_vs_infant', 'aged_vs_adult']
contrasts.reverse()
all_degs = {k:pd.read_csv(f'{data_path}/analyses/dea/thyAgeing_dea_taa_l3_{k}_ageEffect.csv') for k in contrasts}
# The dictionary key becomes the `contrast` column; the per-table row index is discarded
all_degs = pd.concat(all_degs).reset_index(names=['contrast','remove']).drop(columns='remove')

# Define columns
col_age_group = 'age_group'
col_cell_type = 'taa_l3'
col_cell_type_fine_levels = get_ct_levels(col_cell_type, include_ct = all_degs['cell_type'].unique().tolist())

# Keep significant genes (padj < 0.05, |log2FC| >= 1.3).
# `.copy()` makes the filtered table independent, so the column assignments below
# do not write into a slice of the unfiltered frame (pandas SettingWithCopyWarning).
all_degs = all_degs.loc[(all_degs['padj'] < 0.05) & (all_degs['log2FoldChange'].abs() >= 1.3)].copy()
all_degs['cell_type'] = pd.Categorical(all_degs['cell_type'], categories=col_cell_type_fine_levels, ordered=True)
# Direction of change relative to the reference level
all_degs['up_down'] = np.where(all_degs['log2FoldChange'] > 0, 'up', 'down')

all_degs

In [None]:
# Plot number of DEGs by cell type
# Count rows (genes) per contrast and cell type, restricted to observed combinations
df = all_degs.groupby(['contrast','cell_type'], observed=True).size().to_frame('n_genes').reset_index()
# Harmonise contrast labels that still use 'geriatric' to 'aged'
df['contrast'] = df['contrast'].str.replace('geriatric', 'aged')


# NOTE(review): hue_order lists three contrasts only; 'adult_vs_infant' is not drawn — confirm this is intended
g = sns.catplot(data=df, x="cell_type", y="n_genes", hue="contrast", kind="bar", hue_order=['paed_vs_infant','adult_vs_paed', 'aged_vs_adult'],
                height=5, aspect=2)
g.set_xticklabels(rotation=90)
# Log-scaled y axis
g.set(yscale="log")

g.set_xlabels('Cell type')
g.set_ylabels('Number of DEGs')
g._legend.set_title('Contrast')

g.tight_layout()
#plt.savefig(f'{plots_path}/phenoAnalysis/dea/thyAgeing_nDegs_by_cell_type.pdf', dpi = 300, bbox_inches = 'tight')

#### GSEA

In [None]:
# Load DEA results
col_age_group = 'age_group'
# Contrast as [design factor, test level, reference level]
contrast = [col_age_group, 'adult', 'paed']
col_cell_type = 'taa_l4'

pkl_path = f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_{contrast[1]}_vs_{contrast[2]}_ageEffect.pkl'

# The pickle is iterated with `.items()` in the GSEA cell below, i.e. it holds a mapping of cell type -> result table
df = pd.read_pickle(pkl_path)

df

In [None]:
import gseapy
import pickle 

# Retrieve human genesets from MsigDb
msig = gseapy.Msigdb()
gmt_c2_reactome = msig.get_gmt(category='c2.cp.reactome', dbver="2023.2.Hs")
gmt_c2_biocarta = msig.get_gmt(category='c2.cp.biocarta', dbver="2023.2.Hs")
gmt_c3_tft = msig.get_gmt(category='c3.tft', dbver="2023.2.Hs")
gmt_c7_immunesigdb = msig.get_gmt(category='c7.immunesigdb', dbver="2023.2.Hs")

# Specify run IDs and geneset dict
geneset_dict = {'c2_reactome': gmt_c2_reactome,
                'c2_biocarta': gmt_c2_biocarta,
                'c3_tft' : gmt_c3_tft,
                'c7_immunesigdb': gmt_c7_immunesigdb
                }

# Populate gsea dict by running GSEA on all DEA runs and genesets
all_res = {}
for ct, dea_res in df.items() :
    # Rank genes by log2 fold change
    dea_res = dea_res.reset_index(names='gene_name')
    ranked_genes = dea_res[['gene_name', 'log2FoldChange']].set_index('gene_name')

    # Skip empty result tables before computing the tie fraction (it divides by the number of genes)
    if ranked_genes.shape[0] == 0:
        continue

    # Fraction of genes sharing a rank value with another gene
    ties = 1- np.unique(ranked_genes['log2FoldChange']).shape[0]/ranked_genes.shape[0]

    # Run GSEA if ties <= 30%
    if ties > 0.3:
        continue

    ct_res = {}
    for n,s in geneset_dict.items():

        pre_res = gseapy.prerank(rnk=ranked_genes, 
                    gene_sets=s,
                    threads=4,
                    min_size=5,
                    max_size=1000,
                    permutation_num=1000, # reduce number to speed up testing
                    outdir=None, # don't write to disk
                    seed=6,
                    verbose=True, # see what's going on behind the scenes
                    )

        # Keep gene sets passing the family-wise error rate threshold
        pre_res = pre_res.res2d.loc[pre_res.res2d['FWER p-val'] < .05,:]

        if pre_res.shape[0] > 0 :
            ct_res[n] = pre_res

    # Only register cell types with at least one significant gene set;
    # an empty entry would make pd.concat below raise "No objects to concatenate"
    if ct_res:
        all_res[ct] = ct_res

# One table per cell type, with the collection name stored in the `geneset` column
all_res = {k:pd.concat(v).reset_index().rename(columns = {'level_0':'geneset'}) for k,v in all_res.items()}
with open(f'{data_path}/analyses/gsea/thyAgeing_gsea_{col_cell_type}_{contrast[1]}_vs_{contrast[2]}_ageEffect.pkl', 'wb') as file:
    pickle.dump(all_res, file)

# Export the combined table; skipped when nothing was significant (pd.concat cannot stack an empty dict)
if all_res:
    pd.concat(all_res).drop(columns = 'level_1').reset_index().rename(columns = {'level_0':'cell_type'}).drop(columns = 'level_1') \
        .to_csv(f'{data_path}/analyses/gsea/thyAgeing_gsea_{col_cell_type}_{contrast[1]}_vs_{contrast[2]}_ageEffect.csv', index = False)
else:
    print('No significant gene sets found; CSV export skipped.')


### Gender effect in adults

In [None]:
# Subset adata to the adult age group and exclude donors A66 and A74
# NOTE(review): the original comment mentioned a single obese donor, but two donor IDs are excluded — confirm the reason for each
# NOTE: this overwrites `adata`; cells after this point only see the adult subset
adata = adata[(adata.obs[col_age_group].isin(['adult'])) & (~adata.obs['donor'].isin(['A66','A74']))]

adata.shape

In [None]:
# Define columns
col_age_group = 'age_group'
col_cell_type = 'taa_l4'
# Level list is not recomputed here; `col_cell_type_fine_levels` keeps whatever value an earlier cell assigned
#col_cell_type_fine_levels = get_ct_levels(col_cell_type, include_ct = adata.obs[col_cell_type].unique().tolist())

In [None]:
# Per sex: number of cells, number of distinct donors and number of distinct samples
adata.obs.groupby(['sex']).agg(n_cells = ('age_group', 'count'),
                                    n_donors = ('donor', 'nunique'),
                                    n_samples = ('sample', 'nunique'))

In [None]:
from utils import aggClusters, add_batch_pca

factors = ['sex']
#contrast = [col_age_group, 'adult', 'paed']

# Keep only cells that have an age group
adata_dea = adata[adata.obs[col_age_group].notna()].copy()

# Aggregation key: "<cell type>__<donor>"
adata_dea.obs['agg_key'] = [
    f'{cell_type_label}__{donor_label}'
    for cell_type_label, donor_label in zip(adata_dea.obs[col_cell_type].astype(str),
                                            adata_dea.obs['donor'].astype(str))
]

# Build pseudobulks per aggregation key, carrying over the donor-level metadata
agg_adata = aggClusters(
    adata_dea,
    lognorm=None,
    cluster_key='agg_key',
    raw='X',
    preserve_meta=['donor', col_age_group, 'sex', 'age', 'age_months', 'study'],
)
# The cell type is the part of the pseudobulk name before the "__" separator
agg_adata.obs[col_cell_type] = [name.split("__")[0] for name in agg_adata.obs_names]

# Discard pseudobulks built from fewer than 10 cells, then those lacking an age group
agg_adata = agg_adata[agg_adata.obs['n_cells'] >= 10, :]
agg_adata = agg_adata[agg_adata.obs[col_age_group].notna(), :]

In [None]:
# Preview only: integer-cast CSR version of the pseudobulk matrix (the result is displayed, not stored)
csr_matrix(np.array(agg_adata.X).astype(int))

In [None]:
# Shape of the pseudobulk matrix
agg_adata.X.shape

In [None]:
# Save anndata
from scipy.sparse import csr_matrix, issparse
import numpy as np

# Store counts in CSR format, matching the age-effect pseudobulk object saved earlier in this notebook.
# The DESeq2 cell that follows calls `.X.toarray()`, which only exists on sparse matrices.
if not issparse(agg_adata.X):
    agg_adata.X = csr_matrix(np.asarray(agg_adata.X))

# Cast age in months to float before writing
agg_adata.obs['age_months'] = agg_adata.obs['age_months'].astype(float)

# Written with write_h5ad and zstd compression (level 5) from hdf5plugin
agg_adata.write_h5ad(
            f'{general_data_path}/analyses/dea/thyAgeing_all_{object_version}_{col_cell_type}_aggAdata_adult_woA74.zarr',
            compression=hdf5plugin.FILTERS["zstd"],
            compression_opts=hdf5plugin.Zstd(clevel=5).filter_options
            )

In [None]:
import pickle as pkl

from scipy.sparse import issparse

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

from adjustText import adjust_text
from matplotlib.backends.backend_pdf import PdfPages

# Contrast as [design factor, test level, reference level]
contrast = ['sex', 'F', 'M']

# Exclude cell types with too few replicates
# (the table counts pseudobulks per cell type and contrast level; missing combinations count as 0)
cells_per_group = agg_adata.obs.groupby([col_cell_type, contrast[0]]).size().to_frame('n_cells').reset_index().pivot(index=col_cell_type, columns=contrast[0], values='n_cells').fillna(0)

min_rep = 3
excluded_ct = cells_per_group.loc[(cells_per_group[contrast[1]] < min_rep) | (cells_per_group[contrast[2]] < min_rep)].index

if len(excluded_ct) > 0:
    print(f'Excluding {len(excluded_ct)} cell types with less than {min_rep} replicates: {excluded_ct.tolist()}.')
    agg_adata = agg_adata[~agg_adata.obs[col_cell_type].isin(excluded_ct),:]

# Design/contrast names with '-' replaced by '_'
contrast_mod = [c.replace('-', '_') for c in contrast]
dea_res_dict = {}
for ct in agg_adata.obs[col_cell_type].unique():
    
    print(f'Running DE analysis for {ct}...')
    
    # Preparing metadata and counts (subset the pseudobulks of this cell type once)
    ct_adata = agg_adata[agg_adata.obs[col_cell_type] == ct]
    metadata = ct_adata.obs[factors]
    # X may be sparse or dense depending on how the pseudobulk object was stored
    ct_counts = ct_adata.X.toarray() if issparse(ct_adata.X) else np.asarray(ct_adata.X)
    counts = pd.DataFrame(ct_counts, index=ct_adata.obs_names, columns=ct_adata.var_names)
    # Keep genes with at least 10 counts summed over the pseudobulks of this cell type
    genes_to_keep = counts.columns[counts.sum(axis=0) >= 10]
    counts = counts[genes_to_keep]
    
    # Set up DeSeq dataset
    dds = DeseqDataSet(
        counts=counts,
        metadata=metadata,
        design = f'{contrast_mod[0]}',
        min_replicates = min_rep,
        refit_cooks=True,
        n_cpus = 8,
    )
    
    # Estimate size factors and dispersion and fitting LFC
    dds.deseq2()
    
    # Run DE analysis for the sex effect (F vs M)
    dea_res = DeseqStats(dds, contrast=contrast_mod, inference=DefaultInference(n_cpus=8))
    dea_res.summary()
    dea_res_dict[ct] = dea_res.results_df
    
# Save as pickle (dictionary: cell type -> results table)
with open(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_{contrast[1]}_vs_{contrast[2]}_genderEffect_adult_woA74.pkl', 'wb') as f:
    pkl.dump(dea_res_dict, f)

#### Inspect DEGs

In [None]:
# Load DEA results
col_cell_type = 'taa_l4'

# NOTE(review): the DESeq2 cell above writes `..._genderEffect_adult_woA74.pkl`, whereas this reads
# `..._genderEffect_adult.pkl` — confirm which result file is intended
df = pd.read_pickle(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_F_vs_M_genderEffect_adult.pkl')

# Stack the per-cell-type tables; the dictionary key and the gene index become regular columns
all_degs = pd.concat(df).reset_index(names = [col_cell_type,'gene_name'])
all_degs['up_down'] = ['up' if lfc > 0 else 'down' for lfc in all_degs['log2FoldChange']]
all_degs['is_signif'] = (all_degs['padj'] < 0.05) #& (abs(all_degs['log2FoldChange']) >= 1.3) -> threshold too conservative

# Recompute the level list for this annotation column: an earlier cell sets
# `col_cell_type_fine_levels` for 'taa_l3', and labels missing from the categories would become NaN
col_cell_type_fine_levels = get_ct_levels(col_cell_type, include_ct = all_degs[col_cell_type].unique().tolist())
all_degs[col_cell_type] = pd.Categorical(all_degs[col_cell_type], categories=col_cell_type_fine_levels, ordered=True)

all_degs

In [None]:
# Number of distinct significant genes per cell type and direction
# (observed=False keeps the pandas default explicit: unobserved category combinations appear with a count of 0)
n_deg = all_degs.loc[all_degs['is_signif']].groupby([col_cell_type, 'up_down'], observed=False).agg(n_genes = ('gene_name', 'nunique')).reset_index()

# Fixed colour per direction, so the mapping does not depend on which directions occur in the data
direction_palette = {'down': '#0173b2', 'up': '#de8f05'}

# Plot number of DEGs by cell type with grouping by up_down
with sns.plotting_context('paper', font_scale=1.4):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=n_deg, x=col_cell_type, y='n_genes', hue='up_down', hue_order=['down', 'up'], palette=direction_palette)
    plt.xticks(rotation=90)
    plt.xlabel(col_cell_type)
    plt.ylabel('Number of DEGs')
    plt.title(f'Number of DEGs by {col_cell_type} grouped by up_down')
    plt.legend(title='Direction', loc='upper right')
    plt.tight_layout()
    #plt.savefig(f'{plots_path}/phenoAnalysis/dea/thyAgeing_nDegs_by_taa_l4.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Export the significant genes (is_signif) of the F-vs-M comparison to CSV
all_degs.loc[all_degs['is_signif']].to_csv(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_F_vs_M_genderEffect_adult_signif.csv', index = False)

In [None]:
# Keep only rows flagged as significant and display them
degs_filtered = all_degs.loc[all_degs['is_signif']]
degs_filtered

In [None]:
# Significant genes in cell types whose label contains 'TEC'.
# na=False treats missing labels as non-matches; a mask containing NaN cannot be used for boolean indexing.
degs_filtered.loc[degs_filtered[col_cell_type].str.contains('TEC', na=False), 'gene_name'].unique().tolist()

In [None]:
# Genes of interest: significant genes in cell types whose label contains 'TEC'
# (na=False treats missing labels as non-matches instead of producing NaN in the mask)
goi = degs_filtered.loc[degs_filtered[col_cell_type].str.contains('TEC', na=False), 'gene_name'].unique().tolist()
# Restrict the object to those cell types and genes
adata_sub = adata[adata.obs[col_cell_type].str.contains('TEC', na=False), goi].copy()

# Normalise to 10,000 counts per cell and log1p-transform
# NOTE(review): normalisation runs after subsetting to `goi`, so totals are computed over these genes only — confirm this is intended
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

In [None]:
from plotting.utils import calc_figsize,thyAgeing_colors
# Dot plot of the genes of interest, grouped by cell type and sex, saved as PDF
# (colour map blends the project colours blue -> purple -> magenta -> orange -> yellow)
sc.pl.DotPlot(adata_sub, 
            groupby=[col_cell_type, 'sex'],
            #categories_order=anno_levels,
            var_names=goi,
            figsize = calc_figsize(width = 200, height = 80),
            mean_only_expressed=True,
            cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True,), #'magma',
            ).style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(f'{plots_path}/dea/thyAgeing_tecSplit_scvi_{object_version}_genderDegs_dotplot.pdf') 

In [None]:
# Genes of interest: significant genes in cell types whose label contains 'Fb'
# (na=False treats missing labels as non-matches instead of producing NaN in the mask)
goi = degs_filtered.loc[degs_filtered[col_cell_type].str.contains('Fb', na=False), 'gene_name'].unique().tolist()
# Restrict the object to those cell types and genes
adata_sub = adata[adata.obs[col_cell_type].str.contains('Fb', na=False), goi].copy()

# Normalise to 10,000 counts per cell and log1p-transform
# NOTE(review): normalisation runs after subsetting to `goi`, so totals are computed over these genes only — confirm this is intended
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

from plotting.utils import calc_figsize,thyAgeing_colors
# Dot plot of the genes of interest, grouped by cell type and sex, saved as PDF
sc.pl.DotPlot(adata_sub, 
            groupby=[col_cell_type, 'sex'],
            #categories_order=anno_levels,
            var_names=goi,
            figsize = calc_figsize(width = 200, height = 80),
            mean_only_expressed=True,
            cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True,), #'magma',
            ).style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(f'{plots_path}/dea/thyAgeing_fbSplit_scvi_{object_version}_genderDegs_dotplot.pdf') 

#### GSEA

In [None]:
# Load DEA results
col_cell_type = 'taa_l4'

# The pickle is iterated with `.items()` in the GSEA cell below, i.e. it holds a mapping of cell type -> result table
df = pd.read_pickle(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_F_vs_M_genderEffect_adult.pkl')

df

GS schema: {TEST_GROUP}_vs_{REF_GROUP}_{DIR}

In [None]:
import gseapy
import pickle 

# Retrieve human genesets from MsigDb
msig = gseapy.Msigdb()
gmt_c2_reactome = msig.get_gmt(category='c2.cp.reactome', dbver="2023.2.Hs")
gmt_c2_biocarta = msig.get_gmt(category='c2.cp.biocarta', dbver="2023.2.Hs")
gmt_c3_tft = msig.get_gmt(category='c3.tft', dbver="2023.2.Hs")
gmt_c7_immunesigdb = msig.get_gmt(category='c7.immunesigdb', dbver="2023.2.Hs")

# Specify run IDs and geneset dict
geneset_dict = {'c2_reactome': gmt_c2_reactome,
                'c2_biocarta': gmt_c2_biocarta,
                'c3_tft' : gmt_c3_tft,
                'c7_immunesigdb': gmt_c7_immunesigdb
                }

# Populate gsea dict by running GSEA on all DEA runs and genesets
all_res = {}
for ct, dea_res in df.items() :
    # Rank genes by log2 fold change
    dea_res = dea_res.reset_index(names='gene_name')
    ranked_genes = dea_res[['gene_name', 'log2FoldChange']].set_index('gene_name')

    # Skip empty result tables before computing the tie fraction (it divides by the number of genes)
    if ranked_genes.shape[0] == 0:
        continue

    # Fraction of genes sharing a rank value with another gene
    ties = 1- np.unique(ranked_genes['log2FoldChange']).shape[0]/ranked_genes.shape[0]

    # Run GSEA if ties <= 30%
    if ties > 0.3:
        continue

    ct_res = {}
    for n,s in geneset_dict.items():

        pre_res = gseapy.prerank(rnk=ranked_genes, 
                    gene_sets=s,
                    threads=4,
                    min_size=5,
                    max_size=1000,
                    permutation_num=1000, # reduce number to speed up testing
                    outdir=None, # don't write to disk
                    seed=6,
                    verbose=True, # see what's going on behind the scenes
                    )

        # Keep gene sets passing the family-wise error rate threshold
        pre_res = pre_res.res2d.loc[pre_res.res2d['FWER p-val'] < .05,:]

        if pre_res.shape[0] > 0 :
            ct_res[n] = pre_res

    # Only register cell types with at least one significant gene set;
    # an empty entry would make pd.concat below raise "No objects to concatenate"
    if ct_res:
        all_res[ct] = ct_res

# One table per cell type, with the collection name stored in the `geneset` column
all_res = {k:pd.concat(v).reset_index().rename(columns = {'level_0':'geneset'}) for k,v in all_res.items()}
with open(f'{data_path}/analyses/gsea/thyAgeing_gsea_{col_cell_type}_F_vs_M_genderEffect_adult.pkl', 'wb') as file:
    pickle.dump(all_res, file)

# Export the combined table; skipped when nothing was significant (pd.concat cannot stack an empty dict)
if all_res:
    pd.concat(all_res).drop(columns = 'level_1').reset_index().rename(columns = {'level_0':'cell_type'}).drop(columns = 'level_1') \
        .to_csv(f'{data_path}/analyses/gsea/thyAgeing_gsea_{col_cell_type}_F_vs_M_genderEffect_adult.csv', index = False)
else:
    print('No significant gene sets found; CSV export skipped.')


In [None]:
# Terms that occur in at least 4 rows of the GSEA table, most frequent first
# NOTE(review): `gsea_adult_infant` is not assigned anywhere in the visible part of this notebook — confirm where it is loaded
freq_genesets = gsea_adult_infant.groupby('Term').size().to_frame(name = 'n').sort_values(by = 'n', ascending = False)
freq_genesets = freq_genesets.loc[freq_genesets['n'] >= 4,:]
freq_genesets = freq_genesets.index.tolist()
len(freq_genesets)

In [None]:
# Terms occurring in more than 4 rows of the GSEA table
# NOTE(review): the threshold here is `> 4`, while the previous cell uses `>= 4` — confirm which is intended
# The first line is a bare expression whose result is neither stored nor shown (only the last expression of a cell is displayed)
gsea_adult_infant.groupby('Term').size() > 4
terms = (gsea_adult_infant.groupby('Term').size() > 4).loc[lambda x: x].index.tolist()
terms

In [None]:
import matplotlib.colors as mcolors
import textwrap

# NOTE(review): `gsea_adult_infant` and `cell_types_recirc` are not assigned in the visible part of this notebook — confirm their origin
# `.copy()` after filtering keeps the column assignments below off a slice of the original frame
df = gsea_adult_infant.copy()
df = df.loc[df['cell_type'].isin(cell_types_recirc)].copy()
# Insert a space after ')' and '/' so long collection names can be wrapped;
# regex=False makes the literal replacement explicit (')' is not a valid regular expression on its own)
df['geneset'] = df['geneset'].str.replace(')', ') ', regex=False).str.replace('/', '/ ', regex=False)
df['cell_type'] = pd.Categorical(df['cell_type'], categories=col_cell_type_fine_levels, ordered=True)
# Reassign the result: the `inplace` argument of remove_unused_categories is not available in pandas 2
df['cell_type'] = df['cell_type'].cat.remove_unused_categories()
df['abs_NES'] = df['NES'].abs()

# Filter according to frequency
#df = df.loc[df['Term'].isin((df.groupby('Term').size() >= 2).loc[lambda x: x].index.tolist()),:]

# Create a divergent colormap centered around 0
vmin, vmax, vcenter = df['NES'].min(), df['NES'].max(), 0
normalize = mcolors.TwoSlopeNorm(vcenter=vcenter, vmin=vmin, vmax=vmax)
cmap = sns.diverging_palette(220, 20, center="light", as_cmap=True, s=100)

# Facet widths proportional to the number of distinct terms per collection (scaled by 1.1)
xranges = df.groupby("geneset")["Term"].nunique()
xranges *= 1.1
g = sns.relplot(data=df, x='Term', y='cell_type', hue='NES', size='abs_NES',
                palette=cmap, hue_norm=normalize, height=4, aspect=4,
                col='geneset', col_order=xranges.index, legend=None,
                facet_kws={'sharey': True, 'sharex': False, 'gridspec_kws': dict(width_ratios=xranges, wspace=0.01)})

# Function to wrap text
def wrap_text(text, width=15):
    """Return `text` broken into lines of at most `width` characters, joined by newlines."""
    return '\n'.join(textwrap.wrap(text, width))

# Apply the text wrapping function to facet titles
g.set_titles("{col_name}", fontweight='bold')
for ax in g.axes.flat:
    title = ax.get_title()
    ax.set_title(wrap_text(title), fontweight='bold')

g.set_xticklabels(rotation=90)
g.set_xlabels('Gene')
g.set_ylabels('Cell type')
g.tight_layout()

# Add colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=normalize)
sm.set_array([])
g.figure.colorbar(sm, ax=g.axes, orientation='vertical', label='NES', pad=0.01)
plt.savefig(f'{plots_path}/phenoAnalysis/dea/thyAgeing_tCells_popularTerms_recircT_gsea_heatmap.png', dpi = 300, bbox_inches = 'tight')

In [None]:
import pickle

# Load the taa_l4 adult-vs-infant DEA results (pickled dictionary keyed by cell type)
with open(f'{data_path}/analyses/dea/thyAgeing_dea_taa_l4_adult_vs_infant_ageEffect.pkl', 'rb') as f:
    deg = pickle.load(f)

# Stack into one long table with cell type and gene name as columns
deg = pd.concat(deg).reset_index(names=['cell_type','gene_name'])
deg.head()

In [None]:
# Genes of interest for the thymic egress comparison
key_surface_receptors = [
    "S1PR1",  # Sphingosine-1-phosphate receptor 1
    "KLF2",   # Kruppel-like factor 2 (transcriptional regulator of S1PR1)
    "SELL",   # L-selectin (CD62L)
    "CCR7",   # C-C chemokine receptor type 7
    "CD69",   # CD69 molecule (egress inhibitor)
    "CXCR4",  # C-X-C chemokine receptor type 4
    "PTPRC",   # CD45 (protein tyrosine phosphatase receptor type C)
    "CD38",  # CD38 molecule (involved in calcium signaling)
]
# Restrict the DEA table to the three T cell populations and the genes listed above
deg_sub = deg.loc[(deg['cell_type'].isin(['T_CD4_naive', 'T_CD8_naive', 'T_Treg'])) & (deg['gene_name'].isin(key_surface_receptors))]
deg_sub

| Gene Symbol | Protein                                             | Role in Thymic Egress                                                                                                                    |
| ----------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| **S1PR1**   | Sphingosine-1-phosphate receptor 1                  | **Central “exit signal” receptor** sensing S1P gradient; required for egress of mature CD4⁺ and CD8⁺ thymocytes into blood.              |
| **KLF2**    | Kruppel-like factor 2                               | Transcription factor that upregulates S1PR1 in mature thymocytes.                                                                        |
| **SELL**    | L-selectin (CD62L)                                  | Upregulated late in maturation; important for peripheral homing after egress.                                                            |
| **CCR7**    | C-C chemokine receptor type 7                       | Retains cells in medulla during final maturation; downregulation is linked with readiness to exit.                                       |
| **CD69**    | CD69 molecule                                       | *Negative regulator* of egress — transiently expressed in semi-mature SP thymocytes; downregulation is required to allow S1PR1 function. |
| **CXCR4**   | C-X-C chemokine receptor type 4                     | Retention signal for earlier stages; downregulation accompanies egress readiness.                                                        |
| **PTPRC**   | CD45 (protein tyrosine phosphatase receptor type C) | General leukocyte marker; high expression marks mature T cells.                                                                          |


In [None]:
from scipy.cluster.hierarchy import linkage, leaves_list
from plotting.utils import calc_figsize,thyAgeing_colors

# log2 fold change matrix: rows are cell types, columns are genes
df = deg_sub.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'log2FoldChange')
# Adjusted p-values, aligned to the rows/columns of the log2FC matrix
# (pivot_table drops all-NaN rows/columns, so the two pivots are not guaranteed to have the same shape)
df_annot = deg_sub.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'padj').reindex(index=df.index, columns=df.columns)
# '*' marks padj < 0.05; missing values compare as False and stay unmarked
df_annot = pd.DataFrame(np.where(df_annot < 0.05, '*', ''), index=df_annot.index, columns=df_annot.columns)

# Perform hierarchical clustering on the columns
linkage_matrix = linkage(df.T.fillna(0), method='ward')
column_order = leaves_list(linkage_matrix)

# Perform hierarchical clustering on the rows
row_linkage_matrix = linkage(df.fillna(0), method='ward')
row_order = leaves_list(row_linkage_matrix)

# Reorder the rows and columns of the dataframe
df = df.iloc[row_order, column_order]

# Plot the reordered heatmap
p = sns.heatmap(df, cmap=sns.blend_palette([thyAgeing_colors['teal'], 'white',thyAgeing_colors['orange']], as_cmap=True),
                center=0, vmin=-1.5, vmax=1.5, cbar_kws={'label': 'log2FC'}, xticklabels=True, yticklabels=True,
                annot=df_annot.iloc[row_order, column_order], fmt='', annot_kws={'size': 8,'weight': 'bold'},)
# Columns of the matrix are genes (x axis) and rows are cell types (y axis)
p.set_xlabel('Gene')
p.set_ylabel('Cell type')
p.tick_params(axis='x', rotation=90)
p.tick_params(axis='y', rotation=0)
p.figure.set_size_inches(calc_figsize(width=60, height=30))
p.figure.tight_layout(rect=[0, 0, 1, 0.95], pad = 0)
plt.savefig(f'{plots_path}/dea/thyAgeing_tCells_egressDeg_heatmap.pdf', dpi = 300, bbox_inches = 'tight')

### Gender-age interaction effect

In [None]:
# Build donor x cell-type pseudobulks for the sex/age-group analysis.
from utils import aggClusters, add_batch_pca

# Covariates of interest and the annotation level used to define pseudobulks
factors = [col_age_group, 'sex']
col_cell_type = 'taa_l3'
#contrast = [col_age_group, 'adult', 'paed']

# Remove cells with no age group
adata_dea = adata[~adata.obs[col_age_group].isna()].copy()
# Mixed-resolution label: taa_l4 for T and B lineage cells, taa_l3 otherwise.
# NOTE(review): this column is not used for the aggregation key below (which uses
# `col_cell_type`); confirm whether it is still needed.
adata_dea.obs['cell_type'] = adata_dea.obs.apply(lambda x: x['taa_l4'] if x['taa_l1'] in ['T', 'B'] else x['taa_l3'], axis = 1)

# Add metadata columns to adata: one aggregation key per (cell type, donor) pair, joined by "__"
adata_dea.obs['agg_key'] = [c + "__" + d for c,d in zip(adata_dea.obs[col_cell_type].astype(str), adata_dea.obs['donor'].astype(str))]

# Aggregate across clusters and complete metadata
agg_adata = aggClusters(adata_dea, lognorm=None,cluster_key='agg_key',raw = 'X', preserve_meta=['donor', col_age_group, 'sex', 'age', 'age_months', 'study'])
# Recover the cell type from the part of the pseudobulk name before "__"
agg_adata.obs[col_cell_type] = [s.split("__")[0] for s in agg_adata.obs_names]

# Remove pseudobulks with too few cells (n < 10); `n_cells` is read from the aggregated obs
# (presumably written by aggClusters - TODO confirm)
agg_adata = agg_adata[agg_adata.obs.n_cells >= 10,:]
# Drop pseudobulks whose age group is missing
agg_adata = agg_adata[~pd.isna(agg_adata.obs[col_age_group]),:]

# Add PCs significantly associated with the batch column for batch correction
# NOTE(review): the original comment referred to "study", but the call passes batch_col='sex';
# confirm which variable is intended here.
add_batch_pca(agg_adata, batch_col='sex', interest_col=col_age_group, variance_explained=0.7, n_pcs = None)

In [None]:
# Display the pseudobulk object as a quick sanity check
agg_adata

In [None]:
#agg_adata = ad.read_h5ad(f'{general_data_path}/analyses/dea/thyAgeing_all_{object_version}_{col_cell_type}_aggAdata.zarr')

# Pseudobulks kept for the sex comparison:
#   - not from donor A66 (obese donor)
#   - not in the 'aged' group (very few donors)
#   - not in the 'paed' group (excluded for now)
keep_donor = agg_adata.obs['donor'] != 'A66'
keep_age = (agg_adata.obs['age_group'] != 'aged') & (agg_adata.obs['age_group'] != 'paed')
agg_adata = agg_adata[keep_donor & keep_age, :]

# Number of pseudobulks and of distinct donors per sex x age group
agg_adata.obs.groupby(['sex', 'age_group']).agg(
    n_groups=('age_group', 'count'),
    n_donors=('donor', 'nunique'),
)

In [None]:
import pickle as pkl

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

from adjustText import adjust_text
from matplotlib.backends.backend_pdf import PdfPages

# Per cell type, fit `sex + age_group + sex:age_group` on the pseudobulk counts and test
# (1) the sex:age_group interaction term and (2) the F vs M sex contrast.
contrast = ['sex', 'F', 'M']
factors = ['age_group', 'sex']

# Exclude cell types with too few replicates: count pseudobulks per cell type x sex x age group
cells_per_group = agg_adata.obs.groupby([col_cell_type, 'sex', 'age_group']).size().to_frame('n_cells').reset_index()
cells_per_group = cells_per_group.pivot(index=col_cell_type, columns=['sex', 'age_group'], values='n_cells').fillna(0)

# A cell type is dropped as soon as any sex x age group combination has fewer than `min_rep` pseudobulks
min_rep = 3
excluded_ct = cells_per_group[(cells_per_group < min_rep).any(axis=1)].index.tolist()

if excluded_ct:
    print(f'Excluding {len(excluded_ct)} cell types with less than {min_rep} replicates: {excluded_ct}.')
    agg_adata = agg_adata[~agg_adata.obs[col_cell_type].isin(excluded_ct),:]

contrast_mod = [c.replace('-', '_') for c in contrast]

# Numeric contrast selecting the 4th design-matrix column.
# NOTE(review): with two-level `sex` and `age_group` this is expected to be the sex:age_group
# interaction column (after intercept, sex, age_group) - confirm against
# dds.obsm['design_matrix'].columns.
interaction_contrast = np.array([0, 0, 0, 1])

dea_res_dict = {}
dea_res_sex_dict = {}
for ct in agg_adata.obs[col_cell_type].unique():

    print(f'Running DE analysis for {ct}...')

    # Subset once per cell type instead of re-slicing the AnnData for every field
    ct_adata = agg_adata[agg_adata.obs[col_cell_type] == ct]

    # Preparing metadata and counts (copy so that the column assignments below do not
    # operate on a slice of the AnnData's obs)
    metadata = ct_adata.obs[factors].copy()
    # Level order fixes the reference levels: 'infant' for age_group and 'M' for sex
    metadata['age_group'] = pd.Categorical(metadata['age_group'], categories=['infant', 'adult'], ordered=True)
    metadata['sex'] = pd.Categorical(metadata['sex'], categories=['M', 'F'], ordered=True)
    counts = pd.DataFrame(ct_adata.X.toarray(), index=ct_adata.obs_names, columns=ct_adata.var_names)
    # Keep genes with at least 10 counts summed over this cell type's pseudobulks
    genes_to_keep = counts.columns[counts.sum(axis=0) >= 10]
    counts = counts[genes_to_keep]

    # Set up DeSeq dataset
    dds = DeseqDataSet(
        counts=counts,
        metadata=metadata,
        design = 'sex + age_group + sex:age_group',
        min_replicates = min_rep,
        refit_cooks=True,
        n_cpus = 8,
    )

    # Estimate size factors and dispersion and fitting LFC
    dds.deseq2()

    # Run DE analysis for the age:sex interaction effect (infant->adult; M->F), i.e. the
    # difference between females and males in the infant-vs-adult log2 fold change
    dea_res = DeseqStats(dds, contrast=interaction_contrast, inference=DefaultInference(n_cpus=8))
    dea_res.summary()
    dea_res_dict[ct] = dea_res.results_df

    # Run DE analysis for the sex contrast (F vs M) on the same fitted model
    dea_res_sex = DeseqStats(dds, contrast=contrast, inference=DefaultInference(n_cpus=8))
    dea_res_sex.summary()
    dea_res_sex_dict[ct] = dea_res_sex.results_df

# Save as pickle (one dict of per-cell-type result tables per tested effect)
with open(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_genderAgeEffect.pkl', 'wb') as f:
    pkl.dump(dea_res_dict, f)

with open(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_genderEffect.pkl', 'wb') as f:
    pkl.dump(dea_res_sex_dict, f)

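Sex–age interaction effect (the `sex:age_group` term, i.e. how the adult-vs-infant change differs between females and males; saved under the `genderAgeEffect` file name):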

In [None]:
import pickle

# Read back the per-cell-type results of the sex:age_group interaction test
deg_path = f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_genderAgeEffect.pkl'
with open(deg_path, 'rb') as f:
    deg = pickle.load(f)

# Stack the per-cell-type tables into one long table keyed by (cell_type, gene_name)
deg_df = pd.concat(deg)
deg_df = deg_df.reset_index(names=['cell_type','gene_name'])

# Direction of change (anything not strictly positive, including NaN, is labelled 'down')
deg_df['up_down'] = np.where(deg_df['log2FoldChange'] > 0, 'up', 'down')
# Significance flag at padj < 0.1 (NaN padj -> False)
deg_df['is_signif'] = deg_df['padj'].lt(0.1)
deg_df.head()

In [None]:
# Show only the rows flagged as significant
deg_df.loc[deg_df['is_signif']]

Sex baseline effect (F vs M contrast from the same interaction model; saved under the `genderEffect` file name):

In [None]:
import pickle

# Read back the per-cell-type results of the F vs M sex contrast
deg_path = f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_genderEffect.pkl'
with open(deg_path, 'rb') as f:
    deg = pickle.load(f)

# Stack the per-cell-type tables into one long table keyed by (cell_type, gene_name)
deg_df = pd.concat(deg)
deg_df = deg_df.reset_index(names=['cell_type','gene_name'])

# Direction of change (anything not strictly positive, including NaN, is labelled 'down')
deg_df['up_down'] = np.where(deg_df['log2FoldChange'] > 0, 'up', 'down')
# Significance flag at padj < 0.1 (NaN padj -> False)
deg_df['is_signif'] = deg_df['padj'].lt(0.1)
deg_df.head()

In [None]:
# Show only the rows flagged as significant
deg_df.loc[deg_df['is_signif']]

In [None]:
# Write the rows flagged as significant to a CSV file (row index not written)
signif_df = deg_df.loc[deg_df['is_signif']]
signif_df.to_csv(f'{data_path}/analyses/dea/thyAgeing_dea_{col_cell_type}_genderEffect_signif.csv', index = False)

## Convert to csv for supp data

In [None]:
import pickle
import pandas as pd


def _dea_results_to_frame(obj):
    """Flatten unpickled DEA results into a single DataFrame ready for CSV export.

    Parameters
    ----------
    obj : dict, list or pandas.DataFrame
        A dict of per-cell-type result tables (keys become the ``cell_type`` column),
        a list of result tables, or a single table.

    Returns
    -------
    pandas.DataFrame
        Table whose row labels have been moved into columns, so that writing it with
        ``index=False`` does not lose them.
    """
    if isinstance(obj, dict):
        # Dict keys form the outer index level, the tables' own index the inner one
        return pd.concat(obj).reset_index(names=['cell_type', 'gene_name'])

    if isinstance(obj, list):
        # A list carries no cell-type keys, so only the tables' own index is available
        obj = pd.concat(obj)

    if isinstance(obj.index, pd.RangeIndex):
        # Default positional index: nothing worth keeping
        return obj
    if obj.index.nlevels == 1 and obj.index.name is None:
        # presumably the unnamed index holds gene names, as in the per-cell-type tables - verify
        return obj.reset_index(names='gene_name')
    return obj.reset_index()


for file in ['thyAgeing_dea_taa_l3_adult_vs_infant_sexEffect', 'thyAgeing_dea_taa_l4_adult_vs_infant_sexEffect', 'thyAgeing_dea_taa_l4_F_vs_M_genderEffect_adult']:

    pkl_path = f'{data_path}analyses/dea/{file}.pkl'
    out_csv = f'{data_path}analyses/dea/{file}.csv'

    # NOTE: pickle must only be used on trusted files (here: results written by this notebook)
    with open(pkl_path, 'rb') as f:
        obj = pickle.load(f)

    df = _dea_results_to_frame(obj)

    df.to_csv(out_csv, index=False)
    print(f'Saved CSV to: {out_csv}')

In [None]:
# Load adata (same object version and curated annotation as at the top of the notebook)
# NOTE(review): the path ends in `.zarr` but is read with `ad.read_h5ad` - confirm the on-disk format.
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata (left join on the obs index, so all cells are kept)
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno, how = 'left')

# Filter data (only include annotated cells, i.e. anno_status == 'include')
adata = adata[adata.obs['anno_status'] == 'include',:]

# Update metadata from the latest 'Thymus_ageing_metadata' file in the metadata directory
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
# presumably updates adata.obs in place, matching on the 'index' column - verify against utils.update_obs
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Display the object as a quick sanity check
adata

In [None]:
# Keep only the metadata rows whose 'index' value occurs in the single-cell object, then export
sc_index_values = adata.obs['index'].unique()
in_single_cell = latest_meta['index'].isin(sc_index_values)
meta_ms = latest_meta[in_single_cell]
meta_ms.to_csv(f'{data_path}/metadata/thyAgeing_metadata_singleCell.csv', index = False)