# Trajectory analysis

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import hdf5plugin
import scFates as scf

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Add repo path to sys path (allows to access scripts and metadata from repo)
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

print('Dir for plots: {}'.format(plots_path))
print('Dir for data: {}'.format(data_path))

# Formatting
from matplotlib import font_manager
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize,thyAgeing_colors,thyAgeing_greys

In [None]:
# Load adata
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — presumably the file is HDF5 despite its extension; confirm.
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata
# NOTE(review): the annotation table is tagged v4_2025-02-04 while the object is v5_2025-04-03.
# The left join keeps every cell, so barcodes absent from the CSV end up with NaN annotations —
# confirm these two versions are meant to be paired.
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno, how = 'left')

# Update metadata (picks the latest 'Thymus_ageing_metadata*' file found in the metadata directory)
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
adata.obs['taa_l4'].value_counts()['NK_tr']

## DN(early) bottleneck

In [None]:
# Define which cell states to select
ctoi = ['B_dev_thy','T_DN(early)', 'T_DN(P)', 'pDC', 'ILC', 'NK_tr', 'T_MAIT'] # 'CMP', 'Myelocyte', 'Neutrophil', 'Promonocyte', 'Mono', 'DC1', 'DC1-prolif',

# Define columns
col_cell_type_fine = 'taa_l4'
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, include_ct=ctoi)
col_age_group = 'age_group'
# Look up the `<column>_levels` list (imported from anno_levels) by name in the notebook
# namespace; a plain lookup gives the same object as eval() without executing a string as code.
col_age_group_levels = globals()[f'{col_age_group}_levels']

In [None]:
col_cell_type_fine_levels

In [None]:
# Subset adata
adata_sub = adata[adata.obs[col_cell_type_fine].isin(ctoi)]

# # Update metadata
# latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
# latest_meta = pd.read_excel(latest_meta_path)
# update_obs(adata_sub, latest_meta, on = 'index', ignore_warning = True)

adata_sub.obs[col_cell_type_fine].value_counts()

### Generate PCA & UMAP embeddings

In [None]:
# Remove cell cycle genes
# Read the gene list inside a context manager so the file handle is closed deterministically
with open('/nfs/team205/vk8/processed_data/regev_lab_cell_cycle_genes.txt') as cc_file:
    cell_cycle_genes = [x.strip() for x in cc_file]
# Materialise the subset: the layer assignment below would otherwise have to turn the
# AnnData view into a copy implicitly
adata_sub = adata_sub[:,~adata_sub.var.index.isin(cell_cycle_genes)].copy()

# Log-normalize (the matrix as it was before normalisation is kept in the 'counts' layer)
adata_sub.layers['counts'] = adata_sub.X.copy()
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

In [None]:
adata_sub.obs['study'].value_counts()

In [None]:
# Remove studies with fewer than `min_cells` cells
# NOTE(review): this comment previously said 50 while the threshold below is 10 — confirm the intended cutoff.
min_cells = 10
study_counts = adata_sub.obs['study'].value_counts()
studies_to_keep = study_counts[study_counts >= min_cells].index
adata_sub = adata_sub[adata_sub.obs['study'].isin(studies_to_keep)]

In [None]:
# Select HVG and run PCA
sc.pp.highly_variable_genes(adata_sub, n_top_genes=500, flavor='seurat', n_bins=20, batch_key = 'study')
sc.pp.pca(adata_sub, n_comps=10, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata_sub)

In [None]:
# Perform UMAP
sc.pp.neighbors(adata_sub, n_pcs=8, n_neighbors=20, random_state=42)
sc.tl.umap(adata_sub)

with plt.rc_context({'figure.figsize' : calc_figsize(width = 70, height = 40)}):
    sc.pl.umap(adata_sub, color=[col_cell_type_fine, col_age_group], 
               ncols=2, wspace=0.3, return_fig=True, show = False, size = 1, frameon = False)
    plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_umap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
from scvi_wrapper import run_scvi
# Rebinds `object_version` (previously the version tag of the loaded atlas object) to a
# date-stamped tag that is used for the scVI outputs of this subset.
object_version = f'v1_{today}'

# Run scvi
scvi_run = run_scvi(adata_sub, 
                    # NOTE(review): by this point `adata_sub.X` has been passed through normalize_total + log1p
                    # and the pre-normalisation matrix lives in layers['counts'] — confirm that `run_scvi`
                    # should read 'X' here rather than the 'counts' layer.
                    layer_raw = 'X', 
                    # Excluded genes
                    include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
                    exclude_vdjgenes = True, remove_cite = False,
                    # Highly variable gene selection
                    batch_hv="study", hvg = 1000, span = 0.5,
                    hvg_selection = 'experimental',
                    # scVI 
                    batch_scvi="sample",
                    cat_cov_scvi=["donor", "chemistry_simple", "sex"], 
                    cont_cov_scvi=[], 
                    max_epochs=400, batch_size=2000, early_stopping = True, early_stopping_patience = 15, early_stopping_min_delta = 10.0,
                    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20}, 
                    n_layers = 3, n_latent = 30, dispersion = 'gene-batch',
                    # Leiden clustering
                    leiden_clustering = None, col_cell_type = ['taa_l4'], 
                    fig_dir = f'{plots_path}/phenoAnalysis/trajectories/', fig_prefix = f'thyAgeing_dnInnateSplit_scvi_{object_version}')

In [None]:
scvi_run['data'].obs['path_cellranger_arc'].dtype

In [None]:
# Save adata and scvi model
overwrite = True

scvi_adata = scvi_run['data']
# Build each output location once so the existence check and the write refer to the same file
adata_out_path = f'{data_path}/objects/rna/thyAgeing_dnInnateSplit_scvi_{object_version}.zarr'
model_out_path = f'{model_path}/thyAgeing_dnInnateSplit_scvi_{object_version}'

# Cast object and categorical obs columns to plain strings before writing
for c in scvi_adata.obs.columns:
    col_dtype = scvi_adata.obs[c].dtype
    if pd.api.types.is_object_dtype(col_dtype) or isinstance(col_dtype, pd.CategoricalDtype):
        scvi_adata.obs[c] = scvi_adata.obs[c].astype(str)

# Prediction / probability / annotation columns are not stored with the object
anno_cols = [c for c in scvi_adata.obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c]

# The original check looked in `data_path` itself while the object is written to
# `data_path/objects/rna`, so an existing object was never detected when overwrite=False.
if not os.path.exists(adata_out_path) or overwrite:
    scvi_adata.obs = scvi_adata.obs.drop(columns=anno_cols)
    scvi_adata.write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(model_out_path, save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

### Run Palantir (diffusion mapping)

In [None]:
import palantir 
use_rep_diffusion = 'X_umap'
n_diffusion_comp = 30
dm_res = palantir.utils.run_diffusion_maps(adata_sub, pca_key=use_rep_diffusion, n_components=n_diffusion_comp)#palantir.utils.run_diffusion_maps(pd.DataFrame(adata_sub.obsm[use_rep_diffusion]), n_components=n_diffusion_comp)
plt.scatter(np.arange(n_diffusion_comp), dm_res['EigenValues'])

In [None]:
# Generate neighbor graph in multiscale diffusion space
ms_data = palantir.utils.determine_multiscale_space(adata_sub,n_eigs=11)
adata_sub.obsm["X_palantir"]= ms_data.values
sc.pp.neighbors(adata_sub,n_neighbors=20,use_rep="X_palantir")

with plt.rc_context({"figure.figsize": calc_figsize(width = 60, height = 35)}):
    sc.pl.embedding(adata_sub, basis='X_palantir', color=[col_cell_type_fine, col_age_group], legend_loc='on data', return_fig = True)
    plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_palantirEmbed_umap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
# Impute gene expression values
imputed_X = palantir.utils.run_magic_imputation(adata_sub, dm_res = dm_res)

In [None]:
ct_markers = {'ETP' : ['CD34'],
              'T lin' : ['CD3D', 'CD3E', 'NOTCH1', 'TCF7', 'PTCRA', 'RAG1', 'RAG2', 'DNTT'],
              'B lin' : ['VPREB1', 'IFITM3', 'IGLL1', 'CD79A', #'POU2AF1', 'PAX5' #-> not really expressed
                         ],
              'ILC' : ['ID2', 'ZBTB16', 'NFIL3', 'IL7R', 'KIT', 'KLRB1'],
              'NK' : ['EOMES', 'NCAM1', 'PRF1', 'KLRD1'],
              'pDC lin' : ['LILRA4', 'CLEC4C', 'RUNX2', 'TYROBP']}

In [None]:
with plt.rc_context({"figure.figsize": calc_figsize(width = 60, height = 35)}):
    sc.pl.DotPlot(adata_sub, 
                var_names=ct_markers, 
                groupby=col_cell_type_fine, 
                categories_order=col_cell_type_fine_levels,
                #figsize = calc_figsize(width = 90, height = 0),
                mean_only_expressed=True,
                cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True, n_colors=10),
                ).add_totals().savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_linMarkers_dotplot.pdf')

In [None]:
adata_sub.obs.groupby([ col_cell_type_fine, 'age_group',]).size()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine].isin(['ILC', 'T_MAIT'])].obs.groupby(['age_group','donor'], observed=True).size()

In [None]:
with plt.rc_context({"figure.figsize": calc_figsize(width = 80, height = 90)}):
    sc.pl.embedding(
        adata_sub,
        basis="X_umap",
        layer="MAGIC_imputed_data",
        color=[g for sublist in ct_markers.values() for g in sublist],
        cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True, n_colors=10),
        ncols = 5,
        frameon=False,
        return_fig=True,    
        show = False
    )
    plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_lineageMarkers_imputed_umap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
from scipy.stats import gaussian_kde

def select_high_expr_cell(adata, gene_name : str) :
    """
    Return the name of the observation with the highest value for one gene.

    Parameters:
    adata: AnnData-like object; it is sliced as ``adata[:, gene_name]`` and ``adata[row]``.
    gene_name (str): Variable (gene) whose column of ``X`` is searched for its maximum.

    Returns:
    The first entry of ``obs_names`` of the single-row slice at the arg-max position.
    """
    # Position of the maximum within the single-gene column of X
    top_row = adata[:, gene_name].X.argmax()
    top_cell = adata[top_row]
    return top_cell.obs_names[0]

def calculate_centroid(adata, cell_population, embedding='X_umap', col_cell_type=None):
    """
    Return the barcode of the cell located at the density peak of a cell population.

    A Gaussian kernel density estimate is fitted to the population's coordinates in
    ``adata.obsm[embedding]`` and evaluated at each of those same cells; the cell with the
    highest estimated density is returned. Despite the function name, this is the densest
    observed cell, not the arithmetic centroid.

    Parameters:
    adata (anndata.AnnData): The annotated data matrix.
    cell_population (str): Label of the cell population to locate.
    embedding (str): Key in ``adata.obsm`` holding the coordinates. Default is 'X_umap'.
    col_cell_type (str or None): Column of ``adata.obs`` holding the population labels.
        When None (default), the notebook-level ``col_cell_type_fine`` is used, which is
        what the function always used before this parameter existed.

    Returns:
    str: The barcode of the cell with the highest estimated density.

    Raises:
    ValueError: If no cell in ``adata`` carries the label ``cell_population``.
    """
    if col_cell_type is None:
        # Backward-compatible default: fall back to the notebook-level column name
        col_cell_type = col_cell_type_fine

    # Filter the data for the specific cell population
    subset = adata[adata.obs[col_cell_type] == cell_population]
    if len(subset.obs_names) == 0:
        raise ValueError(
            f"No cells labelled '{cell_population}' found in adata.obs['{col_cell_type}']"
        )

    # gaussian_kde expects variables in rows and observations in columns, hence the transpose
    coords = subset.obsm[embedding].T
    density = gaussian_kde(coords)(coords)

    # Cell at which the estimated density is highest
    return subset.obs_names[np.argmax(density)]

# Root of the trajectory: the T_DN(early) cell with the highest CD34 value
# (alternative: start_cell = calculate_centroid(adata_sub, 'T_DN(early)'))
is_early_dn = adata_sub.obs[col_cell_type_fine] == 'T_DN(early)'
start_cell = select_high_expr_cell(adata_sub[is_early_dn], 'CD34')

# Terminal states: T_DN(P) is anchored at its density peak in the embedding, every other
# lineage at the cell with the highest value of one lineage marker gene.
# (Earlier alternative, not used: cells at the extremes of the UMAP axes.)
terminal_marker_genes = {'B_dev_thy': 'IFITM3', 'pDC': 'CLEC4C', 'ILC': 'KIT', 'NK_tr': 'EOMES'}
terminal_barcodes = [calculate_centroid(adata_sub, 'T_DN(P)')]
for lineage, marker_gene in terminal_marker_genes.items():
    lineage_cells = adata_sub[adata_sub.obs[col_cell_type_fine] == lineage]
    terminal_barcodes.append(select_high_expr_cell(lineage_cells, marker_gene))

# Series maps barcode (index) -> terminal state label (value)
terminal_cells = pd.Series(["T_DN(P)", *terminal_marker_genes], index=terminal_barcodes)

# Show the start cell together with the terminal cells on the UMAP
all_cells = terminal_cells.copy()
all_cells[start_cell] = 'T_DN(early)'
palantir.plot.highlight_cells_on_umap(adata_sub, all_cells)
plt.show()

In [None]:
# Run palantir
pr_res = palantir.core.run_palantir(
    adata_sub, 
    early_cell=start_cell,
    terminal_states=terminal_cells,
    knn = 20,
    num_waypoints=500, 
    use_early_cell_as_start=True,
    seed = 20,
)

In [None]:
import palantir

In [None]:
p = palantir.plot.plot_palantir_results(adata_sub, s=3, 
                                        cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True, n_colors=10))
p.set_size_inches(calc_figsize(width = 120, height = 60))
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_pseudotime_palantir_blue.pdf', bbox_inches='tight', dpi = 300)

In [None]:
# Select branches
masks = palantir.presults.select_branch_cells(adata_sub, q=.05, eps=.005)

palantir.plot.plot_branch_selection(adata_sub)
p.set_size_inches(calc_figsize(width = 100, height = 100))
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_branches_palantir.pdf', bbox_inches='tight', dpi = 300)

In [None]:
# Plot trajectories: one panel per branch, cells coloured by Palantir entropy
trajectory_branches = ["B_dev_thy", "T_DN(P)", "pDC"]
fig, axes = plt.subplots(1, 3, figsize=calc_figsize(width = 80, height = 30))
cmap = sns.blend_palette(
    [thyAgeing_colors['teal'], thyAgeing_colors['yellow'], thyAgeing_colors['orange'], thyAgeing_colors['magenta']],
    as_cmap=True,
    n_colors=10,
)

# Identical settings for every branch; only the branch name and target axis change
for branch_ax, branch in zip(axes, trajectory_branches):
    palantir.plot.plot_trajectory(
        adata_sub,
        branch,
        cell_color="palantir_entropy",
        n_arrows=5,
        color="black",
        scanpy_kwargs=dict(cmap=cmap),
        arrowprops=dict(arrowstyle="-|>,head_length=.05,head_width=.05"),
        ax=branch_ax,
    )
    branch_ax.set_title(branch)

plt.tight_layout()
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_branches_palantir_umap.pdf', bbox_inches='tight', dpi = 300)


In [None]:
# Inspect gene trends
gene_trends = palantir.presults.compute_gene_trends(
    adata_sub,
    expression_key="MAGIC_imputed_data",
)

In [None]:
p = palantir.plot.plot_gene_trends(adata_sub, [g for sublist in ct_markers.values() for g in sublist],)
p.set_size_inches(calc_figsize(width = 60, height = 247))
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_lineageMarkers_palantir_lines.pdf', bbox_inches='tight', dpi = 300)

In [None]:
import matplotlib as mpl
from mpl_toolkits.axes_grid1 import make_axes_locatable
norm = mpl.colors.Normalize(vmin=0, vmax=2)
goi = [g for sublist in ct_markers.values() for g in sublist]
goi.remove('RAG1')
goi.remove('RAG2')
goi.remove('CD3E')
p = palantir.plot.plot_gene_trend_heatmaps(
    adata_sub, 
    genes=goi,
    scaling='none',
    gene_trend_key='gene_trends',
    cmap=sns.blend_palette(
        [thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']],
        as_cmap=True, n_colors=10
    ),
    basefigsize=calc_figsize(width=30, height=50),
    branch_names=["B_dev_thy", "T_DN(P)", "pDC"],
    norm=norm,
    cbkwargs={'label': 'log2FC', 'ticks': np.linspace(0, 2, 5), 'aspect': 30, 'pad': 0.05}
)

# Adjust colorbar height to match the plot
fig = p if isinstance(p, mpl.figure.Figure) else plt.gcf()
for ax in fig.axes:
    # Find colorbar axes
    if hasattr(ax, 'get_ylabel') and ax.get_ylabel() == 'log2FC':
        pos = ax.get_position()
        ax.set_position([pos.x0, pos.y0, pos.width, fig.axes[0].get_position().height])
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_lineageMarkers_palantir_heatmap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
# Add vdj data
meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrgd_v6.csv', index_col = 0)
# Drop TCR columns left over from a previous run before joining: DataFrame.join raises a
# ValueError when column names overlap and no suffix is given, so without this the cell
# fails when it is executed a second time.
adata_sub.obs = adata_sub.obs.drop(columns = meta_tcr.columns, errors = 'ignore').join(meta_tcr)

In [None]:
# Add vdj data
meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrgd_v6.csv', index_col = 0)
# Drop TCR columns left over from a previous run before joining: DataFrame.join raises a
# ValueError when column names overlap and no suffix is given, so without this the cell
# fails when it is executed a second time.
adata.obs = adata.obs.drop(columns = meta_tcr.columns, errors = 'ignore').join(meta_tcr)

In [None]:
# Removes every column of `meta_tcr` from `adata_sub.obs` in place.
# NOTE(review): the cells below read `productive_VDJ`, `d_call_VDJ_main` and `j_call_VDJ_main`
# from `adata_sub.obs`. If this cell runs in notebook order (after the join and before those
# cells), those lookups will fail with a KeyError — confirm whether this cell is only a manual
# reset and should be skipped in a top-to-bottom run.
adata_sub.obs.drop(columns = meta_tcr.columns, inplace = True)

In [None]:
meta_tcr.loc[meta_tcr['productive_VDJ'] != 'No_contig']

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'ILC'].obs[['productive_VDJ']].value_counts()

In [None]:
meta_tcr.columns

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'ILC'].obs[['d_call_VDJ_main', 'j_call_VDJ_main']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'T_MAIT'].obs[['productive_VDJ']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'NK_tr'].obs[['d_call_VDJ_main', 'j_call_VDJ_main']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'pDC'].obs[['d_call_VDJ_main', 'j_call_VDJ_main']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'NK_tr'].obs[['productive_VDJ']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'pDC'].obs[['productive_VDJ']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'B_dev_thy'].obs[['productive_VDJ']].value_counts()

In [None]:
adata_sub[adata_sub.obs[col_cell_type_fine] == 'T_DN(P)'].obs[['productive_VDJ']].value_counts()

In [None]:
adata[adata.obs[col_cell_type_fine] == 'T_DN(Q)'].obs[['productive_VDJ']].value_counts()

In [None]:
adata[adata.obs[col_cell_type_fine] == 'T_DP(P)'].obs[['productive_VDJ']].value_counts()

In [None]:
adata[adata.obs[col_cell_type_fine] == 'T_DP(Q)'].obs[['productive_VJ']].value_counts()

### Expression of gene clusters

In [None]:
ct = 'T_DN(P)'

# Determine gene clusters
# Union of the highly variable genes and the marker genes of interest. `sorted` gives a
# stable gene order; a bare set has an order that can change between kernel sessions.
highly_variable_genes = sorted(set(adata_sub.var[adata_sub.var['highly_variable']].index.tolist() + goi))
communities = palantir.presults.cluster_gene_trends(adata_sub, ct, highly_variable_genes, n_neighbors=30)

palantir.plot.plot_gene_trend_clusters(adata_sub, ct)
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_geneClusters_{ct}_palantir_lineplot.pdf', bbox_inches='tight', dpi = 300)

# Export the gene -> cluster assignment. The path is already a complete f-string; the
# extra `.format(ct)` in the original did nothing and would raise if the path ever
# contained a literal brace.
gene_clusters_path = f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_geneClusters_{ct}_palantir.xlsx'
(
    pd.DataFrame(communities, columns=['cluster'])
    .reset_index(names = 'gene')
    .sort_values(by = 'cluster')
    .to_excel(gene_clusters_path, index = False, sheet_name=ct)
)

In [None]:
community_dict = {f'cluster_{category}': communities[communities == category].index.tolist() for category in communities.cat.categories}
import pprint
pprint.pprint(community_dict, compact=True)

### Fate probabilities in ETP

In [None]:
# Fate probabilities
df = adata.obs[['donor', 'sample', 'age_group', col_cell_type_fine]].copy()
df = df.join(adata_sub.obsm['palantir_fate_probabilities'])
df.dropna(subset = 'pDC', inplace = True)
df = df.melt(id_vars = ['donor', 'sample', 'age_group', col_cell_type_fine], var_name = 'cell_type', value_name = 'fate_prob')
df = df.groupby(['donor', 'sample', 'age_group', col_cell_type_fine, 'cell_type'], observed = True)['fate_prob'].mean().reset_index()
df = df.groupby(['donor', 'age_group', col_cell_type_fine, 'cell_type'], observed = True)['fate_prob'].mean().reset_index()
df.head()

In [None]:
from plotting.utils import plot_grouped_boxplot,get_tint_palette
df['cell_type'] = df['cell_type'].str.replace('T_', '').str.replace('_thy', '')
plot_grouped_boxplot(data = df.loc[df[col_cell_type_fine] == 'T_DN(early)'], x = 'cell_type', y = 'fate_prob', hue = col_age_group, order = ['DN(P)', 'B_dev', 'pDC'], hue_order = ['infant', 'paed', 'adult', 'aged'],
                     palette =  get_tint_palette(thyAgeing_colors['magenta']),
                     x_label = 'Cell population', y_label = 'Fate probability', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 40, height = 45),
                     save_stats = f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_fateProbs_palantir_',
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'}, ylim = (0,1),
                     )
#plt.xticks(rotation=45, ha='right')
plt.yticks([0, 0.5, 1])
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_fateProbs_palantir_boxplot.pdf', dpi = 300, bbox_inches = 'tight')

In [None]:
for c in adata_sub.obs.columns:
        if pd.api.types.is_object_dtype(adata_sub.obs[c]):
                adata_sub.obs[c] = adata_sub.obs[c].astype(str)
                
adata_sub.write_h5ad(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_palantir.zarr',
                compression=hdf5plugin.FILTERS["zstd"],
                compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
        )

In [None]:
#adata_sub = ad.read_h5ad(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_dnInnateSplit_palantir_.zarr')

## scFates

In [None]:
# # Remove cell cycle genes
# cell_cycle_genes = [x.strip() for x in open('/nfs/team205/vk8/processed_data/regev_lab_cell_cycle_genes.txt')]
# adata_sub = adata_sub[:,~adata_sub.var.index.isin(cell_cycle_genes)]

# # Log-normalize
# adata_sub.layers['counts'] = adata_sub.X.copy()
# sc.pp.normalize_total(adata_sub, target_sum=1e4)
# sc.pp.log1p(adata_sub)

# # Select HVG and run PCA
# sc.pp.highly_variable_genes(adata_sub, n_top_genes=500, flavor='seurat', n_bins=20, batch_key='study')
# sc.pp.pca(adata_sub, n_comps=20, use_highly_variable=True, svd_solver='arpack')

# sc.pl.pca(adata_sub, color = [col_cell_type_fine, col_age_group], ncols=2, wspace=0.5)

# # Perform UMAP
# adata_sub.obsm['X_umap_scVI'] = adata_sub.obsm['X_umap'].copy()
# sc.pp.neighbors(adata_sub, n_pcs=9, n_neighbors=20, random_state=42)
# sc.tl.umap(adata_sub)

# sc.pl.umap(adata_sub, color=[col_cell_type_fine, col_age_group], ncols=2, wspace=0.5, return_fig=True)
# plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_lateSplit_umap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
import palantir 
use_rep_diffusion = 'X_scVI'
n_diffusion_comp = 30
dm_res = palantir.utils.run_diffusion_maps(adata_sub, pca_key=use_rep_diffusion, n_components=n_diffusion_comp)#palantir.utils.run_diffusion_maps(pd.DataFrame(adata_sub.obsm[use_rep_diffusion]), n_components=n_diffusion_comp)
plt.scatter(np.arange(n_diffusion_comp), dm_res['EigenValues'])

In [None]:
# Generate neighbor graph in multiscale diffusion space
ms_data = palantir.utils.determine_multiscale_space(adata_sub,n_eigs=10)
adata_sub.obsm["X_palantir"]= ms_data.values
sc.pp.neighbors(adata_sub,n_neighbors=20,use_rep="X_palantir")

# Plot the cells on the 'X_palantir' basis, coloured by cell type and age group, and save the figure
sc.pl.embedding(adata_sub, basis='X_palantir', color=[col_cell_type_fine, col_age_group], legend_loc='on data', return_fig = True)
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_etpSplit_palantirEmbed_umap.pdf', bbox_inches='tight', dpi = 300)

In [None]:
# draw ForceAtlas2 embedding using 2 first PCs as initial positions
adata_sub.obsm["X_palantir2d"]=adata_sub.obsm["X_palantir"][:,:2]
sc.tl.draw_graph(adata_sub,init_pos='X_palantir2d', n_jobs = 64)

sc.pl.draw_graph(adata_sub,color=col_cell_type_fine,legend_loc='on data')
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_etpSplit_palantirEmbed_forceAtlas.pdf', bbox_inches='tight', dpi = 300)

In [None]:
for c in adata_sub.obs.columns:
        if pd.api.types.is_object_dtype(adata_sub.obs[c]) or isinstance(adata_sub.obs[c].dtype, pd.CategoricalDtype):
                adata_sub.obs[c] = adata_sub.obs[c].astype(str)
                
adata_sub.write_h5ad(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_etpSplit_palantir_.zarr',
                compression=hdf5plugin.FILTERS["zstd"],
                compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
        )

In [None]:
adata_sub = ad.read_h5ad(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_etpSplit_palantir_.zarr')

In [None]:
scf.tl.tree(adata_sub,method="ppt",Nodes=80,use_rep="X_palantir",
            device="cpu",seed=1,ppt_lambda=100,ppt_sigma=0.025,ppt_nsteps=200)

In [None]:
scf.pl.graph(adata_sub)

In [None]:
scf.tl.root(adata_sub,21)

In [None]:
scf.tl.pseudotime(adata_sub,n_jobs=20,n_map=100,seed=42)

In [None]:
scf.pl.trajectory(adata_sub)

In [None]:
sc.pl.draw_graph(adata_sub,color=["seg","milestones"])

In [None]:
scf.tl.rename_milestones(adata_sub,['BifA', 'BifB', 'T_Treg', 'T_αβT(entry)', 'BifC', 'T_CD8_naive','T_Treg(agonist)', 'T_CD4_naive'])

In [None]:
scf.pl.milestones(adata_sub, annotate=True, show = False)
plt.savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_milestones.pdf', dpi = 300, bbox_inches = 'tight')

In [None]:
# for c in adata_sub.obs.columns:
#         if pd.api.types.is_object_dtype(adata_sub.obs[c]) or isinstance(adata_sub.obs[c].dtype, pd.CategoricalDtype):
#                 adata_sub.obs[c] = adata_sub.obs[c].astype(str)
        
# adata_sub.write_h5ad(f'{data_path}/objects/rna/thyAgeing_lateSplit_palantir.zarr',
#                      compression=hdf5plugin.FILTERS["zstd"],
#                      compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

### Differential expression

In [None]:
# Load adata with trajectory
# NOTE(review): this replaces the `adata_sub` used in the sections above with a different object.
# The only code in this notebook that writes `thyAgeing_lateSplit_palantir.zarr` is commented out,
# so the file must already exist on disk — confirm where it is produced.
adata_sub = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_lateSplit_palantir.zarr')

In [None]:
# Log-normalize
adata_sub.layers['counts'] = adata_sub.X.copy()
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

In [None]:
def get_expressed_genes(adata: ad.AnnData, groupby : str, group : str, min_frac : float = 0.1) -> list:
    """
    List the genes detected in more than a given fraction of the cells of one group.

    A gene counts as detected in a cell when its entry in ``adata.X`` is non-zero.

    Parameters:
    adata (anndata.AnnData): The annotated data matrix.
    groupby (str): Column of ``adata.obs`` that defines the groups.
    group (str): Value of ``groupby`` selecting the cells to evaluate.
    min_frac (float): A gene is kept when the fraction of cells in the group with a
        non-zero value is strictly greater than this threshold. Default is 0.1.

    Returns:
    list: Names of the genes passing the threshold (a plain list, so results for several
        groups can be concatenated with ``+``). Empty when the group contains no cells.
    """
    # Compute the group mask once and reuse it for both the count and the subset
    in_group = adata.obs[groupby] == group
    n_obs_group = int(in_group.sum())
    if n_obs_group == 0:
        # No cells in the group: nothing is expressed; also avoids dividing by zero
        return []

    # Number of cells with a non-zero value per gene; np.asarray(...).ravel() flattens the
    # 1 x n_vars result that sparse matrices return from sum(axis=0)
    n_cells_detected = np.asarray(adata[in_group].X.astype(bool).sum(axis=0)).ravel()
    ecf = n_cells_detected / n_obs_group

    return list(adata.var_names[ecf > min_frac])

In [None]:
adata_sub.obs['t']

#### abT(entry) -> mature T cells

In [None]:
adata_sub_seg = adata_sub[(adata_sub.obs['seg'] == '5') & (adata_sub.obs[col_age_group].isin(['adult', 'infant']))].copy()
adata_sub_seg.obs[col_age_group] = pd.Categorical(adata_sub_seg.obs[col_age_group], categories = ['infant', 'adult'], ordered=True)
features_to_test = list(set(get_expressed_genes(adata_sub_seg, col_age_group, 'infant') + get_expressed_genes(adata_sub_seg, col_age_group, 'adult')))

print(f'Number of features to test: {len(features_to_test)}')

In [None]:
# Amplitude test (trend_test=False) between age groups on the selected segment
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=False, fdr_cut=0.05, n_jobs=4, n_map=1)

In [None]:
seg5_degs = adata_sub_seg.var.copy()
seg5_degs['log2FC'] = - seg5_degs['infant->adult_lfc']

seg5_degs.head()

In [None]:
seg5_degs[seg5_degs['cov_fdr'] < .05].sort_values('log2FC').head(50)

In [None]:
seg5_degs[seg5_degs['cov_fdr'] < .05].sort_values('log2FC', ascending=False).head(50)

In [None]:
seg5_degs.to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_seg5_amplitudeTest.csv')

In [None]:
# Trend test
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=True, fdr_cut=0.05, n_jobs=4, n_map=1)

In [None]:
seg5_trend_degs = adata_sub_seg.var.copy()
seg5_trend_degs['log2FC'] = - seg5_trend_degs['infant->adult_lfc']

seg5_trend_degs.sort_values('log2FC').head(50)

In [None]:
seg5_trend_degs.sort_values('log2FC', ascending=False).head(50)

In [None]:
seg5_trend_degs.to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_seg5_trendTest.csv')

#### Mature T cells

In [None]:
adata_sub_seg = adata_sub[(adata_sub.obs['milestones'] == 'T_CD4_naive') & (adata_sub.obs[col_age_group].isin(['adult', 'infant']))].copy()
adata_sub_seg.obs[col_age_group] = pd.Categorical(adata_sub_seg.obs[col_age_group], categories = ['infant', 'adult'], ordered=True)
features_to_test = list(set(get_expressed_genes(adata_sub_seg, col_age_group, 'infant') + get_expressed_genes(adata_sub_seg, col_age_group, 'adult')))

print(f'Number of features to test: {len(features_to_test)}')

In [None]:
# Amplitude test (trend_test=False)
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=False, fdr_cut=0.05, n_jobs=4, n_map=1)

# Collect the per-gene results written to `.var`, keeping only rows with a non-missing `cov_signi`
degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
# Sign-flipped copy of `infant->adult_lfc`
# NOTE(review): presumably so that positive values mean higher in adult — confirm the direction of `infant->adult_lfc`
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD4_naive_amplitudeTest.csv')

In [None]:
# Trend test
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=True, fdr_cut=0.05, n_jobs=4, n_map=1)

degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD4_naive_trendTest.csv')

In [None]:
# Restrict to cells of the 'T_CD8_naive' milestone from the infant and adult age groups
adata_sub_seg = adata_sub[(adata_sub.obs['milestones'] == 'T_CD8_naive') & (adata_sub.obs[col_age_group].isin(['adult', 'infant']))].copy()
adata_sub_seg.obs[col_age_group] = pd.Categorical(adata_sub_seg.obs[col_age_group], categories = ['infant', 'adult'], ordered=True)
# Test every gene that passes the expression filter in at least one of the two age groups
features_to_test = list(set(get_expressed_genes(adata_sub_seg, col_age_group, 'infant') + get_expressed_genes(adata_sub_seg, col_age_group, 'adult')))

print(f'Number of features to test: {len(features_to_test)}')

# Amplitude test (trend_test=False)
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=False, fdr_cut=0.05, n_jobs=4, n_map=1)

# Collect the per-gene results written to `.var`, keeping only rows with a non-missing `cov_signi`
degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
# Sign-flipped copy of `infant->adult_lfc`
# NOTE(review): presumably so that positive values mean higher in adult — confirm the direction of `infant->adult_lfc`
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD8_naive_amplitudeTest.csv')

# Trend test (trend_test=True); the results table is rebuilt from `.var` after this second call
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=True, fdr_cut=0.05, n_jobs=4, n_map=1)

degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD8_naive_trendTest.csv')

In [None]:
# Restrict to cells of the 'T_Treg' milestone from the infant and adult age groups
adata_sub_seg = adata_sub[(adata_sub.obs['milestones'] == 'T_Treg') & (adata_sub.obs[col_age_group].isin(['adult', 'infant']))].copy()
adata_sub_seg.obs[col_age_group] = pd.Categorical(adata_sub_seg.obs[col_age_group], categories = ['infant', 'adult'], ordered=True)
# Test every gene that passes the expression filter in at least one of the two age groups
features_to_test = list(set(get_expressed_genes(adata_sub_seg, col_age_group, 'infant') + get_expressed_genes(adata_sub_seg, col_age_group, 'adult')))

print(f'Number of features to test: {len(features_to_test)}')

# Amplitude test (trend_test=False)
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=False, fdr_cut=0.05, n_jobs=4, n_map=1)

# Collect the per-gene results written to `.var`, keeping only rows with a non-missing `cov_signi`
degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
# Sign-flipped copy of `infant->adult_lfc`
# NOTE(review): presumably so that positive values mean higher in adult — confirm the direction of `infant->adult_lfc`
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_Treg_amplitudeTest.csv')

# Trend test (trend_test=True); the results table is rebuilt from `.var` after this second call
scf.tl.test_covariate(adata_sub_seg, features = features_to_test, group_key = col_age_group, trend_test=True, fdr_cut=0.05, n_jobs=4, n_map=1)

degs = adata_sub_seg.var.copy()
degs = degs.loc[~degs['cov_signi'].isna()]
degs['log2FC'] = - degs['infant->adult_lfc']

degs.sort_values('infant->adult_lfc').to_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_Treg_trendTest.csv')

In [None]:
# Load all DEG
cd8_deg = pd.read_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD8_naive_trendTest.csv')
cd4_deg = pd.read_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_CD4_naive_trendTest.csv')
treg_deg = pd.read_csv(f'{data_path}/analyses/phenoAnalysis/trajectories/thyAgeing_lateSplit_scFates_T_Treg_trendTest.csv')

In [None]:
def intersect(*d):
    """
    Return the set of elements common to all given iterables.

    Parameters:
    *d: Any number of iterables of hashable elements.

    Returns:
    set: Elements present in every iterable; an empty set when no iterable is given.
    """
    # The original called `sets.next()`, the Python 2 iterator method, which raises
    # AttributeError on Python 3.
    sets = [set(x) for x in d]
    if not sets:
        return set()
    return set.intersection(*sets)

In [None]:
from supervenn import supervenn

In [None]:
# Genes with log2FC > 1, one set per DEG table (CD8, CD4, Treg order)
up_sets = [
    set(deg_table.loc[deg_table['log2FC'] > 1]['gene_name'].tolist())
    for deg_table in (cd8_deg, cd4_deg, treg_deg)
]
up_p = supervenn(up_sets, set_annotations=['CD8', 'CD4', 'Treg'])

up_p

In [None]:
import pprint
pprint.pprint(up_p.chunks, compact=True)

- Hypoxia: ANKRD37
- CXCR4: CXCR4 downregulation correlates with TCR-mediated signalling in DP thymocytes, resulting in overall downregulation in mature CD4/CD8 T cells ([Lucas, 2017](https://www.nature.com/articles/s41598-017-05182-7))

In [None]:
# Genes with log2FC < -1, one set per DEG table (CD8, CD4, Treg order)
down_sets = [
    set(deg_table.loc[deg_table['log2FC'] < -1]['gene_name'].tolist())
    for deg_table in (cd8_deg, cd4_deg, treg_deg)
]
down_p = supervenn(down_sets, set_annotations=['CD8', 'CD4', 'Treg'])

down_p

In [None]:
pprint.pprint(down_p.chunks, compact=True)

- CD38 downregulation: CD38 is a marker of recent thymic emigrants (RTEs) ([Bohacova, 2024](https://www.cell.com/immunity/fulltext/S1074-7613(24)00418-7)); the paper also shows an age-related decline of CD38 in naive T cells in blood
- SIRPG-low CD8 T cells have a lower activation threshold and are implicated in autoimmunity ([Sinha, 2018](https://www.nature.com/articles/s41598-018-33901-1))

In [None]:
import itertools
sc.pl.DotPlot(adata_sub[adata_sub.obs[col_cell_type_fine].isin(['T_CD8_naive', 'T_CD4_naive', 'T_Treg'])],
              var_names = ['S1PR1', 'CD38', 'CXCR4', 'SIRPG'], groupby = [col_cell_type_fine, col_age_group], 
              categories_order= [f'{c1}_{c2}' for c1,c2 in list(itertools.product(['T_CD8_naive', 'T_CD4_naive', 'T_Treg'], col_age_group_levels))],
              mean_only_expressed=True, 
              cmap = 'magma').add_totals().savefig(f'{plots_path}/phenoAnalysis/trajectories/thyAgeing_lateSplit_exitDegs_dotplot.pdf', dpi = 300, bbox_inches = 'tight')

In [None]:
session_info.show()