# Thymus ageing atlas - T/NK compartment : Integration of mature T cells with PBMC T cells

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
from sklearn.metrics import f1_score

# import scvi
# import torch
# torch.cuda.is_available()

# Add repo path to sys path (allows to access scripts and metadata from repo)
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

%load_ext autoreload
%autoreload 2

from annotate_ct import get_kNN_predictions
#from scvi_wrapper import run_scvi
from utils import get_latest_version,update_obs,freq_by_donor
from plotting import plot_grouped_boxplot

In [None]:
# Directories used throughout the notebook, all derived from the repo root
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the two most frequently used locations
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

## Load data

### TAA

In [None]:
# Load TAA data (backed mode: the expression matrix stays on disk until `.to_memory()`)
object_version = 'v8_2024-11-07'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr', backed = 'r')

# Decode text columns that were stored as bytes.
# Only bytes values are decoded: `Series.str.decode` does not raise on values that
# are already `str`, it silently turns them into NaN, which would wipe such columns.
for column in adata.obs.columns:
    if pd.api.types.is_object_dtype(adata.obs[column]):
        adata.obs[column] = adata.obs[column].map(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )

# Add knn predictions to adata (original HTSA reference does not have uncertainties)
ct_anno = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_tSplit_scvi_{object_version}_finalAnno.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno)

# Update metadata from the most recent metadata sheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Subset to mature T cells.
# `na = False` treats cells without a `final_anno` (e.g. not present in the joined
# annotation table) as non-matching; otherwise the mask contains NaN and cannot be
# used for boolean indexing.
mature_t_mask = adata.obs['final_anno'].str.contains('T_CD8|T_CD4|Treg|MAIT|ɣδT', na = False)
adata = adata[mature_t_mask].to_memory()
adata.shape

### Yoshida2021 (PBMC)

In [None]:
# Open the PBMC reference on disk and pull only the T/ILC lineages into memory
pbmc_lineages = ['T CD8+', 'T CD4+', 'T reg', 'T g/d', 'ILC', 'MAIT']
adata_pbmc = ad.read_h5ad(f'{data_path}/references/Yoshida2021_pbmc.zarr', backed = 'r')
pbmc_keep = adata_pbmc.obs['yoshida_l0'].isin(pbmc_lineages)
adata_pbmc = adata_pbmc[pbmc_keep].to_memory()

# Overview of the finer annotation level among the retained cells
adata_pbmc.obs['yoshida_l1'].value_counts()

### DominguezConde2022 (cross-tissue)

In [None]:
# Load the cross-tissue reference and harmonise its metadata with the other datasets
adata_tissue = ad.read_h5ad(f'{data_path}/references/DominguezConde2022_crossTissue.h5ad')
# Collapse the assay description into 5' vs 3' chemistry (anything without "5'" is labelled 3GEX)
adata_tissue.obs['chemistry_simple'] = adata_tissue.obs['assay'].apply(lambda x : '5GEX' if "5'" in x else '3GEX')
adata_tissue.obs['study'] = 'DominguezConde2022'
adata_tissue.obs.rename(columns = {'donor_id':'donor', 'Manually_curated_celltype' : 'crossTissue_l0'}, inplace = True)
# Keep only the columns needed downstream
adata_tissue.obs = adata_tissue.obs[['donor', 'crossTissue_l0', 'chemistry_simple', 'study', 'sex', 'tissue']]
# One pseudo-sample per donor, prefixed with the study name
adata_tissue.obs['sample'] = 'DominguezConde2022' + '_' + adata_tissue.obs['donor'].astype(str)

# Rename vars: keep the original index as `gene_ids` and index genes by symbol
# NOTE(review): gene symbols may not be unique — confirm before concatenating
adata_tissue.var['gene_ids'] = adata_tissue.var.index
adata_tissue.var_names = adata_tissue.var['gene_symbols']

# Get raw counts
# NOTE(review): assumes `.raw` holds unnormalised counts over the same genes as `.X` — TODO confirm
adata_tissue.X = adata_tissue.raw.X.copy()

adata_tissue

### Concatenate

In [None]:
# Concatenate datasets (order: TAA, PBMC, cross-tissue)
# NOTE(review): later cells match TAA cells through a '-0' barcode suffix, so the
# order of the datasets here presumably must not change — confirm
adata = adata.concatenate([adata_pbmc, adata_tissue])

adata

In [None]:
# Check whether X is counts or normalized
adata.X[:,:100].sum(axis = 1)

In [None]:
adata.obs['study'].value_counts()

In [None]:
# Exclude every cell that originates from the Campinoti et al. 2020 study
not_campinoti = adata.obs['study'] != 'Campinoti2020'
adata = adata[not_campinoti].copy()

## Integrate

In [None]:
# Version tag for the integrated object: 'v1_' followed by today's date
object_version = f'v1_{today}'

# Run scvi
# NOTE(review): `run_scvi` is not in scope as written — its import
# (`from scvi_wrapper import run_scvi`) is commented out in the import cell, as are
# `scvi` and `torch`; re-enable them in an environment with scvi before running.
# The result is used below as a dict with 'data' (AnnData) and 'vae' (model) entries.
scvi_run = run_scvi(adata, 
                    layer_raw = 'X', 
                    # Excluded genes
                    include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
                    exclude_vdjgenes = True, remove_cite = False,
                    # Highly variable gene selection
                    batch_hv="study", hvg = 3000, span = 0.5,
                    # scVI 
                    batch_scvi="sample",
                    cat_cov_scvi=["donor", "chemistry_simple", "sex"], 
                    cont_cov_scvi=[], 
                    max_epochs=400, batch_size=2000, early_stopping = True, early_stopping_patience = 15, early_stopping_min_delta = 10.0,
                    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20}, 
                    n_layers = 3, n_latent = 30, dispersion = 'gene-batch',
                    # Leiden clustering
                    leiden_clustering = None, col_cell_type = ['final_anno', 'yoshida_l1', 'crossTissue_l0'], 
                    fig_dir = f'{plots_path}/preprocessing', fig_prefix = f'thyAgeing_tSplitxTissue_scvi_{object_version}')

In [None]:
# Save adata and scvi model
overwrite = True

# Store free-text obs columns as bytes before writing
for c in scvi_run['data'].obs.columns:
    if scvi_run['data'].obs[c].dtype == 'O':
        scvi_run['data'].obs[c] = scvi_run['data'].obs[c].astype('|S')

# The existence check must look at the file that is actually written
# (the integrated `tSplitxTissue` object, not the TAA-only `tSplit` object).
integrated_object_path = f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}.zarr'
if not os.path.exists(integrated_object_path) or overwrite:
    scvi_run['data'].write_h5ad(
        integrated_object_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_tSplitxTissue_scvi_{object_version}', save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

## Leiden clustering

In [None]:
object_version = 'v1_2024-11-29'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}.zarr')

adata

In [None]:
# Leiden clustering at every requested resolution; labels go to `leiden_r<res>`
res_list = [5]
leiden_keys = []
for res in res_list:
    leiden_key = f"leiden_r{res}"
    sc.tl.leiden(adata, resolution = res, key_added = leiden_key)
    leiden_keys.append(leiden_key)

# Persist the cluster labels next to the integrated object
adata.obs[leiden_keys].to_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}_leidenClusters.csv')

In [None]:
# Reload the integrated object and attach the stored cluster labels and CD45 isoform expression
object_version = 'v1_2024-11-29'
integrated_prefix = f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}'
adata = ad.read_h5ad(f'{integrated_prefix}.zarr')

# Leiden labels are read back from CSV and turned into categoricals
leiden_clus = pd.read_csv(f'{integrated_prefix}_leidenClusters.csv', index_col = 0)
adata.obs = adata.obs.join(leiden_clus)
for leiden_col in leiden_clus.columns:
    adata.obs[leiden_col] = adata.obs[leiden_col].astype('category')

# CD45RO/CD45RA values are indexed by the TAA barcodes; add the '-0' suffix so they
# line up with the barcodes of the integrated object
cd45_exp = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v8_2024-11-07_CD45RO_CD45RA.csv', index_col = 0)
cd45_exp.index = cd45_exp.index + '-0'
adata.obs = adata.obs.join(cd45_exp)

adata

In [None]:
sc.pl.umap(adata, color = ['leiden_r5'], ncols = 2, wspace=0.5, legend_fontsize=6, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_leidenClusters.png', bbox_inches = 'tight')

In [None]:
adata.obs[['CD45RO', 'CD45RA']].describe()

In [None]:
sc.pl.umap(adata, color = ['CD45RO', 'CD45RA'])

## Annotations per cluster

In [None]:
# Summarise, per Leiden cluster, how the labels of each annotation source are distributed.
df = adata.obs[['leiden_r5', 'final_anno', 'yoshida_l1', 'crossTissue_l0']].copy()
# Long format: one row per (cell, annotation source); cells without a label from that source are dropped
df = df.melt(id_vars = 'leiden_r5', value_name='category', var_name='anno').dropna()
# `freq`: fraction of each label within a (cluster, annotation source) pair
# `freq_anno`: fraction of a cluster's labelled entries that come from each annotation source
# (the merge uses the columns shared by both tables)
df = df.groupby(['leiden_r5', 'anno'])['category'].value_counts(normalize=True).to_frame('freq').reset_index() \
    .merge(df.groupby(['leiden_r5'])['anno'].value_counts(normalize=True).to_frame('freq_anno').reset_index())
# Cluster ids as strings so they are plotted as discrete labels
df['leiden_r5'] = df['leiden_r5'].astype(str)

df

In [None]:
# Dot plot of label frequencies per Leiden cluster, one panel per annotation source.
# Dot size encodes `freq`, colour encodes `freq_anno`.
# FacetGrid creates and sizes its own figure (height x aspect per panel), so no
# separate `plt.figure(...)` is opened: that only produced an unused blank figure.
with sns.plotting_context('paper', font_scale = 1.4):
    sns.set_style("whitegrid", {'grid.color': '.9'})
    g = sns.FacetGrid(df, col="anno", col_wrap=3, sharex=False, sharey=False, height=16, aspect=0.4)
    g.map_dataframe(sns.scatterplot, x='category', y='leiden_r5', size='freq', hue='freq_anno', legend=True, sizes=(20, 200))
    g.set_xticklabels(rotation=90)
    g.set_axis_labels('Category', 'Leiden R5')
    g.fig.subplots_adjust(top=0.9, hspace=0.4, wspace=0.1)  # Adjust hspace and wspace for margins
# Save through the grid so the right figure is written regardless of the current pyplot figure
g.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_leiden_vs_anno_dotplot.png', bbox_inches='tight', dpi=300)

## Differential abundance testing for tissue enrichment

In [None]:
# Fill in missing tissue labels: Yoshida cells are blood, every other unlabelled cell is thymus
tissue_missing = adata.obs['tissue'].isna()
from_yoshida = adata.obs['study'].str.contains('Yoshida')
adata.obs.loc[tissue_missing & from_yoshida, 'tissue'] = 'blood'

still_missing = adata.obs['tissue'].isna()
adata.obs.loc[still_missing, 'tissue'] = 'thymus'

In [None]:
sc.pl.umap(adata, color = 'tissue')

In [None]:
# Differential abundance with Milo (pertpy implementation)
import pertpy
milo = pertpy.tl.Milo()

# Construct nhoods
# Wrap the AnnData in a MuData; the cells are accessed below under the "rna" key
mdata = milo.load(adata)
# kNN graph on the scVI latent space, then neighbourhoods built with prop=0.1
sc.pp.neighbors(mdata["rna"], use_rep="X_scVI", n_neighbors=100)
milo.make_nhoods(mdata["rna"], prop=0.1)

In [None]:
# Count cells per nhood and sample, test for differential abundance, then build the nhood graph
# Count nhoods: a "sample" is a tissue x donor combination
mdata['rna'].obs['sample_donor'] = mdata['rna'].obs['tissue'].astype(str) + '_' + mdata['rna'].obs['donor'].astype(str)
mdata = milo.count_nhoods(mdata, sample_col="sample_donor")

# Create and reorder categories
# (by default, the last category is taken as the condition of interest)
# Every tissue other than blood and thymus is pooled into 'tissue'
mdata["rna"].obs["da_group"] = [x if x in ['blood', 'thymus'] else 'tissue' for x in mdata["rna"].obs["tissue"]]
mdata["rna"].obs["da_group"] = mdata["rna"].obs["da_group"].astype("category")
mdata["rna"].obs["da_group"] = mdata["rna"].obs["da_group"].cat.reorder_categories(['thymus',"tissue","blood"])

# Differential abundance testing
milo.da_nhoods(mdata, design="~sex+da_group")

# Build nhood graph
milo.build_nhood_graph(mdata)

In [None]:
# Plot the differential-abundance logFC per nhood on the Milo nhood graph
# (point size scales with the nhood size)
sc.pl.embedding(mdata["milo"].T, "X_milo_graph",
                    color=['logFC'], cmap="PuOr",
                    size=mdata["milo"].T.obs["Nhood_size"]*0.2,neighbors_key="nhood",
                    # edge_width =
                    sort_order=False,
                    frameon=False,
                    return_fig = True,
                    show=False)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_miloNhoodGraph_tissueFC_umap.png', dpi=300, bbox_inches='tight')

In [None]:
# Determine nhood and cell enrichment
# A nhood is called only if it is significant (SpatialFDR) AND its |logFC| exceeds the
# threshold: positive logFC -> 'blood', negative logFC -> 'tissue'.
# The negative branch must test `logFC < -threshold`; testing `logFC < threshold`
# would label every significant nhood with a small or even positive logFC as 'tissue'.
logfc_threshold = 1.3
fdr_threshold = .05
nhood_stats = mdata['milo'].var
nhood_significant = nhood_stats['SpatialFDR'] < fdr_threshold
mdata['milo'].var['nhood_enrichment'] = np.select(
    [nhood_significant & (nhood_stats['logFC'] > logfc_threshold),
     nhood_significant & (nhood_stats['logFC'] < -logfc_threshold)],
    ['blood', 'tissue'],
    default = 'None')

# Per cell: (# 'blood' nhoods it belongs to) - (# 'tissue' nhoods it belongs to).
# `@` makes the matrix-vector product explicit (`*` is element-wise for scipy sparse arrays).
nhood_vals = mdata['milo'].var['nhood_enrichment'].map({'blood': 1, 'tissue': -1, 'None': 0})
cell_vals = np.asarray(mdata['rna'].obsm['nhoods'] @ nhood_vals.to_numpy()).ravel()

# A cell is assigned a label only if the net count exceeds `nhood_threshold`
nhood_threshold = 3
mdata['rna'].obs['nhood_enrichment'] = ['blood' if x > nhood_threshold else 'tissue' if x < -nhood_threshold else 'None' for x in cell_vals]

sc.pl.umap(mdata['rna'], color = 'nhood_enrichment', return_fig = True, legend_fontsize = 6, show = False)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_tissueFC_umap.png', dpi=300, bbox_inches='tight')

In [None]:
mdata.write_h5mu(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}_milo.zarr',
                compression=hdf5plugin.FILTERS["zstd"],
                compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
        )

In [None]:
mdata['rna'].obs['nhood_enrichment'].to_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}_nhoodEnrichment.csv')

## Marker expression

In [None]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Curated marker sheet -> {fine cell label: [marker genes]}
ct_markers_df = pd.read_excel(f'{data_path}/curated/curatedCellMarkers_LMM.xlsx')
ct_markers = ct_markers_df.groupby('cell_label_fine')['marker_gene'].agg(list).to_dict()

# Manual grouping of the fine labels into plotting panels
ct_groups = {'dev' : ['T_DN(P)', 'T_DN(Q)', 'T_DN(early)', 'T_DP', 'T_DP(P)', 'T_DP(Q)', 'T_ETP', 'T_⍺β(entry)'],
 'cd4' : ['T_CD4', 'T_CD4_FH', 'T_CD4_h1', 'T_CD4_h17', 'T_T(agonist)', 'T_Th17like(fetal)', 'T_Treg', 'T_Treg(diff)'],
 'cd8' : ['T_CD8', 'T_CD8_CTL', 'T_CD8_mem'],
 'innate' : ['T_ɣδT','T_CD8⍺⍺(I)', 'T_CD8⍺⍺(II)', 'T_CD8⍺⍺(NKT)','T_ILC', 'T_MAIT', 'NK', 'NKT', 'NK_CD56hi', 'NK_CD56lo', 'NK_tr']}

# Marker labels that are not covered by any group
grouped_labels = {label for labels in ct_groups.values() for label in labels}
[label for label in ct_markers if label not in grouped_labels]

In [None]:
# Innate markers
filtered_dict = {k:list(set([g for g in v if g in adata.var_names])) for k, v in ct_markers.items() if k in ct_groups['innate']}
filtered_dict['T_αβT(entry)'] = list(set(ct_markers['T_⍺β(entry)']))
sc.pl.DotPlot(adata, 
              var_names = filtered_dict, 
            groupby = 'leiden_r5',
            mean_only_expressed=True,
            cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_leidenClusters_innateMarkers.png')

In [None]:
# CD8 markers
plt.rcParams.update(plt.rcParamsDefault)
filtered_dict = {k:list(set([g for g in v if g in adata.var_names])) for k, v in ct_markers.items() if k in ct_groups['cd8']}
filtered_dict['T_αβT(entry)'] = list(set(ct_markers['T_⍺β(entry)']))
filtered_dict['T_CD8_CTL'].extend(['NKG7', 'ID2', 'ITGAE', 'EOMES', 'HLA-DRB1', 'CD69', 'IFNG', 'TNF', 'GNLY', 'GZMB', 'PRF1'])
filtered_dict['T_CD8_exhausted'] = ['LAG3', 'HAVCR2', 'CTLA4', 'PDCD1']
filtered_dict['T_CD8_recirc'] = ['PRDM1', 'PTPRC']
filtered_dict['T_CD8_naive'] = ['IL7R', 'LEF1', 'FOXO1']
filtered_dict['T_CD8_prolif'] = ['MKI67']
filtered_dict['T_selection'] = ['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11']
sc.pl.DotPlot(adata, 
              var_names = filtered_dict, 
            groupby = 'leiden_r5',
            mean_only_expressed=True,
            cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_leidenClusters_cd8Markers.png')

In [None]:
# CD4 markers
filtered_dict = {k:list(set([g for g in v if g in adata.var_names])) for k, v in ct_markers.items() if k in ct_groups['cd4']}
filtered_dict['T_αβT(entry)'] = ct_markers['T_⍺β(entry)']
filtered_dict['T_CD4_recirc'] = ['PRDM1', 'PTPRC', 'LAG3', 'HAVCR2', 'CTLA4', 'PDCD1']
filtered_dict['T_CD4_r1_recirc'] = ['LAG3', 'IL10', 'TGFB1', 'PRDM1', 'GZMA', 'IFNG','PDCD1']
filtered_dict['T_T(agonist)'].extend(['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11'])
filtered_dict['T_CD4_FH'].extend(['CD40LG', 'PDCD1', 'BCL6', 'IL21', 'CXCL13', 'SLAMF6', 'CD200', 'MAF', 'CXCR5'])
filtered_dict['T_CD4_h1'].extend(['IFNG', 'TNF', 'TBX21', 'STAT1', 'STAT4', 'CXCR3', 'IL12RB2', 'LY6E', 'GZMK'])
filtered_dict['T_CD4_h17'].extend(['IL17A', 'IL17F', 'IL22', 'IL21', 'RORC', 'STAT3', 'AHR', 'CCR6', 'IL23R', 'CCL20'])
sc.pl.DotPlot(adata, 
              var_names = filtered_dict, 
            groupby = 'leiden_r5',
            mean_only_expressed=True,
            cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_leidenClusters_cd4Markers.png')

## Assign annotations

In [None]:
# Manual mapping of annotation label -> Leiden (r5) clusters.
# Special keys: 'prev' keeps the previous `final_anno` for those clusters,
# 'remove' marks clusters to be discarded.
cluster_assignments = {
   'T_CD4_recirc' : [1,14,18,19,20,24,29,36,39,44,63,73], 
   'T_CD4' : [6,13,21,60,75,35],
   'T_CD4_h_recirc' : [33,40],
   'T_CD4_fh_recirc': [10,25],
   'T_CD4_r1_recirc': [77,56],
   'T_CD4_em' : [47,48,65,32,11],
   'T_CD4_act' : [61],
   
   'T_Treg' : [67,4],
   'T_Treg_recirc' : [9,58,23],
   
   'T_CD8' : [17,28,30,41],
   'T_CD8_recirc': [3,5,45,69],
   'T_CD8_cm' : [],
   'T_CD8_rm' : [12,27,34,57,59,78,62,
                 15,22,43], # Gut
   'T_CD8_em' : [55,8,38,52,74,42],
   'T_CD8_act' : [55],
   
   'T_MAIT' : [54,26],
   'ILC' : [49],
   'T_ɣδT' : [70],
   'T_ɣδT_rm' : [51,68],
   
   'NK_CD56lo': [0,46,53],
   'NK_CD56hi': [37],
   'NK_cycling' : [66],
   
   'prev' : [2,7,16],
   'remove' : [31,50,64,71,65,76,72],
}

# Clusters listed under more than one label are ambiguous: when the mapping is applied
# label by label, the entry that comes last silently wins. Report them so the
# ambiguity is visible (currently 55: T_CD8_em/T_CD8_act and 65: T_CD4_em/remove).
assigned_clusters = [c for clusters in cluster_assignments.values() for c in clusters]
seen_clusters = set()
duplicated_clusters = sorted({c for c in assigned_clusters if c in seen_clusters or seen_clusters.add(c)})
if duplicated_clusters:
    print(f'Clusters assigned to more than one label: {duplicated_clusters}')

# Clusters that have not been assigned to any label yet
leftover_clusters = [c for c in adata.obs['leiden_r5'].unique() if c not in seen_clusters]
np.array(leftover_clusters)

In [None]:
np.array(adata.obs['temp_anno'].unique())

In [None]:
# Assign temporary annotation
# Labels are applied in the order of `cluster_assignments`; if a cluster is listed under
# several labels, the assignment made last overwrites the earlier one.
adata.obs['temp_anno'] = pd.NA
for anno,l in cluster_assignments.items():
    if anno == 'remove':
        # Clusters marked for removal are reset to missing
        adata.obs.loc[adata.obs['leiden_r5'].isin(l), 'temp_anno'] = pd.NA
    elif anno == 'prev':
        # Keep the previous `final_anno`, but only where it is one of the labels used here
        # (or a CD8αα label); other cells in these clusters stay missing
        adata.obs.loc[(adata.obs['leiden_r5'].isin(l)) & (adata.obs['final_anno'].isin(list(cluster_assignments.keys()) + ['T_CD8⍺⍺(II)', 'T_CD8⍺⍺(I)', 'T_CD8⍺⍺(entry)'])), 'temp_anno'] = adata.obs['final_anno']
    else:
        adata.obs.loc[adata.obs['leiden_r5'].isin(l), 'temp_anno'] = anno
    
sc.pl.umap(adata, color = 'temp_anno', wspace = 0.5, return_fig = True, cmap = 'tab20')
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_tempAnno.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# Save annotations; cells without a temporary annotation are written as 'remove'.
# The missing values must be replaced BEFORE the string conversion: after
# `astype(str)` they are ordinary strings and `pd.isna` no longer finds them,
# so 'remove' would never be written.
late_anno = adata.obs[['temp_anno']].copy()
temp_labels = late_anno['temp_anno'].astype(object)  # object dtype so a new label can be inserted
late_anno['temp_anno'] = temp_labels.where(temp_labels.notna(), 'remove').astype(str)
late_anno.to_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{object_version}_tempAnno.csv')

In [None]:
adata.obs['temp_anno'].value_counts()

In [None]:
t_markers = pd.read_excel(f'{data_path}/curated/matureT_markers.xlsx')
t_markers = t_markers.groupby('population')['gene'].agg(list).to_dict()  

sc.pl.DotPlot(adata, 
            groupby='temp_anno',
            var_names=t_markers,
            mean_only_expressed=True,
            cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitxTissue_scvi_{object_version}_tNkMarkers_dotplot.png', dpi=300, bbox_inches='tight')

sc.pl.DotPlot(adata[~adata.obs['study'].isin(['DominguezConde2022', 'Yoshida2021'])], 
            groupby='temp_anno',
            var_names=t_markers,
            mean_only_expressed=True,
            cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplit_scvi_{object_version}_tNkMarkers_dotplot.png', dpi=300, bbox_inches='tight')

## CD45RO expression

In [None]:
from typing import List
def alevin_to_anndata(library : str, index : str, path : str, isoforms_to_include : List | None = None) -> ad.AnnData:
    """Load the alevin quantification of one library into an AnnData.

    Reads `quants_mat.mtx.gz`, `quants_mat_cols.txt` and `quants_mat_rows.txt`
    from `<path>/<library>/alevin/alevin/`.

    Parameters
    ----------
    library
        Library name; used as sub-directory of `path` and stored in `obs['library']`.
    index
        Prefix for the row names (`<index>-<row name>`); also stored in `obs['index']`.
    path
        Directory that contains one sub-directory per library.
    isoforms_to_include
        Optional list of column (isoform) names to keep. `None` keeps all columns.

    Returns
    -------
    AnnData with one row per entry of `quants_mat_rows.txt` and one column per
    (retained) entry of `quants_mat_cols.txt`.
    """
    quant_dir = f'{path}/{library}/alevin/alevin'
    mat = sc.read_mtx(f'{quant_dir}/quants_mat.mtx.gz')
    var = pd.read_csv(f'{quant_dir}/quants_mat_cols.txt', header = None)[0]
    obs = pd.read_csv(f'{quant_dir}/quants_mat_rows.txt', header = None)[0]
    obs = index + '-' + obs
    alevin = ad.AnnData(X = mat.X, obs = pd.DataFrame(index = obs), var = pd.DataFrame(index = var))
    alevin.obs['index'] = index
    alevin.obs['library'] = library

    if isoforms_to_include is not None:
        # Copy the subset instead of returning a view, so the full matrix of this
        # library can be released instead of being kept alive by the view.
        alevin = alevin[:, alevin.var_names.isin(isoforms_to_include)].copy()

    return alevin

In [None]:
iso_meta = adata.obs[['index', 'library']].drop_duplicates()

iso_meta

In [None]:
# Collect PTPRC isoform counts for every library that has alevin output
alevin_path = '/lustre/scratch126/cellgen/team361/lm25/nf-pipelines/alevin'
ptprc_isoforms = ['PTPRC-204', 'PTPRC-215', 'PTPRC-201', 'PTPRC-214', 'PTPRC-209','PTPRC-216', 'PTPRC-210', 'PTPRC-203', 'PTPRC-202', 'PTPRC-206','PTPRC-213', 'PTPRC-207', 'PTPRC-208', 'PTPRC-205', 'PTPRC-211','PTPRC-212', 'PTPRC-217']
alevin_adata = []
for lib, i in zip(iso_meta['library'], iso_meta['index']):
    # Libraries without an alevin directory are reported and skipped
    if not os.path.exists(f'{alevin_path}/{lib}'):
        print(f'No alevin data for {lib}')
        continue

    alevin = alevin_to_anndata(library = lib, index = i, path = alevin_path,
                               isoforms_to_include = ptprc_isoforms)
    alevin_adata.append(alevin)

In [None]:
alevin_adata = ad.concat(alevin_adata, index_unique=None, merge = 'same')

np.intersect1d(alevin_adata.obs_names, adata.obs_names).shape

In [None]:
alevin_adata.layers['counts'] = alevin_adata.X.copy()
sc.pp.normalize_total(alevin_adata, target_sum=1e4)
sc.pp.log1p(alevin_adata)
iso_exp = pd.DataFrame(alevin_adata.X.todense(), index = alevin_adata.obs_names, columns = alevin_adata.var_names)

In [None]:
iso_exp.rename(columns = {'PTPRC-201' : 'CD45RO', 'PTPRC-209' : 'CD45RA'})[['CD45RO', 'CD45RA']].to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v8_2024-11-07_CD45RO_CD45RA.csv')

In [None]:
# Attach the isoform expression to obs and expose PTPRC-201/-209 as CD45RO/CD45RA.
# Columns from an earlier run (raw isoform names or the renamed CD45RO/CD45RA) are
# dropped first; `errors = 'ignore'` keeps the cell runnable when they are not there
# yet, and dropping the renamed columns avoids duplicate CD45RO/CD45RA columns.
stale_columns = list(alevin_adata.var_names) + ['CD45RO', 'CD45RA']
adata.obs = adata.obs.drop(columns = stale_columns, errors = 'ignore').join(iso_exp)
adata.obs.rename(columns = {'PTPRC-201' : 'CD45RO', 'PTPRC-209' : 'CD45RA'}, inplace = True)

In [None]:
sc.pl.umap(adata, color = ['CD45RO', 'CD45RA'])

In [None]:
adata.obs[['CD45RO', 'CD45RA']].to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_CD45RO_CD45RA.csv')

## Update TAA annotations

In [None]:
# Load TAA data
object_version = 'v8_2024-11-07'
ta_adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Add knn predictions to adata (original HTSA reference does not have uncertainties)
ct_anno = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_finalAnnoExpr.csv', index_col = 0)
ta_adata.obs = ta_adata.obs.join(ct_anno)

# Update metadata
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(ta_adata, latest_meta, on = 'index', ignore_warning = True)

ta_adata

In [None]:
# The temporary annotations were written for the integrated object (v1_2024-11-29);
# `object_version` refers to the TAA object (v8) at this point, so the version of the
# integrated object is spelled out here.
integrated_version = 'v1_2024-11-29'
late_anno = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_{integrated_version}_tempAnno.csv', index_col = 0)

# Create dict for updating annotations: {annotation: [TAA barcodes]}
# TAA cells carry the '-0' suffix in the integrated object. Strip it only at the END of
# the barcode: `str.replace('-0', '')` would also remove any '-0' inside the barcode.
anno_dict = late_anno.loc[late_anno.index.str.endswith('-0')].reset_index(names = 'barcode')
anno_dict['barcode'] = anno_dict['barcode'].str[:-len('-0')]
anno_dict = anno_dict.groupby('temp_anno')['barcode'].agg(list).to_dict()

anno_dict

In [None]:
# Overwrite `final_anno` with the annotations derived from the integrated object
ta_adata.obs['final_anno'] = ta_adata.obs['final_anno'].astype(str)
for ct,l in anno_dict.items():
    ta_adata.obs.loc[l, 'final_anno'] = ct

# Manual correction: 'T_CD8_act' cells of Leiden (r2.5) cluster 29 are relabelled.
# The cluster id is compared numerically so the match works whether the labels are
# stored as integers or as strings/categoricals ('29' == 29 would silently match nothing).
leiden_r25 = pd.to_numeric(ta_adata.obs['leiden_r2.5'].astype(str), errors = 'coerce')
ta_adata.obs.loc[(ta_adata.obs['final_anno'] == 'T_CD8_act') & (leiden_r25 == 29), 'final_anno'] = 'T_Treg_recirc'
ta_adata.obs['final_anno'] = ta_adata.obs['final_anno'].astype('category')

In [None]:
ta_adata.obs['final_anno'].value_counts()

In [None]:
# Fold small populations into a larger one: {target label: [labels merged into it]}
rename_minor_ct = {'NK_tr' : ['NK_CD56lo', 'NK_CD56hi', 'NK_cycling'],
                   'T_CD4_h_recirc' : ['T_CD4_h1_recirc'],
                   'T_ɣδT' : ['T_ɣδT_rm'],
                   'T_CD8_recirc': ['T_CD8_CM_recirc'],
                   'T_CD8_rm' : ['T_CD8_mem_recirc'],
                   'T_CD8_em' : ['T_CD8_act'],
                   'T_CD4_act' : ['T_iTreg']}

# Flat lookup minor -> major; no target label is itself merged elsewhere, so a single
# pass over the labels gives the same result as applying the entries one by one
minor_to_major = {minor: major for major, minors in rename_minor_ct.items() for minor in minors}
current_labels = ta_adata.obs['final_anno'].astype(str)
merged_labels = current_labels.map(lambda label: minor_to_major.get(label, label))
ta_adata.obs['final_anno'] = merged_labels.astype('category')

In [None]:
# Drop the cells labelled 'T_CD8_Treg'
# NOTE(review): the original comment said "Remove nan cells", but the filter only
# removes the 'T_CD8_Treg' label — confirm which of the two is intended
ta_adata = ta_adata[ta_adata.obs['final_anno'] != 'T_CD8_Treg'].copy()
# `remove_unused_categories(inplace=True)` no longer exists in pandas >= 2.0; assign the result instead
ta_adata.obs['final_anno'] = ta_adata.obs['final_anno'].cat.remove_unused_categories()

sc.pl.umap(ta_adata, color = 'final_anno', return_fig = True, legend_fontsize = 6, show = False)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_finalAnno.png', dpi = 300, bbox_inches='tight')

### Add annotation levels

In [None]:
# Check if all annotations have been matched
anno_matched_levels = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v4_2025-01-22_levels.xlsx')
np.setdiff1d(ta_adata.obs['final_anno'].unique(), anno_matched_levels['orig_taa_l5'].unique())

In [None]:
# Check all cells are fully annotated across levels
all_anno = ta_adata.obs[['final_anno']].copy().reset_index(names='barcode')
all_anno = all_anno.merge(anno_matched_levels, left_on = 'final_anno', right_on = 'orig_taa_l5')

all_anno.isna().sum()

In [None]:
# Check levels
all_anno.drop(columns = 'barcode').drop_duplicates()

In [None]:
# Add annotations to adata and sanity check on UMAP
ta_adata.obs =  ta_adata.obs.join(all_anno.drop(columns = ['final_anno', 'orig_taa_l5']).set_index('barcode'))
sc.pl.umap(ta_adata, color = [c for c in ta_adata.obs.columns if 'taa' in c], ncols = 1, wspace = 0.5, legend_fontsize = 6, return_fig = True, show = False)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v6.png', dpi = 300, bbox_inches='tight') 

In [None]:
# Save annotations
all_anno.set_index('barcode').drop(columns = ['final_anno', 'orig_taa_l5']).to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v6.csv')

### Sanity check using marker gene expression

In [None]:
# Load TAA data
object_version = 'v8_2024-11-07'
ta_adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Add knn predictions to adata (original HTSA reference does not have uncertainties)
ct_anno = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v6.csv', index_col = 0)
ta_adata.obs = ta_adata.obs.join(ct_anno)

# Update metadata
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(ta_adata, latest_meta, on = 'index', ignore_warning = True)

ta_adata = ta_adata[ta_adata.obs['taa_l5'].notna()].copy()

ta_adata

In [None]:
sc.pp.normalize_total(ta_adata, target_sum=1e4)
sc.pp.log1p(ta_adata)

In [None]:
anno_levels = ['T_ETP', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'T_DP(P)', 'T_DP(intermediate)', 'T_DP(Q)','T_DP(Q)-stressed', 'T_αβT(entry)', 'T_Treg(agonist)',
               'T_CD8_naive', 'T_CD4_naive', 'T_Treg',
               'T_CD8_naive_recirc','T_CD8_rm','T_CD8_em', 'T_CD8_age-assoc',
                 'T_CD4_naive_recirc', 'T_CD4_act', 'T_CD4_h', 'T_CD4_fh', 'T_CD4_r1', 'T_CD4_em',
                 'T_Treg_recirc',
                 'T_CD8αα(entry)', 'T_CD8αα(I)', 'T_CD8αα(II)','T_MAIT', 'ILC','T_γδT',
                 'NK_tr', 'B_dev_thy']

anno_cat = {'dev': ['T_ETP', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'T_DP(P)', 'T_DP(intermediate)', 'T_DP(Q)','T_DP(Q)-stressed', 'T_αβT(entry)', 'T_Treg(agonist)', 'T_CD8_naive', 'T_CD4_naive', 'T_Treg', 'B_dev_thy'],
            'mature': ['T_CD8_naive_recirc','T_CD8_rm','T_CD8_em', 'T_CD8_age-assoc','T_CD4_act', 'T_CD4_naive_recirc', 'T_CD4_h_recirc', 'T_CD4_fh', 'T_CD4_r1', 'T_CD4_em','T_Treg_recirc',],
            'Innate' : ['T_CD8αα(entry)', 'T_CD8αα(I)', 'T_CD8αα(II)','T_MAIT', 'ILC','T_γδT', 'NK_tr']}

np.setdiff1d(ta_adata.obs['taa_l5'].unique().astype(str), anno_levels)

In [None]:
t_markers = pd.read_excel(f'{data_path}/curated/matureT_markers.xlsx')
t_markers = t_markers.groupby('population')['gene'].agg(list).to_dict()  

sc.pl.DotPlot(ta_adata, 
            groupby='taa_l5',
            categories_order=anno_levels,
            var_names=t_markers,
            mean_only_expressed=True,
            cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v6_tNkMarkers_dotplot.png', dpi=300, bbox_inches='tight')

In [None]:
sc.tl.rank_genes_groups(ta_adata, groupby='taa_l5', method='wilcoxon', n_genes=100, groups = ['T_CD8_age-assoc'])

In [None]:
sc.get.rank_genes_groups_df(ta_adata, group='T_CD8_age-assoc').to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_T_CD8_age-assoc_markers.csv')

### Sanity check on frequencies and spatial distribution

In [None]:
# Bring the per-cell nhood enrichment of the integrated object back onto the TAA cells
nhood_enrichment = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_v1_2024-11-29_nhoodEnrichment.csv', index_col=0)
# TAA cells carry the '-0' suffix in the integrated object
nhood_enrichment = nhood_enrichment.loc[nhood_enrichment.index.str.endswith('-0')]
# Strip the suffix at the END of the barcode only (`str.replace('-0', '')` would also
# remove any '-0' occurring inside the barcode and break the join)
nhood_enrichment.index = nhood_enrichment.index.str[:-len('-0')]
ta_adata.obs = ta_adata.obs.join(nhood_enrichment)

In [None]:
df_location = ta_adata[ta_adata.obs['sort'].isin(['TOT', 'CD3P'])].obs.groupby(['donor','final_anno'])['nhood_enrichment'].value_counts(normalize=True).to_frame('freq').reset_index()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Convert 'freq' to percentage
df_location['freq'] = df_location['freq'] * 100

# Plot grouped barplot
plt.figure(figsize=(15, 8))
sns.barplot(data=df_location, x='final_anno', y='freq', hue='nhood_enrichment', order = [l for l in anno_levels if l in df_location['final_anno'].unique()], hue_order = ['blood', 'tissue', 'None'], ci='sd')
plt.xlabel('Final Annotation')
plt.ylabel('Frequency (%)')
plt.xticks(rotation=90)
plt.legend(title='Nhood Enrichment')
plt.title('Grouped Barplot of Final Annotation by Nhood Enrichment')
plt.tight_layout()
plt.savefig(f'{plots_path}/grouped_barplot_final_anno_nhood_enrichment.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
we = ad.read_h5ad('/lustre/scratch126/cellgen/team361/lm25/data/for_pr16/objects/rna/wholeEmbryo_allGEX_scvi_v3_2024-10-15.zarr', backed = 'r')

In [None]:
we[we.obs['chemistry'] == '3GEX'].obs[['embryo', 'age_in_cs', 'section']].drop_duplicates().groupby(['embryo', 'age_in_cs'], observed = True).agg(n_sections = ('section', 'count'))