# Thymus ageing atlas - T/NK compartment : knn-transfer of TAA cell labels

In [None]:
import os
import sys
import session_info

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
from sklearn.metrics import f1_score

# Add repo path to sys path (allows to access scripts and metadata from repo)
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Add R libs path
#os.environ['LD_LIBRARY_PATH'] = '' # Uncomment on jhub
#os.environ['R_HOME'] = '/nfs/team205/lm25/condaEnvs/thymusAgeing/lib/R' # Uncomment on jhub
os.environ['R_LIBS_USER'] = f'{os.path.split(sys.path[0])[0]}/R/library'

%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Input/output locations used throughout the notebook, derived from the repo root
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the resolved directories so the notebook output records them
for _dir_label, _dir_path in (('plots', plots_path), ('data', data_path)):
    print(f'Dir for {_dir_label}: {_dir_path}')

## Load data

In [None]:
object_version = 'v8_2024-11-07'
# Load the T-split object (the file carries a `.zarr` suffix but is read with the h5ad reader)
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Attach the Leiden cluster columns exported alongside the object
leiden_clus = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv', index_col=0)
# Remove any cluster columns that are already in obs before re-joining them.
# `errors = 'ignore'` keeps this working when only some of the CSV columns are present
# (dropping the full column list would raise KeyError in that case).
adata.obs.drop(columns = leiden_clus.columns, errors = 'ignore', inplace = True)
adata.obs = adata.obs.join(leiden_clus)
# Cluster ids are labels, not quantities: store them as categoricals
adata.obs[leiden_clus.columns] = adata.obs[leiden_clus.columns].astype('category')

# Add celltypist predictions to adata
# (left join on the obs index: cells missing from the CSV get NaN in the new columns)
celltypist_predictions = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v3_2024-11-05_celltypist_taa_l1.csv', index_col=0)
adata.obs = adata.obs.join(celltypist_predictions, how = 'left')

# Add previous TAA annotations
# (every column is read as a categorical; existing copies of these columns are dropped
# first so the join does not collide with same-named columns)
ct_labels = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v2_2024-06-20_curatedAnno_v6.csv', index_col = 0, dtype = 'category')
adata.obs.drop(ct_labels.columns, axis = 1, errors = 'ignore', inplace = True)
adata.obs = adata.obs.join(ct_labels)

# Update metadata
# NOTE(review): `utils` is presumably the project module made importable by the
# sys.path inserts at the top of the notebook - confirm
from utils import get_latest_version,update_obs
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Display the object summary as the cell output
adata

## Plot Leiden clusters and previous anno

In [None]:
sc.pl.umap(adata, color = ['leiden_r2.5','taa_l5'], wspace = 0.5, legend_fontsize=4, ncols = 1, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

## Plot marker expression

In [None]:
# Scale every cell to a total of 10,000 counts, then apply log1p, both on `adata` itself
# NOTE(review): presumably adata.X still holds raw counts at this point - confirm
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### General markers

In [None]:
import pickle

# Read the low-granularity marker sets from the pickled file
with open(f'{general_data_path}/markers/allMarkers_lowGranularity_vk8.pkl', 'rb') as marker_file:
    all_mrkrs = pickle.load(marker_file)

# One dot plot per marker set, grouped by Leiden cluster and written to the plots folder
for set_name, set_genes in all_mrkrs.items():
    dotplot_png = f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_{object_version}_{set_name}Markers_dotplot.png'
    marker_plot = sc.pl.DotPlot(adata,
                                var_names=set_genes,
                                groupby='leiden_r2.5',
                                mean_only_expressed=True,
                                cmap = 'viridis')
    marker_plot.add_totals().savefig(dotplot_png, dpi=300, bbox_inches='tight')

### T/NK markers

In [None]:
# Load curated markers and make dictionary
ct_markers_df = pd.read_excel(f'{data_path}/curated/curatedCellMarkers_LMM.xlsx')
ct_markers = ct_markers_df.groupby('cell_label_fine')['marker_gene'].agg(list).to_dict()

import pprint
pprint.pprint(ct_markers, compact=True)

In [None]:
# Curated cell labels, bundled into the lineage groups used for the marker plots
ct_groups = {
    'dev' : ['T_DN(P)', 'T_DN(Q)', 'T_DN(early)', 'T_DP', 'T_DP(P)', 'T_DP(Q)', 'T_ETP', 'T_⍺β(entry)'],
    'cd4' : ['T_CD4', 'T_CD4_FH', 'T_CD4_h1', 'T_CD4_h17', 'T_T(agonist)', 'T_Th17like(fetal)', 'T_Treg', 'T_Treg(diff)'],
    'cd8' : ['T_CD8', 'T_CD8_CTL', 'T_CD8_mem'],
    'innate' : ['T_ɣδT','T_CD8⍺⍺(I)', 'T_CD8⍺⍺(II)', 'T_CD8⍺⍺(NKT)','T_ILC', 'T_MAIT', 'NK', 'NKT', 'NK_CD56hi', 'NK_CD56lo', 'NK_tr'],
}

# Show curated marker labels that are not part of any group above
grouped_labels = {label for members in ct_groups.values() for label in members}
[label for label in ct_markers if label not in grouped_labels]

In [None]:
# One dot plot per lineage group: curated markers of that group's labels,
# restricted to genes that exist in the object, shown per Leiden cluster
for ct, group_labels in ct_groups.items():

    filtered_dict = {}
    for label, genes in ct_markers.items():
        if label in group_labels:
            filtered_dict[label] = [g for g in genes if g in adata.var_names]

    group_plot = sc.pl.DotPlot(adata,
                               groupby = 'leiden_r2.5',
                               var_names = filtered_dict,
                               mean_only_expressed=True,
                               cmap = 'viridis')
    group_plot.add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_leidenClusters_{ct}Markers.png')

## Manual annotations

In [None]:
# Coarse label -> Leiden (r2.5) clusters; starred labels are subclustered further below
cluster_assignments = {
    #'T_Bdev': [30],
    'T_DN*': [30,18], # Subcluster
    'T_DP(P)': [3,6,7,27,34,36,35],
    'T_DP-intermediate' : [20],
    'T_DP(Q)' :[5,19,13,8,9,12,14,15,22,11,33],
    'T_DP(Q)-stressed': [24],
    'T_late*': [2,25,26,0,1,17,28,29,10,37,31,23,21,4,16,32],
}

# Clusters present in the data that no label claims yet
assigned_clusters = {cluster for members in cluster_assignments.values() for cluster in members}
leftover_clusters = [cluster for cluster in adata.obs['leiden_r2.5'].unique() if cluster not in assigned_clusters]
leftover_clusters

In [None]:
# Provisional annotation: start all-missing, then label the cells of each assigned cluster set
adata.obs['temp_anno'] = pd.NA
for label, clusters in cluster_assignments.items():
    in_clusters = adata.obs['leiden_r2.5'].isin(clusters)
    adata.obs.loc[in_clusters, 'temp_anno'] = label

# Save the UMAP coloured by the provisional labels
temp_anno_png = f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_tempAnno.png'
sc.pl.umap(adata, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(temp_anno_png, dpi = 300, bbox_inches = 'tight')

### T_DN*

In [None]:
# Subcluster T_DN*
# Work on an independent copy of the cells provisionally labelled 'T_DN*'
adata_dn = adata[adata.obs['temp_anno'].isin(['T_DN*'])].copy()

# Re-cluster at resolution 2.0, store the ids as integer categories and export them
sc.tl.leiden(adata_dn, resolution = 2.0, key_added = f"leiden_r2.0")
adata_dn.obs['leiden_r2.0'] = adata_dn.obs['leiden_r2.0'].astype(int).astype('category')
adata_dn.obs[['leiden_r2.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.csv')

# Alternative to re-running Leiden: load the clustering exported above
# leiden_dn = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0)
# adata_dn.obs = adata_dn.obs.join(leiden_dn)
# adata_dn.obs['leiden_r2.0'] = adata_dn.obs['leiden_r2.0'].astype(int).astype('category')

# QC overview of the subset (mitochondrial/ribosomal fraction, gene count, doublet score)
sc.pl.umap(adata_dn, color = ['percent_mito', 'percent_ribo', 'n_genes', 'scrublet_score'], wspace = 0.5, ncols=2, 
           return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_qc.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# Keep only cells with a scrublet score of at most 0.5; the removed cells stay in `adata`
# but are no longer part of `adata_dn`
adata_dn = adata_dn[adata_dn.obs['scrublet_score'] <= 0.5].copy()
# Save the UMAP of the remaining cells coloured by the DN-level Leiden clusters
sc.pl.umap(adata_dn, color = ['leiden_r2.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
filtered_dict = {k:list(set([g for g in v if g in adata.var_names])) for k, v in ct_markers.items() if k in ct_groups['dev']}
filtered_dict['B_dev'] = ['VPREB1', 'SPN']
sc.pl.DotPlot(adata_dn, 
            var_names = filtered_dict, 
            groupby = 'leiden_r2.0',
            mean_only_expressed=True,
            cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters_devMarkers.png')

In [None]:
# DN-level label -> Leiden (r2.0) clusters of the DN subset
dn_cluster_assignments = {
    'T_DN(late)': [13],
    'T_DN(Q)': [8,12,16,17,19,21,
                2,10,7,1,26,28],
    'T_DN(P)': [4,11,14,27,15,
                5,6,9,22],
    'T_DN(early)': [18,23],
    'T_ETP' : [20,24],
    'T_B_dev' : [0,3,25],
}

# Clusters present in the DN subset that no label claims yet
dn_assigned_clusters = {cluster for members in dn_cluster_assignments.values() for cluster in members}
dn_leftover_clusters = [cluster for cluster in adata_dn.obs['leiden_r2.0'].unique() if cluster not in dn_assigned_clusters]
dn_leftover_clusters

In [None]:
# Provisional DN annotation: reset the column, then label the cells of each cluster set
adata_dn.obs['temp_anno'] = pd.NA
for label, clusters in dn_cluster_assignments.items():
    in_clusters = adata_dn.obs['leiden_r2.0'].isin(clusters)
    adata_dn.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
dn_anno_stem = f'thyAgeing_dnSplit_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_dn, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{dn_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_dn.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{dn_anno_stem}.csv')

In [None]:
sc.pl.DotPlot(adata_dn, 
            var_names = filtered_dict, 
            groupby = 'temp_anno',
            mean_only_expressed=True,
            cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}_tempAnno_devMarkers.png')

In [None]:
adata_dn.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_dnSplit_scvi_{object_version}.zarr',
                        compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

In [None]:
# Score cell cycle
import pprint

# Curated cell-cycle gene lists; located through `data_path` like the other curated inputs
# (resolves to the same file as the previously duplicated absolute path)
cell_cycle_genes_df = pd.read_excel(f'{data_path}/curated/regev_cell_cycle_genes.xlsx', skiprows = 6, usecols=[0,1])

# One gene list per sheet column: strip stray whitespace, skip empty cells and
# keep only genes that are present in the object
cell_cycle_genes = {}
for col in cell_cycle_genes_df.columns:
    cell_cycle_genes_df[col] = cell_cycle_genes_df[col].str.strip()
    cell_cycle_genes[col] = [g for g in cell_cycle_genes_df[col].dropna() if g in adata.var_names]

pprint.pprint(cell_cycle_genes, compact=True)

# Adds the S/G2M scores and the phase call to adata_dn.obs (read by the next cell)
sc.tl.score_genes_cell_cycle(adata_dn, s_genes=cell_cycle_genes['G1/S'], g2m_genes=cell_cycle_genes['G2/M'])

In [None]:
# Per-cell cell-cycle results together with the columns used for grouping
cell_cycle_score_df = adata_dn.obs[['S_score', 'G2M_score', 'donor', 'age_group', 'temp_anno', 'phase', 'study']].copy()
#cell_cycle_score_df = cell_cycle_score_df.join(knn_anno)
# Fraction of cells in each phase per (age_group, temp_anno, donor, study); phases absent from a group become 0
df = cell_cycle_score_df.groupby(['age_group', 'temp_anno', 'donor', 'study'])['phase'].value_counts(normalize = True).unstack().fillna(0).reset_index()
# Long format: one row per group and phase, with the fraction in `prop`
df = df.melt(id_vars = ['age_group', 'temp_anno', 'donor', 'study'], value_name = 'prop', var_name = 'phase')
# Fix the x-axis order of the populations and drop labels that do not occur
df['temp_anno'] = pd.Categorical(df['temp_anno'], categories = ['T_ETP','T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'T_B_dev'], ordered=True)
df['temp_anno'] = df['temp_anno'].cat.remove_unused_categories()

# One box-plot panel per phase, populations on x, coloured by age group
sns.catplot(data=df, x='temp_anno', y='prop', hue = 'age_group', hue_order = ['infant', 'paed', 'adult', 'geriatric'], kind="box", palette = 'colorblind',
            col='phase', col_order=['G1', 'S','G2M'], 
            facet_kws={'sharey': False, 'sharex': True}, 
            height=2, aspect=2, col_wrap=1)

### T_late*

In [None]:
# Subcluster T_late*
# Work on an independent copy of the cells provisionally labelled 'T_late*'
adata_late = adata[adata.obs['temp_anno'].isin(['T_late*'])].copy()

# Leiden at resolution 3.0 is not re-run here; the code that produced the CSV is kept for reference
# sc.tl.leiden(adata_late, resolution = 3.0, key_added = f"leiden_r3.0")
# adata_late.obs['leiden_r3.0'] = adata_late.obs['leiden_r3.0'].astype(int).astype('category')
# adata_late.obs[['leiden_r3.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}_leidenClusters.csv')

# Load the previously exported clustering and attach it as integer categories
leiden_late = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0)
adata_late.obs = adata_late.obs.join(leiden_late)
adata_late.obs['leiden_r3.0'] = adata_late.obs['leiden_r3.0'].astype(int).astype('category')

sc.pl.umap(adata_late, color = ['leiden_r3.0'], wspace = 0.5, )

In [None]:
# String copy of the cluster ids so that a cluster can be addressed by name ('36')
adata_late.obs['leiden_group'] = adata_late.obs['leiden_r3.0'].astype(str)
# Rank genes for cluster 36 against all remaining cells. The selector of
# `rank_genes_groups` is `groups` (a sequence of group names); the former `group=`
# keyword is not a documented selector, so the restriction to cluster 36 was not applied.
sc.tl.rank_genes_groups(adata_late, groupby = 'leiden_group', method = 'wilcoxon', n_genes = 30, groups = ['36'])
# Show the ranked genes of cluster 36 as the cell output
sc.get.rank_genes_groups_df(adata_late, group = '36')

In [None]:
sc.tl.umap(adata_late)
sc.pl.umap(adata_late, color = ['leiden_r3.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
for ct,l in ct_groups.items():
    
    filtered_dict = {k:list(set([g for g in v if g in adata_late.var_names])) for k, v in ct_markers.items() if k in l}
    sc.pl.DotPlot(adata_late, 
                  var_names = filtered_dict, 
                  groupby = 'leiden_r3.0',
                  mean_only_expressed=True,
                  cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}_leidenClusters_{ct}Markers.png')

In [None]:
# Late-compartment label -> Leiden (r3.0) clusters; starred labels are refined further below
late_cluster_assignments = {'T_CD8_mem_recirc': [0],
                            'T_CD8_Treg': [42],
                          'Innate*': [21,31,30,15,35,4,37],
                          'T_CD8*': [1,2,27,28,34,41,40,25],
                          'T_CD4*': [15,6,7,8,20,32,18,17,33,9,29,26,36,39,44,12,10,14,16,24,38],
                          'T_αβT(entry)': [46,45,5,13,11,19,22,23,3,43],
                          'remove' : []}

# A cluster listed under several labels is silently resolved by dict order when the labels
# are applied (the last label wins), so report such conflicts explicitly.
# NOTE(review): cluster 15 is currently listed under both 'Innate*' and 'T_CD4*' - confirm which one is intended.
import warnings
late_cluster_labels = {}
for late_label, late_clusters in late_cluster_assignments.items():
    for late_cluster in set(late_clusters):
        late_cluster_labels.setdefault(late_cluster, []).append(late_label)
late_conflicting_clusters = {c: labels for c, labels in late_cluster_labels.items() if len(labels) > 1}
if late_conflicting_clusters:
    warnings.warn(f'Clusters assigned to more than one label: {late_conflicting_clusters}')

# Clusters present in the late subset that no label claims yet
late_leftover_clusters = [c for c in adata_late.obs['leiden_r3.0'].unique() if c not in late_cluster_labels]
late_leftover_clusters

In [None]:
# Provisional late annotation: reset the column, then label the cells of each cluster set
adata_late.obs['temp_anno'] = pd.NA
for label, clusters in late_cluster_assignments.items():
    in_clusters = adata_late.obs['leiden_r3.0'].isin(clusters)
    adata_late.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
late_anno_stem = f'thyAgeing_lateSplit_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_late, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{late_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_late.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{late_anno_stem}.csv')

In [None]:
# Writing is disabled; kept to document how the file read below was produced
# adata_late.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}.zarr',
#                         compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

# Load the saved late-split object; this replaces the in-memory `adata_late` built in the cells above
adata_late = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_lateSplit_scvi_{object_version}.zarr')

### Innate cells

In [None]:
# Innate candidates plus the entry population, as an independent copy with its own UMAP
adata_innate = adata_late[adata_late.obs['temp_anno'].isin(['Innate*','T_αβT(entry)'])].copy()
sc.tl.umap(adata_innate)

# Re-cluster at resolution 3.0, export the ids, then store them as integer categories
sc.tl.leiden(adata_innate, resolution = 3.0, key_added = f"leiden_r3.0")
adata_innate.obs[['leiden_r3.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}_leidenClusters.csv')
adata_innate.obs['leiden_r3.0'] = adata_innate.obs['leiden_r3.0'].astype(int).astype('category')

# Alternative to re-running Leiden: load the clustering exported above
# leiden_innate = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0)
# adata_innate.obs.drop(columns = 'leiden_r3.0', inplace = True)
# adata_innate.obs = adata_innate.obs.join(leiden_innate)
# adata_innate.obs['leiden_r3.0'] = adata_innate.obs['leiden_r3.0'].astype(int).astype('category')

# Save under the innateSplit name: the former `lateSplit` filename overwrote the
# late-split Leiden figure written earlier in the notebook
sc.pl.umap(adata_innate, color = ['leiden_r3.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
filtered_dict = {k:list(set([g for g in v if g in adata_innate.var_names])) for k, v in ct_markers.items() if k in ct_groups['innate']}
filtered_dict['T_αβT(entry)'] = ct_markers['T_⍺β(entry)']
filtered_dict['T_ɣδT'].extend(['TRDC'])
sc.pl.DotPlot(adata_innate, 
                var_names = filtered_dict, 
                groupby = 'leiden_r3.0',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}_leidenClusters_innateMarkers.png')

In [None]:
# Innate-level label -> Leiden (r3.0) clusters of the innate subset
innate_cluster_assignments = {
    'NK_tr' : [3],
    'NK_CD56lo' : [],
    'T_MAIT' : [43],
    'T_CD8⍺⍺(I)' : [2],
    'T_CD8⍺⍺(II)' : [8,11,6,10],
    'T_CD8⍺⍺(entry)' : [20,19,25,26,33,41,47],
    'T_ɣδT' : [5],
    'T_age-associated': [15],
    #'T_αβT(entry)' : [27, 13, 37,  9, 50, 29, 46, 16, 21, 34, 40, 17, 35, 24, 28, 36, 42, 7,  1, 23,  4, 30, 18, 32, 38,  0, 39, 31, 48, 22, 44, 12, 14, 45, 49, 51],
}

# Clusters present in the innate subset that no label claims yet
innate_assigned_clusters = {cluster for members in innate_cluster_assignments.values() for cluster in members}
innate_leftover_clusters = [cluster for cluster in adata_innate.obs['leiden_r3.0'].unique() if cluster not in innate_assigned_clusters]
np.array(innate_leftover_clusters)

In [None]:
adata_innate[adata_innate.obs['leiden_r3.0'] == 15].obs[['donor', 'age_group', 'study', 'age']].value_counts()

In [None]:
# String copy of the cluster ids so that a cluster can be addressed by name ('15')
adata_innate.obs['leiden_group'] = adata_innate.obs['leiden_r3.0'].astype(str)
# Rank genes for cluster 15 against all remaining cells. The selector of
# `rank_genes_groups` is `groups` (a sequence of group names); the former `group=`
# keyword is not a documented selector, so the restriction to cluster 15 was not applied.
sc.tl.rank_genes_groups(adata_innate, groupby = 'leiden_group', method = 'wilcoxon', n_genes = 30, groups = ['15'])
# Show the ranked genes of cluster 15 as the cell output
sc.get.rank_genes_groups_df(adata_innate, group = '15')

In [None]:
# Provisional innate annotation: reset the column, then label the cells of each cluster set
adata_innate.obs['temp_anno'] = pd.NA
for label, clusters in innate_cluster_assignments.items():
    in_clusters = adata_innate.obs['leiden_r3.0'].isin(clusters)
    adata_innate.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
innate_anno_stem = f'thyAgeing_innateSplit_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_innate, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{innate_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_innate.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{innate_anno_stem}.csv')

In [None]:
sc.pl.DotPlot(adata_innate, 
                var_names = filtered_dict, 
                groupby = 'temp_anno',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}_tempAnno_innateMarkers.png')

In [None]:
# Writing is disabled; kept to document how the file read below was produced
# adata_innate.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}.zarr',
#                         compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

# Load the saved innate-split object from the v6 folder, i.e. the location the write above
# targets and the one every other split is read from (the path previously lacked `v6/`).
# This replaces the in-memory `adata_innate` built in the cells above.
adata_innate = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_innateSplit_scvi_{object_version}.zarr')

### CD8 T cells

In [None]:
adata_cd8 = adata_late[adata_late.obs['temp_anno'].isin(['T_CD8*','T_αβT(entry)'])].copy()
sc.tl.umap(adata_cd8)

sc.tl.leiden(adata_cd8, resolution = 2.0, key_added = f"leiden_r2.0")
adata_cd8.obs['leiden_r2.0'] = adata_cd8.obs['leiden_r2.0'].astype(int).astype('category')
adata_cd8.obs[['leiden_r2.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}_leidenClusters.csv')

# leiden_cd8 = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}_leidenClusters.csv', index_col = 0).rename(columns = {'leiden_r3.0':'leiden_r2.0'})
# adata_cd8.obs = adata_cd8.obs.join(leiden_cd8)
# adata_cd8.obs['leiden_r2.0'] = adata_cd8.obs['leiden_r2.0'].astype(int).astype('category')

sc.pl.umap(adata_cd8, color = ['leiden_r2.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
sc.pp.highly_variable_genes(adata_cd8, n_top_genes = 1000)

In [None]:
sc.tl.dendrogram(adata_cd8, groupby='leiden_r2.0', var_names = adata_cd8.var_names[adata_cd8.var.highly_variable])

In [None]:
adata_cd8.uns['dendrogram_leiden_r2.0']

In [None]:
# Curated CD8 markers (de-duplicated, restricted to genes present in the object) ...
filtered_dict = {k:list(set([g for g in v if g in adata_cd8.var_names])) for k, v in ct_markers.items() if k in ct_groups['cd8']}
# ... extended with the entry population and additional hand-picked marker sets
filtered_dict['T_αβT(entry)'] = list(set(ct_markers['T_⍺β(entry)']))
filtered_dict['T_CD8_CTL'].extend(['NKG7', 'ID2', 'ITGAE', 'EOMES', 'HLA-DRB1', 'CD69', 'IFNG', 'TNF', 'GNLY', 'GZMB', 'PRF1'])
# Label spelling fixed (was 'T_CD8_ehausted'); the key is only used as a group label in the plot
filtered_dict['T_CD8_exhausted'] = ['LAG3', 'HAVCR2', 'CTLA4', 'PDCD1']
filtered_dict['T_CD8_recirc'] = ['PRDM1', 'PTPRC']
filtered_dict['T_CD8_naive'] = ['IL7R', 'LEF1', 'FOXO1']
filtered_dict['T_CD8_prolif'] = ['MKI67']
filtered_dict['T_selection'] = ['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11']
#filtered_dict.update(extravasation_markers)
# Dot plot of all marker sets per CD8-level Leiden cluster, written to the plots folder
sc.pl.DotPlot(adata_cd8, 
                var_names = filtered_dict, 
                groupby = 'leiden_r2.0',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}_leidenClusters_cd8Markers.png')

In [None]:
# New
# CD8-level label -> Leiden (r2.0) clusters of the CD8 subset
cd8_cluster_assignments = {
   'T_CD8' : [24,22,26,5,6,27,14,28,12,30,16,19],
   'T_CD8_CM_recirc' : [23],
   'T_CD8_recirc' : [3,4,13,28],
   #'T_αβT(entry)': [ 8,  0, 15,  9, 20, 17,  7,  1,  2, 18, 11, 25],
}

# A cluster listed under several labels is silently resolved by dict order when the labels
# are applied (the last label wins), so report such conflicts explicitly.
# NOTE(review): cluster 28 is currently listed under both 'T_CD8' and 'T_CD8_recirc' - confirm which one is intended.
import warnings
cd8_cluster_labels = {}
for cd8_label, cd8_clusters in cd8_cluster_assignments.items():
    for cd8_cluster in set(cd8_clusters):
        cd8_cluster_labels.setdefault(cd8_cluster, []).append(cd8_label)
cd8_conflicting_clusters = {c: labels for c, labels in cd8_cluster_labels.items() if len(labels) > 1}
if cd8_conflicting_clusters:
    warnings.warn(f'Clusters assigned to more than one label: {cd8_conflicting_clusters}')

# Clusters present in the CD8 subset that no label claims yet
cd8_leftover_clusters = [c for c in adata_cd8.obs['leiden_r2.0'].unique() if c not in cd8_cluster_labels]
np.array(cd8_leftover_clusters)

In [None]:
# Rank genes for the 'T_CD8_CM_recirc' population against all remaining cells.
# The selector of `rank_genes_groups` is `groups` (a sequence of group names); the former
# `group=` keyword is not a documented selector, so the restriction was not applied.
# NOTE(review): 'T_CD8_CM_recirc' only appears in `temp_anno` once the CD8 temporary
# annotation (assigned in the following cell) is in place - run that assignment first.
sc.tl.rank_genes_groups(adata_cd8, groupby = 'temp_anno', method = 'wilcoxon', n_genes = 30, groups = ['T_CD8_CM_recirc'])
# Show the ranked genes of that population as the cell output
sc.get.rank_genes_groups_df(adata_cd8, group = 'T_CD8_CM_recirc')

In [None]:
# Provisional CD8 annotation: reset the column, then label the cells of each cluster set
adata_cd8.obs['temp_anno'] = pd.NA
for label, clusters in cd8_cluster_assignments.items():
    in_clusters = adata_cd8.obs['leiden_r2.0'].isin(clusters)
    adata_cd8.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
cd8_anno_stem = f'thyAgeing_cd8Split_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_cd8, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{cd8_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_cd8.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{cd8_anno_stem}.csv')

In [None]:
sc.pl.DotPlot(adata_cd8, 
                var_names = filtered_dict, 
                groupby = 'temp_anno',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}_tempAnno_cd8Markers.png')

In [None]:
# adata_cd8.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}.zarr',
#                         compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

adata_cd8 = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd8Split_scvi_{object_version}.zarr')

Extravasation of CD4 and CD8 T cells involves their migration from the bloodstream into tissues, often as part of immune surveillance or in response to inflammation. This process is mediated by specific markers that regulate adhesion, rolling, activation, and migration. Single-cell RNA sequencing (scRNA-seq) can identify key markers associated with T cell extravasation.

### Markers for Extravasation of CD4 and CD8 T Cells
Below is a list of well-known markers typically detected in single-cell transcriptomic studies:

#### **Adhesion Molecules (Important for Rolling and Firm Adhesion)**
1. **Selectins and Ligands:**
   - **SELL (CD62L)**: Encodes L-selectin, mediates initial rolling on endothelial cells.
   - **SELPLG (CD162)**: Encodes PSGL-1, binds P- and E-selectins.
   - **SELE**: Encodes E-selectin (expressed on endothelial cells; may show up in scRNA-seq if analyzing endothelial cells near T cells).

2. **Integrins:**
   - **ITGA4 (CD49d)** and **ITGB1 (CD29)**: Form VLA-4, binds VCAM-1 on endothelium.
   - **ITGAL (CD11a)** and **ITGB2 (CD18)**: Form LFA-1, binds ICAM-1 on endothelial cells.
   - **ITGA5 (CD49e)** and **ITGB1 (CD29)**: Form VLA-5, binds fibronectin.

3. **Integrin Ligands:**
   - **ICAM1**: Encodes intercellular adhesion molecule 1.
   - **VCAM1**: Encodes vascular cell adhesion molecule 1.

---

#### **Chemokine Receptors (Guide Chemotaxis)**
1. **General Chemokine Receptors:**
   - **CCR7**: Key for lymphocyte homing; binds CCL19 and CCL21.
   - **CXCR3**: Promotes T cell migration toward inflammatory chemokines CXCL9, CXCL10, and CXCL11.

In [None]:
extravasation_markers = {'Selectins' : ['SELL', 'SELPLG', 'SELE'],
                         'Integrins' : ['ITGA4', 'ITGB1', 'ITGAL', 'ITGB2', 'ITGA5', 'ICAM1', 'VCAM1'],
                         'Chemokines' : ['CCR7', 'CXCR3'],
                         'Tissue residency': ['ITGAE', 'PECAM1', 'CD69']
                         }

In [None]:
sc.pl.DotPlot(adata_cd8, 
                var_names = extravasation_markers, 
                groupby = 'leiden_r2.0',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().show()

### CD4 T cells

In [None]:
adata_cd4 = adata_late[adata_late.obs['temp_anno'].isin(['T_CD4*','T_αβT(entry)'])].copy()
sc.tl.umap(adata_cd4)

sc.tl.leiden(adata_cd4, resolution = 3.0, key_added = f"leiden_r3.0")
adata_cd4.obs[['leiden_r3.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}_leidenClusters.csv')
adata_cd4.obs['leiden_r3.0'] = adata_cd4.obs['leiden_r3.0'].astype(int).astype('category')

# leiden_cd4 = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}_leidenClusters.csv', index_col = 0)
# adata_cd4.obs.drop(columns = 'leiden_r3.0', inplace = True)
# adata_cd4.obs = adata_cd4.obs.join(leiden_cd4)
# adata_cd4.obs['leiden_r3.0'] = adata_cd4.obs['leiden_r3.0'].astype(int).astype('category')

sc.pl.umap(adata_cd4, color = ['leiden_r3.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
filtered_dict = {k:list(set([g for g in v if g in adata_cd4.var_names])) for k, v in ct_markers.items() if k in ct_groups['cd4']}
filtered_dict['T_αβT(entry)'] = ct_markers['T_⍺β(entry)']
filtered_dict['T_CD4_recirc'] = ['PRDM1', 'PTPRC', 'LAG3', 'HAVCR2', 'CTLA4', 'PDCD1']
filtered_dict['T_T(agonist)'].extend(['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11'])
filtered_dict['T_CD4_FH'].extend(['CD40LG', 'PDCD1', 'BCL6', 'IL21', 'CXCL13', 'SLAMF6', 'CD200', 'MAF'])
sc.pl.DotPlot(adata_cd4, 
                var_names = filtered_dict, 
                groupby = 'leiden_r3.0',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}_leidenClusters_cd4Markers.png')

In [None]:
# New
# CD4-level label -> Leiden (r3.0) clusters of the CD4 subset; starred labels are refined further below
cd4_cluster_assignments = {
    'T_CD4' : [6,7,9,21,25,35,37,39,8],
    'T_CD4_recirc*' : [5,23,28,33,34],
    'T_CD4_h_recirc*' : [16,31,22],
    'T_Treg' : [1,2],
    'T_Treg_recirc' : [17],
    'T_T(agonist)' : [3,11,14],
    'T_iTreg' : [30,38],
    #'T_αβT(entry)': [ 0, 36, 12, 13, 20, 15, 26,  4, 24, 10, 27]
}

# Clusters present in the CD4 subset that no label claims yet
cd4_assigned_clusters = {cluster for members in cd4_cluster_assignments.values() for cluster in members}
cd4_leftover_clusters = [cluster for cluster in adata_cd4.obs['leiden_r3.0'].unique() if cluster not in cd4_assigned_clusters]
np.array(cd4_leftover_clusters)

In [None]:
# Provisional CD4 annotation: reset the column, then label the cells of each cluster set
adata_cd4.obs['temp_anno'] = pd.NA
for label, clusters in cd4_cluster_assignments.items():
    in_clusters = adata_cd4.obs['leiden_r3.0'].isin(clusters)
    adata_cd4.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
cd4_anno_stem = f'thyAgeing_cd4Split_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_cd4, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{cd4_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_cd4.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{cd4_anno_stem}.csv')

In [None]:
sc.pl.DotPlot(adata_cd4, 
                var_names = filtered_dict, 
                groupby = 'temp_anno',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}_tempAnno_cd4Markers.png')

In [None]:
# adata_cd4.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}.zarr',
#                         compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

adata_cd4 = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4Split_scvi_{object_version}.zarr')

### CD4 recirc

In [None]:
# Cells whose CD4-level label is starred (marked for further refinement), as an independent copy.
# The asterisk is matched literally (`regex = False`); the former pattern '\*' relied on an
# invalid escape sequence in a non-raw string. Unlabelled cells (NA) are excluded.
adata_cd4_recirc = adata_cd4[adata_cd4.obs['temp_anno'].str.contains('*', regex = False, na = False)].copy()
sc.tl.umap(adata_cd4_recirc)

# Re-cluster at resolution 3.0, export the ids, then store them as integer categories
sc.tl.leiden(adata_cd4_recirc, resolution = 3.0, key_added = f"leiden_r3.0")
adata_cd4_recirc.obs[['leiden_r3.0']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters.csv')
adata_cd4_recirc.obs['leiden_r3.0'] = adata_cd4_recirc.obs['leiden_r3.0'].astype(int).astype('category')

# Alternative to re-running Leiden: load the clustering exported above
# leiden_cd4_recirc = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0)
# adata_cd4_recirc.obs.drop(columns = 'leiden_r3.0', inplace = True)
# adata_cd4_recirc.obs = adata_cd4_recirc.obs.join(leiden_cd4_recirc)
# adata_cd4_recirc.obs['leiden_r3.0'] = adata_cd4_recirc.obs['leiden_r3.0'].astype(int).astype('category')

# Save the UMAP of the subset coloured by the new Leiden clusters
sc.pl.umap(adata_cd4_recirc, color = ['leiden_r3.0'], wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
filtered_dict = {k:list(set([g for g in v if g in adata_cd4.var_names])) for k, v in ct_markers.items() if k in ct_groups['cd4']}
filtered_dict['T_αβT(entry)'] = ct_markers['T_⍺β(entry)']
filtered_dict['T_CD4_recirc'] = ['PRDM1', 'PTPRC', 'LAG3', 'HAVCR2', 'CTLA4', 'PDCD1']
filtered_dict['T_CD4_r1_recirc'] = ['LAG3', 'IL10', 'TGFB1', 'PRDM1', 'GZMA', 'IFNG','PDCD1']
filtered_dict['T_T(agonist)'].extend(['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11'])
filtered_dict['T_CD4_FH'].extend(['CD40LG', 'PDCD1', 'BCL6', 'IL21', 'CXCL13', 'SLAMF6', 'CD200', 'MAF'])
filtered_dict['T_CD4_h1'].extend(['IFNG', 'TNF', 'TBX21', 'STAT1', 'STAT4', 'CXCR3', 'IL12RB2', 'LY6E', 'GZMK'])
filtered_dict['T_CD4_h17'].extend(['IL17A', 'IL17F', 'IL22', 'IL21', 'RORC', 'STAT3', 'AHR', 'CCR6', 'IL23R', 'CCL20'])
sc.pl.DotPlot(adata_cd4_recirc, 
                var_names = filtered_dict, 
                groupby = 'leiden_r3.0',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters_cd4Markers.png')

In [None]:
# Recirculating-CD4 label -> Leiden (r3.0) clusters of the CD4 recirc subset
cd4_recirc_cluster_assignments = {
   'T_CD4_recirc' : [1,4,9,22,23,35,13,14,12,16,17,33,45,6,8],
   'T_CD4' : [0,7,20,34,38,47,28,2,18],
   'T_CD4_h1_recirc' : [3,15,39,43,44,24,25,26,29,3,31,36,37,39,40,41,42,43,48,30,46],
   'T_CD4_fh_recirc': [19,10,21,5],
   'T_CD4_r1_recirc': [32,27,11,44],
}

# A cluster listed under several labels is silently resolved by dict order when the labels
# are applied (the last label wins), so report such conflicts explicitly. Repeats inside a
# single list (3, 39 and 43 under 'T_CD4_h1_recirc') are harmless and not reported.
# NOTE(review): cluster 44 is currently listed under both 'T_CD4_h1_recirc' and
# 'T_CD4_r1_recirc' - confirm which one is intended.
import warnings
cd4_recirc_cluster_labels = {}
for recirc_label, recirc_clusters in cd4_recirc_cluster_assignments.items():
    for recirc_cluster in set(recirc_clusters):
        cd4_recirc_cluster_labels.setdefault(recirc_cluster, []).append(recirc_label)
cd4_recirc_conflicting_clusters = {c: labels for c, labels in cd4_recirc_cluster_labels.items() if len(labels) > 1}
if cd4_recirc_conflicting_clusters:
    warnings.warn(f'Clusters assigned to more than one label: {cd4_recirc_conflicting_clusters}')

# Clusters present in the subset that no label claims yet
cd4_recirc_leftover_clusters = [c for c in adata_cd4_recirc.obs['leiden_r3.0'].unique() if c not in cd4_recirc_cluster_labels]
np.array(cd4_recirc_leftover_clusters)

In [None]:
# Provisional CD4 recirc annotation: reset the column, then label the cells of each cluster set
adata_cd4_recirc.obs['temp_anno'] = pd.NA
for label, clusters in cd4_recirc_cluster_assignments.items():
    in_clusters = adata_cd4_recirc.obs['leiden_r3.0'].isin(clusters)
    adata_cd4_recirc.obs.loc[in_clusters, 'temp_anno'] = label

# Save the labelled UMAP and export the labels next to the other v6 preprocessing files
cd4_recirc_anno_stem = f'thyAgeing_cd4RecircSplit_scvi_{object_version}_tempAnno'
sc.pl.umap(adata_cd4_recirc, color = 'temp_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v6/{cd4_recirc_anno_stem}.png', dpi = 300, bbox_inches = 'tight')
adata_cd4_recirc.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/{cd4_recirc_anno_stem}.csv')

In [None]:
sc.pl.DotPlot(adata_cd4_recirc, 
                var_names = filtered_dict, 
                groupby = 'temp_anno',
                mean_only_expressed=True,
                cmap = 'viridis').add_totals().savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}_tempAnno_cd4Markers.png')

In [None]:
# adata_cd4_recirc.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}.zarr',
#                         compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

adata_cd4_recirc = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_cd4RecircSplit_scvi_{object_version}.zarr')

## Construct new annotations

In [None]:
# Start from the coarse labels of the full object (as strings) and refine them level by level
ct_annot = adata.obs[['temp_anno']].copy().astype(str)

# DN and late subsets replace the coarse label of every cell they contain
ct_annot.loc[adata_dn.obs.index, 'temp_anno'] = adata_dn.obs['temp_anno'].astype(str)
ct_annot.loc[adata_late.obs.index, 'temp_anno'] = adata_late.obs['temp_anno'].astype(str)

# Deeper subsets only overwrite cells that actually received a label there; later
# subsets take precedence over earlier ones. Filtering is done on obs directly instead
# of subsetting the whole AnnData object just to read one column.
for refined_adata in (adata_innate, adata_cd8, adata_cd4, adata_cd4_recirc):
    refined_labels = refined_adata.obs['temp_anno'].dropna().astype(str)
    ct_annot.loc[refined_labels.index, 'temp_anno'] = refined_labels

# Every label that is still starred (i.e. never refined) is collapsed into the entry
# population. The asterisk is matched literally (`regex = False`); the former pattern
# '\*' relied on an invalid escape sequence in a non-raw string.
# NOTE(review): cells removed from `adata_dn` by the scrublet filter keep their coarse
# 'T_DN*' label and are therefore also relabelled here - confirm this is intended.
ct_annot.loc[ct_annot['temp_anno'].str.contains('*', regex = False), 'temp_anno'] = 'T_αβT(entry)'

In [None]:
# Write the merged labels back to the full object (aligned on the cell index) ...
adata.obs.loc[ct_annot.index, 'final_anno'] = ct_annot['temp_anno']
# ... and store them as a categorical without unused categories
adata.obs['final_anno'] = adata.obs['final_anno'].astype('category').cat.remove_unused_categories()

In [None]:
sc.pl.umap(adata, color = 'final_anno', wspace = 0.5)

In [None]:
adata.obs[['final_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_finalAnnoExpr.csv')

In [None]:
from utils import freq_by_donor
from plotting import plot_grouped_boxplot

df = freq_by_donor(adata, sample_col = 'sample', donor_col = 'donor', summary_col=['age_group', 'final_anno'])

df.head()

In [None]:
plot_grouped_boxplot(data = df, x = 'final_anno', y = 'mean_prop', hue = 'age_group', order = df['final_anno'].unique(), hue_order = ['infant','paed','adult', 'geriatric'], x_label = 'Cell population', y_label = 'Frequency', legend_title = 'Age group', add_stats = True, format_percent = True)
#plt.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_tSplit_scvi_{object_version}_ctAnnoL4_freq.png', dpi = 300, bbox_inches = 'tight')

In [None]:
session_info.show()