# B cell frequency analysis

In [None]:
# Jupyter server: http://127.0.0.1:8889/ (access token redacted; credentials should not be stored in the notebook)

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import hdf5plugin
import scFates as scf

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Add repo path to sys path (allows to access scripts and metadata from repo)
# Alternative kept for reference: derive the repo root from the working directory
# instead of hard-coding it.
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
sys.path.insert(1, repo_path) 
# Shared scripts directory; presumably where `utils`, `anno_levels` and `plotting`
# (imported further down) live -- TODO confirm.
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Autoreload custom scripts
# (IPython magics: this cell only runs inside IPython/Jupyter, not as a plain Python script)
%load_ext autoreload
%autoreload 2

# Define paths
# NOTE: plots_path and data_path end with '/', so f-strings of the form
# f'{data_path}/analyses/...' used later contain a double slash.
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

print('Dir for plots: {}'.format(plots_path))
print('Dir for data: {}'.format(data_path))

# Formatting
from matplotlib import font_manager
# scanpy figure defaults: 150 dpi on screen, 300 dpi when saving, PDF as the save format
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')
# Register the Arial font file with matplotlib before applying the project style sheet
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize

## B cell frequencies

In [None]:
# Load adata
object_version = 'v5_2025-04-03'
adata_path = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr'
# The object path carries a .zarr suffix. A zarr store on disk is a directory, which
# read_h5ad cannot open, so choose the reader from what is actually on disk. A regular
# file is still read with read_h5ad exactly as before.
if os.path.isdir(adata_path):
    adata = ad.read_zarr(adata_path)
else:
    adata = ad.read_h5ad(adata_path)

# Add new annotations to adata
# NOTE(review): the annotation table is named after object version v4_2025-02-04 while
# the object loaded above is v5_2025-04-03 -- confirm that the cell indices still match.
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno, how = 'left')
# Boolean indexing returns a view of the parent AnnData. Copy it explicitly so the
# metadata update below operates on a real object instead of relying on anndata's
# implicit view-to-copy conversion (update_obs is expected to modify .obs -- confirm).
adata = adata[adata.obs['anno_status'] == 'include'].copy()

# Update metadata
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Define columns
col_cell_type_broad = 'taa_l4'
col_cell_type_fine = 'taa_l5'
col_age_group = 'age_group'

# Labels that actually occur in the data. Computed once per column: the original
# comprehensions re-ran .unique().tolist() over all cells for every candidate level.
observed_broad = set(adata.obs[col_cell_type_broad].unique().tolist())
observed_fine = set(adata.obs[col_cell_type_fine].unique().tolist())

# Keep the ordering returned by get_ct_levels, restricted to labels present in adata
col_cell_type_broad_levels = [c for c in get_ct_levels(col_cell_type_broad, taa_l1 = ['B']) if c in observed_broad]
col_cell_type_fine_levels = [c for c in get_ct_levels(col_cell_type_fine, taa_l1 = ['B']) if c in observed_fine]

# Resolve the '<column>_levels' variable (e.g. age_group_levels) by name. A namespace
# lookup does the same job as eval() without evaluating an arbitrary expression.
col_age_group_levels = globals()[f'{col_age_group}_levels']

In [None]:
# Display the ordered broad-level labels (last expression of the cell, so the notebook echoes it)
np.array(col_cell_type_broad_levels)

In [None]:
# Annotation table: one row per cell in adata.obs, restricted to the metadata columns
# used downstream, without cells that lack a broad-level annotation
anno_cols = [
    'sample', 'donor', 'sex', 'sort', 'study',
    col_age_group, 'age_months',
    col_cell_type_broad, col_cell_type_fine,
]
anno_df = adata.obs.loc[:, anno_cols].dropna(subset = [col_cell_type_broad]).copy()

anno_df.head()

In [None]:
# Fine-level frequencies per age group, using only cells whose 'sort' value is 'TOT'
is_tot = anno_df['sort'].isin(['TOT'])
anno_df_sub = anno_df[is_tot].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_fine,
    add_meta = [col_age_group],
)

# Plot only the populations listed in col_cell_type_fine_levels, in that order
plot_df = df.loc[df[col_cell_type_fine].isin(col_cell_type_fine_levels)]
stats_prefix = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v10/thyAgeing_bSplit_freqFine'
plot_grouped_boxplot(
    data = plot_df,
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    order = col_cell_type_fine_levels,
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 55, width = 50),
    save_stats = stats_prefix,
)
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v10/thyAgeing_bSplit_freqFine_boxplot_pub.pdf')

In [None]:
# Broad-level frequencies per age group, using only cells whose 'sort' value is 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(anno_df_sub, sample_col = 'sample', donor_col = 'donor',
                   summary_col = col_cell_type_broad, add_meta = [col_age_group])

# Collect the plotting options first so the call itself stays short
in_broad_levels = df[col_cell_type_broad].isin(col_cell_type_broad_levels)
boxplot_kwargs = {
    'x': col_cell_type_broad,
    'y': 'mean_prop',
    'hue': col_age_group,
    'order': col_cell_type_broad_levels,
    'hue_order': col_age_group_levels,
    'x_label': 'Cell population',
    'y_label': 'Frequency',
    'legend_title': 'Age group',
    'add_stats': True,
    'format_percent': True,
    'figsize': calc_figsize(height = 50, width = 50),
    'save_stats': f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v10/thyAgeing_bSplit_freqBroad',
}
plot_grouped_boxplot(data = df.loc[in_broad_levels], **boxplot_kwargs)
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v10/thyAgeing_bSplit_freqBroad_boxplot_pub.pdf')

In [None]:
# Fine-level frequencies of populations whose label contains 'GC' ('TOT' cells only)
tot_mask = anno_df['sort'].isin(['TOT'])
anno_df_sub = anno_df[tot_mask].copy()
df = freq_by_donor(anno_df_sub, sample_col = 'sample', donor_col = 'donor', summary_col = col_cell_type_fine, add_meta = [col_age_group])
is_gc = df[col_cell_type_fine].str.contains('GC')
df = df.loc[is_gc]

# x-axis order: the configured fine levels that survive the 'GC' filter
remaining_labels = df[col_cell_type_fine].unique().tolist()
gc_order = [c for c in col_cell_type_fine_levels if c in remaining_labels]
gc_plot_df = df.loc[df[col_cell_type_fine].isin(col_cell_type_fine_levels)]

plot_grouped_boxplot(
    data = gc_plot_df,
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    order = gc_order,
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 55, width = 50),
    save_stats = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v10/thyAgeing_gcSplit_freqFine',
    legend_kwargs = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5)},
)
plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v10/thyAgeing_gcSplit_freqFine_boxplot.pdf')

In [None]:
# Fine-level frequencies of GC and plasma populations per age group ('TOT' cells only).
# Labels containing 'locnt' are excluded.
anno_df_sub = anno_df[(anno_df['sort'].isin(['TOT']))].copy()
df = freq_by_donor(anno_df_sub, sample_col = 'sample', donor_col = 'donor', summary_col=col_cell_type_fine, add_meta = [col_age_group])
df = df.loc[(df[col_cell_type_fine].str.contains('GC|plasma')) & (~df[col_cell_type_fine].str.contains('locnt'))]

# x-axis order: the configured fine levels that remain after the filter above
present_labels = df[col_cell_type_fine].unique().tolist()
plot_order = [c for c in col_cell_type_fine_levels if c in present_labels]

# The statistics get their own 'gcPlasmaSplit' prefix, matching the figure name.
# Previously this cell wrote to 'thyAgeing_gcSplit_freqFine' and so overwrote the
# statistics saved by the GC-only analysis.
plot_grouped_boxplot(data = df.loc[df[col_cell_type_fine].isin(col_cell_type_fine_levels)], x = col_cell_type_fine, y = 'mean_prop', hue = col_age_group, order = plot_order,
                     hue_order = col_age_group_levels,
                     x_label = 'Cell population', y_label = 'Frequency', legend_title = 'Age group', add_stats = True, format_percent = True, figsize = calc_figsize(height = 55, width = 50),
                     save_stats = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v10/thyAgeing_gcPlasmaSplit_freqFine',
                     legend_kwargs = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5)})
plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v10/thyAgeing_gcPlasmaSplit_freqFine_boxplot_pub.pdf')

In [None]:
# Fine-level frequencies of populations whose label contains 'plasma' ('TOT' cells only)
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_fine,
    add_meta = [col_age_group],
)
df = df.loc[df[col_cell_type_fine].str.contains('plasma')]

# Keep the configured level ordering, dropping levels removed by the filter
labels_left = df[col_cell_type_fine].unique().tolist()
plasma_order = [level for level in col_cell_type_fine_levels if level in labels_left]
plasma_stats_prefix = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v10/thyAgeing_plasmaSplit_freqFine'
legend_placement = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5)}

plot_grouped_boxplot(
    data = df.loc[df[col_cell_type_fine].isin(col_cell_type_fine_levels)],
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    order = plasma_order,
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 55, width = 40),
    save_stats = plasma_stats_prefix,
    legend_kwargs = legend_placement,
)
plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v10/thyAgeing_plasmaSplit_freqFine_boxplot.pdf')

## Correlation of B and Tfh cells

In [None]:
from scipy.stats import shapiro

# Build the donor x cell-population frequency table in this cell. It was previously
# only created in a later cell, so running the notebook top-to-bottom raised a
# NameError on `freq_df` here.
freq_df = freq_by_donor(anno_df[(anno_df['sort'].isin(['TOT']))], sample_col='sample', donor_col='donor', summary_col=col_cell_type_fine, add_meta = [col_age_group, 'sex'])
freq_df = freq_df.loc[(freq_df[col_cell_type_fine].str.startswith('B_') | freq_df[col_cell_type_fine].str.startswith('T_'))]
freq_df = freq_df.pivot(index='donor', columns=col_cell_type_fine, values='mean_prop')

# Shapiro-Wilk test per cell population (one column of freq_df each).
# NOTE(review): if the pivot leaves missing values for donor/population pairs, they are
# passed to shapiro unchanged -- confirm whether they should be dropped or set to 0.
normality_results = {}
for column in freq_df.columns:
    stat, p_value = shapiro(freq_df[column])
    normality_results[column] = {'statistic': stat, 'p_value': p_value, 'is_normal': p_value > 0.05}

# Display results
pd.DataFrame(normality_results).T

In [None]:
import scipy.stats
from statsmodels.stats.multitest import multipletests

# Donor x cell-population table of mean proportions, restricted to populations whose
# label starts with 'B_' or 'T_' ('TOT' cells only)
tot_cells = anno_df[(anno_df['sort'].isin(['TOT']))]
freq_df = freq_by_donor(tot_cells, sample_col='sample', donor_col='donor', summary_col=col_cell_type_fine, add_meta = [col_age_group, 'sex'])
is_b_or_t = freq_df[col_cell_type_fine].str.startswith('B_') | freq_df[col_cell_type_fine].str.startswith('T_')
freq_df = freq_df.loc[is_b_or_t]
freq_df = freq_df.pivot(index='donor', columns=col_cell_type_fine, values='mean_prop')

# Spearman correlation for every ordered pair of distinct populations
# (both (a, b) and (b, a) are kept so the table pivots into a full square matrix)
feat1s, feat2s, corrs, p_values = [], [], [], []
for feat1 in freq_df.columns:
    for feat2 in freq_df.columns:
        if feat1 == feat2:
            continue
        corr, p_value = scipy.stats.spearmanr(freq_df[feat1], freq_df[feat2])
        feat1s.append(feat1)
        feat2s.append(feat2)
        corrs.append(corr)
        p_values.append(p_value)

df = pd.DataFrame({
    'Feature_1': feat1s,
    'Feature_2': feat2s,
    'Correlation': corrs,
    'p_value': p_values,
})
# Benjamini-Hochberg adjustment across all pairs; '*' marks padj < 0.05
df['padj'] = multipletests(df['p_value'], method='fdr_bh')[1]
df['is_signif'] = ['*' if padj < 0.05 else '' for padj in df['padj']]

corr_csv = os.path.join(data_path, 'analyses', 'freqAnalysis', 'all_curatedAnno_v10', 'thyAgeing_bCellFreqSpearmanCorr.csv')
df.to_csv(corr_csv, index=False)

# Plot heatmap of correlations of cell frequencies
df_corr = df.pivot(index='Feature_1', columns='Feature_2', values='Correlation')
df_pval = df.pivot(index='Feature_1', columns='Feature_2', values='is_signif')

plt.figure(figsize=(10, 8))
sns.heatmap(
    df_corr,
    cmap='PuOr_r',
    center=0,
    vmin=-1,
    vmax=1,
    annot=df_pval.to_numpy(),  # significance markers as cell text
    fmt='',
)
plt.title('Pairwise correlations cell frequencies')
plt.xlabel('Cell population')
plt.ylabel('Cell population')
heatmap_pdf = os.path.join(plots_path, 'freqAnalysis', 'all_curatedAnno_v10', 'thyAgeing_bCellFreqSpearmanCorr_heatmap.pdf')
plt.savefig(heatmap_pdf, dpi=300, bbox_inches='tight')

In [None]:
# Correlations between populations whose label contains 'GC' (Feature_1) and
# populations whose label contains 'CD4' or 'Treg' (Feature_2).
# NOTE(review): this cell was labelled "Pearson", but `df` as built in this notebook
# holds Spearman coefficients (scipy.stats.spearmanr) -- confirm whether a separate
# Pearson run is still intended.
df.loc[(df['Feature_1'].str.contains('GC')) & (df['Feature_2'].str.contains('CD4|Treg'))]

In [None]:
# Spearman correlation of frequencies of cell populations:
# rows of the pairwise table where Feature_1 contains 'GC' and Feature_2 contains 'CD4' or 'Treg'
df.loc[(df['Feature_1'].str.contains('GC')) & (df['Feature_2'].str.contains('CD4|Treg'))]

In [None]:
# Report the session's package versions via session_info
session_info.show()