In [1]:
%load_ext autoreload

In [2]:
import scanpy as sc
import anndata as ad
import scirpy as ir

import pandas as pd
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%autoreload 2
import sys
sys.path.append('..')

import utils.annotation as utils_annotation

In [4]:
path_results = '../../results/mvp'
path_data_cd4 = '../../data/mvp/02_mvp_annotated_cd4.h5ad'
path_data_cd8 = '../../data/mvp/02_mvp_annotated_cd8.h5ad'

In [5]:
adata_cd4 = sc.read(path_data_cd4)
adata_cd8 = sc.read(path_data_cd8)

In [6]:
adata = adata_cd4.concatenate(adata_cd8)
adata.uns = adata_cd4.uns

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [7]:
clonotype_sequence_dict = adata.obs[['clone_id', 'clonotype_sequence']].drop_duplicates()
clonotype_sequence_dict = clonotype_sequence_dict[clonotype_sequence_dict['clone_id']!='nan']
clonotype_sequence_dict.index = clonotype_sequence_dict['clone_id']
clonotype_sequence_dict = clonotype_sequence_dict['clonotype_sequence'].to_dict()

In [8]:
v_genes_dict = adata.obs[['clone_id', 'v_genes']].drop_duplicates()
v_genes_dict = v_genes_dict[v_genes_dict['clone_id']!='nan']
v_genes_dict.index = v_genes_dict['clone_id']
v_genes_dict = v_genes_dict['v_genes'].to_dict()

In [9]:
j_genes_dict = adata.obs[['clone_id', 'j_genes']].drop_duplicates()
j_genes_dict = j_genes_dict[j_genes_dict['clone_id']!='nan']
j_genes_dict.index = j_genes_dict['clone_id']
j_genes_dict = j_genes_dict['j_genes'].to_dict()

## Mean expression value and clonal expansion per donor+sample for all epitopes

In [10]:
cts_all = adata[adata.obs['binding_minerva'].isin(adata.uns['epitopes']) 
                | adata.obs['binding_10x'].isin(adata.uns['epitopes'])].obs['clone_id']

df_all = adata[adata.obs['clone_id'].isin(cts_all)].obs
df_all = df_all[~df_all['clone_id'].isna()]
df_all_full = df_all

content = ['donor', 'pool', 'clone_id', 'binding_10x', 'binding_minerva', 'clone_size_pool_ct', 'celltype'] #+ adata.uns['epitopes'].tolist()
df_all = df_all[content].copy()
df_all['Sequence'] = df_all['clone_id'].astype(str).map(clonotype_sequence_dict)
df_all['V Genes'] = df_all['clone_id'].astype(str).map(v_genes_dict)
df_all['J Genes'] = df_all['clone_id'].astype(str).map(j_genes_dict)
df_all = df_all.sort_values(['donor', 'clone_id'])
df_all = df_all.drop_duplicates().reset_index(drop=True)
df_all.head(5)

Unnamed: 0,donor,pool,clone_id,binding_10x,binding_minerva,clone_size_pool_ct,celltype,Sequence,V Genes,J Genes
0,A04,run_2_HA3,5725.0,LTDEMIAQY,LTDEMIAQY,1.0,CD8,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1
1,A04,run_2_HA3,5726.0,No binding,FPQSAPHGV,1.0,CD8,CAQSLNKLIF CASRAGFGQPQHF,TRAV29/DV5 TRBV2,TRAJ32 TRBJ1-5
2,A04,run_2_HA3,5727.0,LTDEMIAQY,LTDEMIAQY,1.0,CD8,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5
3,A04,run_2_HA3,5728.0,No binding,LTDEMIAQY,1.0,CD8,CALSEGPNTGTASKLTF CAWDPGAGNTEAFF,TRAV19 TRBV30,TRAJ44 TRBJ1-1
4,A04,run_2_HA3,5729.0,No binding,LTDEMIAQY,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1


In [11]:
dextra_counts = []
for donor, pool, clone_id, celltype in zip(df_all['donor'], df_all['pool'], df_all['clone_id'], df_all['celltype']):
    bindings = df_all_full[(df_all_full['donor']==donor) 
                           & (df_all_full['pool']==pool)
                           & (df_all_full['clone_id']==clone_id)
                          & (df_all_full['celltype']==celltype)]
    bindings = bindings[adata.uns['epitopes']].values.mean(axis=0)
    dextra_counts.append(bindings)
dextra_counts = np.vstack(dextra_counts)
dextra_counts.shape

(2859, 16)

In [12]:
df_all[adata.uns['epitopes']] = dextra_counts
df_all.to_csv(f'{path_results}/counts_all_bindings.csv')
df_all.head(5)

Unnamed: 0,donor,pool,clone_id,binding_10x,binding_minerva,clone_size_pool_ct,celltype,Sequence,V Genes,J Genes,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
0,A04,run_2_HA3,5725.0,LTDEMIAQY,LTDEMIAQY,1.0,CD8,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1,...,,,0.0,,,,,4.0,3.0,0.0
1,A04,run_2_HA3,5726.0,No binding,FPQSAPHGV,1.0,CD8,CAQSLNKLIF CASRAGFGQPQHF,TRAV29/DV5 TRBV2,TRAJ32 TRBJ1-5,...,,,0.0,,,,,5.0,9.0,1.0
2,A04,run_2_HA3,5727.0,LTDEMIAQY,LTDEMIAQY,1.0,CD8,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5,...,,,0.0,,,,,1.0,1.0,0.0
3,A04,run_2_HA3,5728.0,No binding,LTDEMIAQY,1.0,CD8,CALSEGPNTGTASKLTF CAWDPGAGNTEAFF,TRAV19 TRBV30,TRAJ44 TRBJ1-1,...,,,0.0,,,,,1.0,1.0,1.0
4,A04,run_2_HA3,5729.0,No binding,LTDEMIAQY,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1,...,,,0.078947,,,,,1.368421,0.684211,0.105263


## All dextramer counts per cell

In [13]:
df_cell_table = adata.obs[['donor', 'pool', 'clone_id', 'celltype', 'binding_10x', 'binding_minerva',
                         ] + adata.uns['epitopes'].tolist()]
df_cell_table = df_cell_table.sort_values(['donor', 'pool', 'clone_id', 'celltype'])
df_cell_table.to_csv(f'{path_results}/dextramer_counts_by_cell.csv')
df_cell_table.head()

Unnamed: 0,donor,pool,clone_id,celltype,binding_10x,binding_minerva,LTDEMIAQY,QPYRVVVL,YLQPRTFLL,RLQSLQTYV,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
AAACCTGGTAGCCTAT-1-1-1,A04,run_2_HA3,5725.0,CD8,LTDEMIAQY,LTDEMIAQY,11.0,,,,...,,,0.0,,,,,4.0,3.0,0.0
AAACCTGGTTGAGTTC-1-1-1,A04,run_2_HA3,5726.0,CD8,No binding,FPQSAPHGV,7.0,,,,...,,,0.0,,,,,5.0,9.0,1.0
AAACCTGTCCCTAACC-1-1-1,A04,run_2_HA3,5727.0,CD8,LTDEMIAQY,LTDEMIAQY,11.0,,,,...,,,0.0,,,,,1.0,1.0,0.0
AAACCTGTCGTGGTCG-1-1-1,A04,run_2_HA3,5728.0,CD8,No binding,LTDEMIAQY,6.0,,,,...,,,0.0,,,,,1.0,1.0,1.0
AAACCTGTCTGGTGTA-1-1-1,A04,run_2_HA3,5729.0,CD8,No binding,LTDEMIAQY,8.0,,,,...,,,0.0,,,,,0.0,0.0,0.0


## Sum of bindings to all per clone separated by donor and sample

In [14]:
df_clones = adata[~adata.obs['clone_id'].isna()].obs

content = ['donor', 'pool', 'clone_id', 'celltype', 'binding_10x', 'binding_minerva', 'clone_size_pool_ct']
df_clones = df_clones[content].copy()

df_clones['Sequence'] = df_clones['clone_id'].astype(str).map(clonotype_sequence_dict)
df_clones['V Genes'] = df_clones['clone_id'].astype(str).map(v_genes_dict)
df_clones['J Genes'] = df_clones['clone_id'].astype(str).map(j_genes_dict)
df_clones = df_clones.sort_values(['donor', 'pool', 'clone_id'])
df_clones.head(5)

Unnamed: 0,donor,pool,clone_id,celltype,binding_10x,binding_minerva,clone_size_pool_ct,Sequence,V Genes,J Genes
AAACCTGGTAGCCTAT-1-1-1,A04,run_2_HA3,5725.0,CD8,LTDEMIAQY,LTDEMIAQY,1.0,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1
AAACCTGGTTGAGTTC-1-1-1,A04,run_2_HA3,5726.0,CD8,No binding,FPQSAPHGV,1.0,CAQSLNKLIF CASRAGFGQPQHF,TRAV29/DV5 TRBV2,TRAJ32 TRBJ1-5
AAACCTGTCCCTAACC-1-1-1,A04,run_2_HA3,5727.0,CD8,LTDEMIAQY,LTDEMIAQY,1.0,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5
AAACCTGTCGTGGTCG-1-1-1,A04,run_2_HA3,5728.0,CD8,No binding,LTDEMIAQY,1.0,CALSEGPNTGTASKLTF CAWDPGAGNTEAFF,TRAV19 TRBV30,TRAJ44 TRBJ1-1
AAACCTGTCTGGTGTA-1-1-1,A04,run_2_HA3,5729.0,CD8,No binding,LTDEMIAQY,38.0,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1


In [15]:
df_clones['donor+pool+clone'] = df_clones['donor'].astype(str) + \
                                  '+' + df_clones['pool'].astype(str) + \
                                  '+' + df_clones['clone_id'].astype(str) + \
                                  '+' + df_clones['clone_size_pool_ct'].astype(str)

content = {
    'donor': [],
    'pool': [],
    'clone': [],
    'clone_size_pool': [],
    'total': []
}
for el in adata.uns['epitopes']:
    content[el] = []


for donor_clones in tqdm(df_clones['donor+pool+clone'].unique()):
    donor = donor_clones.split('+')[0]
    pool = donor_clones.split('+')[1]
    clone = donor_clones.split('+')[2]
    clone_size_sample = donor_clones.split('+')[3]
    
    df_tmp = df_clones[(df_clones['donor']==donor) 
                       & (df_clones['pool']==pool)
                       & (df_clones['clone_id']==str(clone))]
    content['donor'].append(donor)
    content['pool'].append(pool)
    content['clone'].append(clone)
    content['clone_size_pool'].append(clone_size_sample)
    
    total = 0
    for ep in adata.uns['epitopes']:
        count_ep = np.sum(df_tmp['binding_minerva']==ep)
        content[ep].append(count_ep)
        total += count_ep
    content['total'].append(total)
        
df_clones_out = pd.DataFrame(content)
df_clones_out.to_csv(f'{path_results}/epitope_assignment_by_clone.csv')
df_clones_out

100%|█████████████████████████████████████████████████████████████████████████████| 5392/5392 [00:23<00:00, 225.70it/s]


Unnamed: 0,donor,pool,clone,clone_size_pool,total,LTDEMIAQY,QPYRVVVL,YLQPRTFLL,RLQSLQTYV,VLNDILSRL,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
0,A04,run_2_HA3,5725.0,1.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A04,run_2_HA3,5726.0,1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,A04,run_2_HA3,5727.0,1.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A04,run_2_HA3,5728.0,1.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A04,run_2_HA3,5729.0,38.0,33,33,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5387,MVP,run_1_HA5,726.0,1.0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5388,MVP,run_1_HA5,767.0,2.0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5389,MVP,run_1_HA5,941.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,MVP,run_1_HA5,97.0,4.0,2,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0


## Table DEGs

In [16]:
for adata in [adata_cd4, adata_cd8]:
    dict_deg = {}
    for cluster in adata.obs['leiden'].unique():
        names = adata.uns['rank_genes_groups_leiden']['names'][cluster].tolist()
        scores = adata.uns['rank_genes_groups_leiden']['scores'][cluster].tolist()
        dict_deg[cluster] = list(zip(names, scores))

    df_degs = pd.DataFrame(dict_deg)
    df_degs = df_degs[sorted(df_degs.columns.tolist())]
    df_degs.columns = [f'leiden_{el}' for el in df_degs.columns]
    df_degs.to_csv(f'{path_results}/{adata.uns["celltype"]}_deg_by_leiden_cluster.csv')

In [17]:
for adata in [adata_cd4, adata_cd8]:
    dict_deg = {}
    for cluster in adata.obs['leiden'].unique():
        names = adata.uns['rank_genes_groups_leiden_cite']['names'][cluster].tolist()
        scores = adata.uns['rank_genes_groups_leiden_cite']['scores'][cluster].tolist()
        dict_deg[cluster] = list(zip(names, scores))

    df_degs = pd.DataFrame(dict_deg)
    df_degs = df_degs[sorted(df_degs.columns.tolist())]
    df_degs.columns = [f'leiden_{el}' for el in df_degs.columns]
    df_degs.to_csv(f'{path_results}/{adata.uns["celltype"]}_cite_deg_by_leiden.csv')