In [19]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import scanpy as sc
import anndata as ad
import scirpy as ir

import pandas as pd
import numpy as np
from tqdm import tqdm

In [21]:
%autoreload 2
import sys
sys.path.append('..')

import utils.annotation as utils_annotation

In [22]:
path_results = '../../results/mvp'
path_data_cd8 = '../../data/mvp/02_mvp_annotated_cd8.h5ad'

In [23]:
binding_mode = 'binding_ct'

In [24]:
adata = sc.read(path_data_cd8)

In [25]:
clonotype_sequence_dict = adata.obs[['clone_id', 'clonotype_sequence']].drop_duplicates()
clonotype_sequence_dict = clonotype_sequence_dict[clonotype_sequence_dict['clone_id']!='nan']
clonotype_sequence_dict.index = clonotype_sequence_dict['clone_id']
clonotype_sequence_dict = clonotype_sequence_dict['clonotype_sequence'].to_dict()

In [26]:
v_genes_dict = adata.obs[['clone_id', 'v_genes']].drop_duplicates()
v_genes_dict = v_genes_dict[v_genes_dict['clone_id']!='nan']
v_genes_dict.index = v_genes_dict['clone_id']
v_genes_dict = v_genes_dict['v_genes'].to_dict()

In [27]:
j_genes_dict = adata.obs[['clone_id', 'j_genes']].drop_duplicates()
j_genes_dict = j_genes_dict[j_genes_dict['clone_id']!='nan']
j_genes_dict.index = j_genes_dict['clone_id']
j_genes_dict = j_genes_dict['j_genes'].to_dict()

## Mean expression value and clonal expansion per donor+sample for all epitopes

In [28]:
cts_all = adata[adata.obs[binding_mode].isin(adata.uns['epitopes']) 
                | adata.obs['binding_10x'].isin(adata.uns['epitopes'])].obs['clone_id']

df_all = adata[adata.obs['clone_id'].isin(cts_all)].obs
df_all = df_all[~df_all['clone_id'].isna()]
df_all_full = df_all

content = ['donor', 'pool', 'clone_id', 'binding_10x', binding_mode, 'clone_size_pool_ct', 'celltype'] #+ adata.uns['epitopes'].tolist()
df_all = df_all[content].copy()
df_all['Sequence'] = df_all['clone_id'].astype(str).map(clonotype_sequence_dict)
df_all['V Genes'] = df_all['clone_id'].astype(str).map(v_genes_dict)
df_all['J Genes'] = df_all['clone_id'].astype(str).map(j_genes_dict)
df_all = df_all.sort_values(['donor', 'clone_id'])
df_all = df_all.drop_duplicates().reset_index(drop=True)
df_all.head(5)

Unnamed: 0,donor,pool,clone_id,binding_10x,binding_ct,clone_size_pool_ct,celltype,Sequence,V Genes,J Genes
0,A04,run_2_HA3,5725.0,LTDEMIAQY,No binding,1.0,CD8,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1
1,A04,run_2_HA3,5727.0,LTDEMIAQY,No binding,1.0,CD8,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5
2,A04,run_2_HA3,5729.0,No binding,No binding,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1
3,A04,run_2_HA3,5729.0,LTDEMIAQY,No binding,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1
4,A04,run_2_HA3,5733.0,LTDEMIAQY,No binding,5.0,CD8,CAASIGNFGNEKLTF CASSPSRNTEAFF,TRAV23/DV6 TRBV4-3,TRAJ48 TRBJ1-1


In [29]:
dextra_counts = []
for donor, pool, clone_id, celltype in zip(df_all['donor'], df_all['pool'], df_all['clone_id'], df_all['celltype']):
    bindings = df_all_full[(df_all_full['donor']==donor) 
                           & (df_all_full['pool']==pool)
                           & (df_all_full['clone_id']==clone_id)
                          & (df_all_full['celltype']==celltype)]
    bindings = bindings[adata.uns['epitopes']].values.mean(axis=0)
    dextra_counts.append(bindings)
dextra_counts = np.vstack(dextra_counts)
dextra_counts.shape

(1225, 16)

In [30]:
df_all[adata.uns['epitopes']] = dextra_counts
df_all.to_csv(f'{path_results}/counts_all_bindings_{binding_mode}.csv')
df_all.head(5)

Unnamed: 0,donor,pool,clone_id,binding_10x,binding_ct,clone_size_pool_ct,celltype,Sequence,V Genes,J Genes,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
0,A04,run_2_HA3,5725.0,LTDEMIAQY,No binding,1.0,CD8,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1,...,,,0.0,,,,,4.0,3.0,0.0
1,A04,run_2_HA3,5727.0,LTDEMIAQY,No binding,1.0,CD8,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5,...,,,0.0,,,,,1.0,1.0,0.0
2,A04,run_2_HA3,5729.0,No binding,No binding,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1,...,,,0.078947,,,,,1.368421,0.684211,0.105263
3,A04,run_2_HA3,5729.0,LTDEMIAQY,No binding,38.0,CD8,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1,...,,,0.078947,,,,,1.368421,0.684211,0.105263
4,A04,run_2_HA3,5733.0,LTDEMIAQY,No binding,5.0,CD8,CAASIGNFGNEKLTF CASSPSRNTEAFF,TRAV23/DV6 TRBV4-3,TRAJ48 TRBJ1-1,...,,,0.2,,,,,2.6,0.2,0.2


## All dextramer counts per cell

In [31]:
df_cell_table = adata.obs[['donor', 'pool', 'clone_id', 'celltype', 'binding_10x',
                           binding_mode,
                         ] + adata.uns['epitopes'].tolist()]
df_cell_table = df_cell_table.sort_values(['donor', 'pool', 'clone_id', 'celltype'])
df_cell_table.to_csv(f'{path_results}/dextramer_counts_by_cell_{binding_mode}.csv')
df_cell_table.head()

Unnamed: 0,donor,pool,clone_id,celltype,binding_10x,binding_ct,LTDEMIAQY,QPYRVVVL,YLQPRTFLL,RLQSLQTYV,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
AAACCTGGTAGCCTAT-1-1,A04,run_2_HA3,5725.0,CD8,LTDEMIAQY,No binding,11.0,,,,...,,,0.0,,,,,4.0,3.0,0.0
AAACCTGGTTGAGTTC-1-1,A04,run_2_HA3,5726.0,CD8,No binding,No binding,7.0,,,,...,,,0.0,,,,,5.0,9.0,1.0
AAACCTGTCCCTAACC-1-1,A04,run_2_HA3,5727.0,CD8,LTDEMIAQY,No binding,11.0,,,,...,,,0.0,,,,,1.0,1.0,0.0
AAACCTGTCGTGGTCG-1-1,A04,run_2_HA3,5728.0,CD8,No binding,No binding,6.0,,,,...,,,0.0,,,,,1.0,1.0,1.0
AAACCTGTCTGGTGTA-1-1,A04,run_2_HA3,5729.0,CD8,No binding,No binding,8.0,,,,...,,,0.0,,,,,0.0,0.0,0.0


## Sum of bindings to all per clone separated by donor and sample

In [32]:
df_clones = adata[~adata.obs['clone_id'].isna()].obs

content = ['donor', 'pool', 'clone_id', 'celltype', 'binding_10x', 
           binding_mode, 'clone_size_pool_ct']
df_clones = df_clones[content].copy()

df_clones['Sequence'] = df_clones['clone_id'].astype(str).map(clonotype_sequence_dict)
df_clones['V Genes'] = df_clones['clone_id'].astype(str).map(v_genes_dict)
df_clones['J Genes'] = df_clones['clone_id'].astype(str).map(j_genes_dict)
df_clones = df_clones.sort_values(['donor', 'pool', 'clone_id'])
df_clones.head(5)

Unnamed: 0,donor,pool,clone_id,celltype,binding_10x,binding_ct,clone_size_pool_ct,Sequence,V Genes,J Genes
AAACCTGGTAGCCTAT-1-1,A04,run_2_HA3,5725.0,CD8,LTDEMIAQY,No binding,1.0,CAYTYKYIF CASSRREMNTEAFF,TRAV38-1 TRBV14,TRAJ40 TRBJ1-1
AAACCTGGTTGAGTTC-1-1,A04,run_2_HA3,5726.0,CD8,No binding,No binding,1.0,CAQSLNKLIF CASRAGFGQPQHF,TRAV29/DV5 TRBV2,TRAJ32 TRBJ1-5
AAACCTGTCCCTAACC-1-1,A04,run_2_HA3,5727.0,CD8,LTDEMIAQY,No binding,1.0,CAVKVGGYQKVTF CASSEVSNQPQHF,TRAV12-2 TRBV6-1,TRAJ13 TRBJ1-5
AAACCTGTCGTGGTCG-1-1,A04,run_2_HA3,5728.0,CD8,No binding,No binding,1.0,CALSEGPNTGTASKLTF CAWDPGAGNTEAFF,TRAV19 TRBV30,TRAJ44 TRBJ1-1
AAACCTGTCTGGTGTA-1-1,A04,run_2_HA3,5729.0,CD8,No binding,No binding,38.0,CAVLSPGIGARLMF CASTGIGSGNTEAFF,TRAV20 TRBV2,TRAJ31 TRBJ1-1


In [33]:
df_clones['donor+pool+clone'] = df_clones['donor'].astype(str) + \
                                  '+' + df_clones['pool'].astype(str) + \
                                  '+' + df_clones['clone_id'].astype(str) + \
                                  '+' + df_clones['clone_size_pool_ct'].astype(str)

content = {
    'donor': [],
    'pool': [],
    'clone': [],
    'clone_size_pool': [],
    'total': []
}
for el in adata.uns['epitopes']:
    content[el] = []


for donor_clones in tqdm(df_clones['donor+pool+clone'].unique()):
    donor = donor_clones.split('+')[0]
    pool = donor_clones.split('+')[1]
    clone = donor_clones.split('+')[2]
    clone_size_sample = donor_clones.split('+')[3]
    
    df_tmp = df_clones[(df_clones['donor']==donor) 
                       & (df_clones['pool']==pool)
                       & (df_clones['clone_id']==str(clone))]
    content['donor'].append(donor)
    content['pool'].append(pool)
    content['clone'].append(clone)
    content['clone_size_pool'].append(clone_size_sample)
    
    total = 0
    for ep in adata.uns['epitopes']:
        count_ep = np.sum(df_tmp['binding_10x']==ep)
        content[ep].append(count_ep)
        total += count_ep
    content['total'].append(total)
        
df_clones_out = pd.DataFrame(content)
df_clones_out.to_csv(f'{path_results}/epitope_assignment_by_clone_10x.csv')
df_clones_out

100%|█████████████████████████████████████████████████████████████████████████████| 2787/2787 [00:11<00:00, 237.59it/s]


Unnamed: 0,donor,pool,clone,clone_size_pool,total,LTDEMIAQY,QPYRVVVL,YLQPRTFLL,RLQSLQTYV,VLNDILSRL,...,YTNSFTRGVY,NYNYLYRLF,TFEYVSQPFLMDLE,ATDSLNNEY,CTELKLSDY,FLRGRAYGL,RAKFKQLL,SPRRARSVA,FPQSAPHGV,IYKTPPIKDF
0,A04,run_2_HA3,5725.0,1.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A04,run_2_HA3,5726.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A04,run_2_HA3,5727.0,1.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A04,run_2_HA3,5728.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A04,run_2_HA3,5729.0,38.0,6,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2782,MVP,run_1_HA5,4550.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2783,MVP,run_1_HA5,4986.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2784,MVP,run_1_HA5,5224.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2785,MVP,run_1_HA5,5698.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Table DEGs

In [34]:
dict_deg = {}
for cluster in adata.obs['leiden'].unique():
    names = adata.uns['rank_genes_groups_leiden']['names'][cluster].tolist()
    scores = adata.uns['rank_genes_groups_leiden']['scores'][cluster].tolist()
    dict_deg[cluster] = list(zip(names, scores))

df_degs = pd.DataFrame(dict_deg)
df_degs = df_degs[sorted(df_degs.columns.tolist())]
df_degs.columns = [f'leiden_{el}' for el in df_degs.columns]
df_degs.to_csv(f'{path_results}/{adata.uns["celltype"]}_deg_by_leiden_cluster_8clusters.csv')

In [35]:
dict_deg = {}
for cluster in adata.obs['leiden'].unique():
    names = adata.uns['rank_genes_groups_leiden_cite']['names'][cluster].tolist()
    scores = adata.uns['rank_genes_groups_leiden_cite']['scores'][cluster].tolist()
    dict_deg[cluster] = list(zip(names, scores))

df_degs = pd.DataFrame(dict_deg)
df_degs = df_degs[sorted(df_degs.columns.tolist())]
df_degs.columns = [f'leiden_{el}' for el in df_degs.columns]
df_degs.to_csv(f'{path_results}/{adata.uns["celltype"]}_cite_deg_by_leiden_8clusters.csv')

## Binding mode table

In [36]:
df_tmp = adata[adata.obs[binding_mode]!='No binding'].obs.copy()
df_tmp = df_tmp[['clone_id', binding_mode, 'clone_size', 
                 'donor', 'clone_size_donor_ct', binding_mode]]
df_tmp.index = df_tmp['clone_id']
df_tmp = df_tmp.drop(columns=['clone_id'])
df_tmp.to_csv(f'{path_results}/specific_clones_by_{binding_mode}.csv')
df_tmp.head()

Unnamed: 0_level_0,binding_ct,clone_size,donor,clone_size_donor_ct,binding_ct
clone_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3.0,RAKFKQLL,10.0,A07,10.0,RAKFKQLL
9.0,CTELKLSDY,1.0,A07,1.0,CTELKLSDY
24.0,CTELKLSDY,8.0,A07,8.0,CTELKLSDY
3.0,RAKFKQLL,10.0,A07,10.0,RAKFKQLL
41.0,LTDEMIAQY,5.0,MVP,4.0,LTDEMIAQY
