This script extracts the epitope-binding assignment of each TCR from the 10x Application Note "A new way of exploring immunity" (https://pages.10xgenomics.com/rs/446-PBO-704/images/10x_AN047_IP_A_New_Way_of_Exploring_Immunity_Digital.pdf)

Required packages that are not included in the yaml environment:
- scanpy
- scirpy


In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
from tqdm import tqdm

import scanpy as sc
import scirpy as ir

In [2]:
# Root directory of the 10x data; the per-donor input files are read from
# 'patient_<i>/' sub-directories below it.
path_base = '10x/'

In [3]:
# Build one annotated TCR table per donor (donors 1-4) and stack them.
dfs = []
for donor_id in range(1, 5):
    donor_dir = path_base + f'patient_{donor_id}/'
    file_stem = f'vdj_v1_hs_aggregated_donor{donor_id}'

    # TCR data: per-cell receptor annotation as parsed by scirpy
    path_anno = donor_dir + file_stem + '_all_contig_annotations.csv'
    df_tcr = ir.io.read_10x_vdj(path_anno).obs

    # Binding data: one row per barcode
    path_binding = donor_dir + file_stem + '_binarized_matrix.csv'
    binarized_matrix = pd.read_csv(path_binding, sep=',', header=0)

    df_tcr['Donor'] = f'Patient_{donor_id}'

    # Copy every column of the binding matrix onto the TCR table, addressing
    # rows by barcode. Each column is first filled with None so that cells
    # absent from the binding matrix keep an explicit None.
    barcodes = binarized_matrix['barcode'].values
    for column in binarized_matrix.columns:
        df_tcr[column] = None
        df_tcr.loc[barcodes, column] = binarized_matrix[column].values

    dfs.append(df_tcr)

df_binding = pd.concat(dfs)

... storing 'IR_VJ_1_cdr3' as categorical
... storing 'IR_VJ_2_cdr3' as categorical
... storing 'IR_VDJ_1_cdr3' as categorical
... storing 'IR_VDJ_2_cdr3' as categorical
... storing 'IR_VJ_1_cdr3_nt' as categorical
... storing 'IR_VJ_2_cdr3_nt' as categorical
... storing 'IR_VDJ_1_cdr3_nt' as categorical
... storing 'IR_VDJ_2_cdr3_nt' as categorical
... storing 'IR_VJ_1_junction_ins' as categorical
... storing 'IR_VJ_2_junction_ins' as categorical
... storing 'IR_VDJ_1_junction_ins' as categorical
... storing 'IR_VDJ_2_junction_ins' as categorical
... storing 'IR_VJ_1_cdr3' as categorical
... storing 'IR_VJ_2_cdr3' as categorical
... storing 'IR_VDJ_1_cdr3' as categorical
... storing 'IR_VDJ_2_cdr3' as categorical
... storing 'IR_VJ_1_cdr3_nt' as categorical
... storing 'IR_VJ_2_cdr3_nt' as categorical
... storing 'IR_VDJ_1_cdr3_nt' as categorical
... storing 'IR_VDJ_2_cdr3_nt' as categorical
... storing 'IR_VJ_1_junction_ins' as categorical
... storing 'IR_VJ_2_junction_ins' as catego

In [4]:
# Names of all columns whose name ends in 'binder'.
binding_cols = [col for col in df_binding.columns if col.endswith('binder')]

def extract_binding(row):
    """Return the name of the first binder column that is flagged for a row.

    The columns listed in the module-level ``binding_cols`` are checked in
    order.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a DataFrame row) indexable by the names in
        ``binding_cols``.

    Returns
    -------
    The first column name whose value is present and truthy, or ``None`` if
    no column is flagged.
    """
    for el in binding_cols:
        value = row[el]
        # Missing entries (None / NaN) are not binding events. NaN is truthy
        # in Python, so a bare truthiness test would misreport it as a hit.
        if pd.notna(value) and value:
            return el
    return None

# Run extract_binding on every row and store its result as the 'Epitope' column.
df_binding['Epitope'] = df_binding.apply(extract_binding, axis='columns')

## Filtering

In [5]:
# Remove cells flagged as multi-chain ...
not_multi_chain = df_binding['multi_chain'] != True
df_binding = df_binding.loc[not_multi_chain]
print(len(df_binding))
# ... and cells whose first VDJ CDR3 is the placeholder string 'None'
has_vdj_cdr3 = df_binding['IR_VDJ_1_cdr3'] != 'None'
df_binding = df_binding.loc[has_vdj_cdr3]
print(len(df_binding))

192904
186768


In [6]:
# Keep only the cells that have an epitope assigned
print(len(df_binding))
df_binding = df_binding[df_binding['Epitope'].notna()]
print(len(df_binding))

186768
85026


In [7]:
# Reduce the binder column name stored in 'Epitope' to its second
# '_'-separated field (the peptide sequence, judging by the values shown in
# the table preview further down).
# `assign` builds a new frame instead of writing a column into the filtered
# selection produced by the preceding cells, which avoids pandas'
# SettingWithCopyWarning.
df_binding = df_binding.assign(Epitope=df_binding['Epitope'].str.split('_').str[1])

In [8]:
# Restrict the table to the epitope, the annotation of the first VJ / VDJ
# chain, and the donor label
keep_cols = [
    'Epitope',
    'IR_VJ_1_cdr3', 'IR_VDJ_1_cdr3',
    'IR_VJ_1_v_gene', 'IR_VDJ_1_v_gene',
    'IR_VJ_1_d_gene', 'IR_VDJ_1_d_gene',
    'IR_VJ_1_j_gene', 'IR_VDJ_1_j_gene',
    'IR_VJ_1_c_gene', 'IR_VDJ_1_c_gene',
    'Donor',
]
df_binding = df_binding[keep_cols]

df_binding.head(5)

Unnamed: 0_level_0,Epitope,IR_VJ_1_cdr3,IR_VDJ_1_cdr3,IR_VJ_1_v_gene,IR_VDJ_1_v_gene,IR_VJ_1_d_gene,IR_VDJ_1_d_gene,IR_VJ_1_j_gene,IR_VDJ_1_j_gene,IR_VJ_1_c_gene,IR_VDJ_1_c_gene,Donor
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACCTGAGATTACCC-16,AVFDRKSDAK,CAVGDNFNKFYF,CASSLYSATGELFF,TRAV8-3,TRBV28,,,TRAJ21,TRBJ2-2,TRAC,TRBC2,Patient_1
AAACCTGAGGTGCACA-13,AVFDRKSDAK,CAVGDNFNKFYF,CASSLYSATGELFF,TRAV8-3,TRBV28,,,TRAJ21,TRBJ2-2,TRAC,TRBC2,Patient_1
AAACCTGAGTCTCGGC-10,KLGGALQAK,CALRTYKYIF,CASGYWKLAGGPQETQYF,TRAV19,TRBV7-2,,TRBD2,TRAJ40,TRBJ2-5,TRAC,TRBC2,Patient_1
AAACCTGAGTGGGTTG-17,GILGFVFTL,,CASTDFGSGANVLTF,,TRBV19,,,,TRBJ2-6,,TRBC2,Patient_1
AAACCTGAGTTTAGGA-39,IVTDFSVIK,,CASSWGGGSHYGYTF,,TRBV11-2,,TRBD1,,TRBJ1-2,,TRBC1,Patient_1


In [9]:
# Collapse repeated (Epitope, VDJ CDR3) pairs, keeping the first row of each
df_binding = df_binding.drop_duplicates(
    subset=['Epitope', 'IR_VDJ_1_cdr3'],
    keep='first',
)
print(len(df_binding))

18434


In [11]:
# Map the 'IR_<VJ|VDJ>_1_<field>' column names onto 'TRA_<x>' / 'TRB_<x>'.
chain_prefix = {'VJ': 'TRA', 'VDJ': 'TRB'}
field_suffix = {
    'cdr3': 'cdr3',
    'v_gene': 'V',
    'd_gene': 'D',
    'j_gene': 'J',
    'c_gene': 'C',
}
rename_dict = {
    f'IR_{locus}_1_{field}': f'{chain}_{suffix}'
    for field, suffix in field_suffix.items()
    for locus, chain in chain_prefix.items()
}
# Apply the column-name mapping; columns not listed in it keep their names.
df_binding = df_binding.rename(rename_dict, axis='columns')

In [12]:
# Discard every CDR3b sequence that occurs in more than one row
# (i.e. a CDR3b that binds to multiple epitopes)
epitope_counts = df_binding['TRB_cdr3'].value_counts()
multi_assignment = epitope_counts[epitope_counts > 1].index

print(len(df_binding))
is_ambiguous = df_binding['TRB_cdr3'].isin(multi_assignment)
df_binding = df_binding[~is_ambiguous]
print(len(df_binding))

18434
15865


## Saving

In [14]:
# Replace the existing row index with a fresh 0..n-1 integer index.
df_binding = df_binding.reset_index(drop=True)
# Write next to the input data. The path is built from path_base rather than a
# hard-coded '10x/' so that inputs and output stay together if path_base changes.
df_binding.to_csv(path_base + '10x_filtered_bindings.csv')