In [1]:
import scanpy as sc

# Load data

In [2]:
# Read the annotated h5ad file and keep only its `.obs` table;
# the rest of the loaded object is not needed below and is released again.
path_data = '../data/raw/02_dex_annotated_cd8.h5ad'
adata = sc.read(path_data)
df_cells = adata.obs
del adata

# Filter out non-binders

In [3]:
# Report the size before and after dropping every row labelled 'No binding'.
print(f'All cells: {len(df_cells)}')
mask_binding = df_cells['binding_ct'] != 'No binding'
df_cells = df_cells.loc[mask_binding].copy()
print(f'Binding cells: {len(df_cells)}')

All cells: 53907
Binding cells: 1636


# Extract TCR Information

In [4]:
# Keep only the columns needed for the TCR table. `.copy()` detaches the subset
# from the frame it was sliced from, so the column assignments below write to an
# independent frame (otherwise pandas can emit SettingWithCopyWarning).
df_cells = df_cells[['clone_id', 'clonotype_sequence', 'v_genes', 'j_genes', 'binding_ct']].copy()

# Each source column is split on a single space: entry 0 is stored under the
# VJ chain and entry 1 under the VDJ chain.
for i, chain in enumerate(['VJ', 'VDJ']):
    for col_old, col_new in [('clonotype_sequence', 'junction_aa'), ('v_genes', 'v_call'), ('j_genes', 'j_call')]:
        df_cells[f'IR_{chain}_1_{col_new}'] = df_cells[col_old].str.split(' ').str[i]

df_cells = df_cells.rename(columns={'binding_ct': 'epitope'})

# No C- or D-gene calls are derived in this notebook; the columns are created
# empty so the output table still contains them.
for chain in ['VJ', 'VDJ']:
    for gene in ['c', 'd']:
        df_cells[f'IR_{chain}_1_{gene}_call'] = None

# Final column order of the exported table.
df_cells = df_cells[['IR_VJ_1_junction_aa', 'IR_VDJ_1_junction_aa',
                    'IR_VJ_1_c_call', 'IR_VDJ_1_c_call',
                    'IR_VJ_1_v_call', 'IR_VDJ_1_v_call',
                    'IR_VJ_1_d_call', 'IR_VDJ_1_d_call',
                    'IR_VJ_1_j_call', 'IR_VDJ_1_j_call',
                    'epitope', 'clone_id']]
df_cells.head(5)

Unnamed: 0,IR_VJ_1_junction_aa,IR_VDJ_1_junction_aa,IR_VJ_1_c_call,IR_VDJ_1_c_call,IR_VJ_1_v_call,IR_VDJ_1_v_call,IR_VJ_1_d_call,IR_VDJ_1_d_call,IR_VJ_1_j_call,IR_VDJ_1_j_call,epitope,clone_id
AACTCAGCACCCAGTG-1-initial-0-0,CAASKGGGGKLIF,CSARQGRWEQYF,,,TRAV13-1,TRBV20-1,,,TRAJ23,TRBJ2-7,QYIKWPWYI,20.0
AACTCAGTCAGGTAAA-1-initial-0-0,CGTPINSGYALNF,CASSVGSLPTNEKLFF,,,TRAV30,TRBV9,,,TRAJ41,TRBJ1-4,KCYGVSPTK,22.0
AACTCAGTCATTATCC-1-initial-0-0,CIVRVVNQAGTALIF,CASSISGSRGEQFF,,,TRAV26-1,TRBV19,,,TRAJ15,TRBJ2-1,KCYGVSPTK,23.0
AACTGGTAGATGTGTA-1-initial-0-0,CAVNIDDKIIF,CASSPDIEQFF,,,TRAV12-2,TRBV7-9,,,TRAJ30,TRBJ2-1,YLQPRTFLL,28.0
AAGGAGCGTACAGTGG-1-initial-0-0,CVVGTNNAGNMLTF,CASSPETGVGNQPQHF,,,TRAV10,TRBV4-2,,,TRAJ39,TRBJ1-5,KCYGVSPTK,34.0


# Remove cross-reactive clones

In [5]:
# A clone is treated as cross-reactive when its rows carry more than one
# distinct epitope label.
n_epitopes_per_clone = df_cells.groupby('clone_id')['epitope'].nunique()
crossreactive_clones = n_epitopes_per_clone[n_epitopes_per_clone.gt(1)].index.tolist()

print(f'With cross-reactive clones: {len(df_cells)}')
is_crossreactive = df_cells['clone_id'].isin(crossreactive_clones)
df_cells = df_cells[~is_crossreactive]
print(f'WO cross-reactive clones: {len(df_cells)}')

With cross-reactive clones: 1636
WO cross-reactive clones: 1636


# Reduce to clones

In [6]:
# Collapse the per-cell table to unique rows (all columns, including `clone_id`,
# take part in the duplicate check).
print(f'Amount cells: {len(df_cells)}')
df_clones = df_cells.drop_duplicates()
print(f'Amount clones: {len(df_clones)}')

# Clones per epitope. Labels with a count of zero are hidden: a categorical
# column still lists categories that were filtered out earlier (e.g. the
# removed 'No binding' label), which would otherwise show up as "0" rows.
epitope_counts = df_clones['epitope'].value_counts()
epitope_counts[epitope_counts > 0]

Amount cells: 1636
Amount clones: 667


LTDEMIAQY     131
KCYGVSPTK     130
QYIKWPWYI     124
NYNYLYRLF     121
YLQPRTFLL      70
SPRRARSVA      31
CTELKLSDY      14
YTNSFTRGVY     13
RAKFKQLL       11
FPQSAPHGV       9
RLQSLQTYV       7
FLRGRAYGL       3
QPYRVVVL        2
VLNDILSRL       1
No binding      0
Name: epitope, dtype: int64

# Save data

In [7]:
# Persist the clone-level table as CSV.
path_out = '../data/02_covid_dextramer_clones.csv'
df_clones.to_csv(path_out)

In [8]:
# Number of distinct clone ids in the saved clone table.
n_unique_clones = df_clones['clone_id'].nunique()
n_unique_clones

667