# Data Preperation for TESSA evaluation

Here, we perpare our data to the format of TESSA from:

Zhang Z, Xiong D, Wang X, Liu H, Wang T. Mapping the functional landscape of T cell receptor repertoires by single-T cell transcriptomics. Nat Methods. 2021.

https://www.nature.com/articles/s41592-020-01020-3

Clone the Github repository from https://github.com/jcao89757/TESSA to the folder 'baseline'.

In [1]:
import scanpy as sc
import pandas as pd
import os

from tqdm import tqdm

import sys

In [2]:
sys.path.insert(0, '../mvTCR')
from tcr_embedding.utils_preprocessing import group_shuffle_split
import config.constants_10x as const

  from .autonotebook import tqdm as notebook_tqdm


## Haniffa Dataset

In [3]:
path_haniffa_adata = '../mvTCR/data/Haniffa/v3_conditional.h5ad'
adata = sc.read_h5ad(path_haniffa_adata)

In [4]:
splits_haniffa = {}

for i in tqdm(range(0, 5)):
    random_seed = i
    adata_tmp = adata.copy()
    sc.pp.subsample(adata_tmp, n_obs=20000, random_state=random_seed)
    train, val = group_shuffle_split(adata_tmp, group_col='cdr3_beta', val_split=0.20, random_seed=random_seed)

    adata_tmp.obs['set'] = 'train'
    adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
    adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'val'])]

    splits_haniffa[f'split_{i}'] = adata_tmp.copy()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.75s/it]


In [5]:
path_base = f'../mvTCR/data/tessa/Haniffa/'
if not os.path.exists(path_base):
    os.mkdir(path_base)

for split, adata in tqdm(splits_haniffa.items()):
    df_tcr = adata.obs[['cdr3_beta', 'full_clustering' , 'set']]
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['cdr3_beta']]

    df_tcr = df_tcr[['cdr3', 'full_clustering', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr.to_csv(path_base+f'{split}_tcrs_atlas.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['cdr3_beta']]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.96it/s]


In [6]:
path_base = f'../mvTCR/data/tessa/Haniffa/'

for split, adata in tqdm(splits_haniffa.items()):
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)

    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts = df_counts.transpose()
    df_counts.to_csv(path_base+f'{split}_scRNA_atlas.csv')

100%|████████████████████████████████████████████████████████████████████| 5/5 [04:28<00:00, 53.77s/it]


In [7]:
del splits_haniffa

## Borcherding Dataset

In [8]:
path_borch_adata = '../mvTCR/data/Borcherding/04_borch_annotated.h5ad'
adata = sc.read_h5ad(path_borch_adata)
adata.obs = adata.obs.replace('nan', 'Unknown')

In [9]:
splits_borch = {}

for i in tqdm(range(0, 5)):
    random_seed = i
    adata_tmp = adata.copy()
    sc.pp.subsample(adata_tmp, n_obs=20000, random_state=random_seed)
    train, val = group_shuffle_split(adata_tmp, group_col='IR_VDJ_1_junction_aa', val_split=0.20, random_seed=random_seed)

    adata_tmp.obs['set'] = 'train'
    adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
    adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'val'])]

    splits_borch[f'split_{i}'] = adata_tmp.copy()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.91s/it]


In [10]:
path_base = f'../mvTCR/data/tessa/Borcherding/'
if not os.path.exists(path_base):
    os.mkdir(path_base)

for split, adata in tqdm(splits_borch.items()):
    df_tcr = adata.obs[['IR_VDJ_1_junction_aa', 'functional.cluster' , 'set']]
    df_tcr['functional.cluster'] = df_tcr['functional.cluster'].astype(str)
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]
    

    df_tcr = df_tcr[['cdr3', 'functional.cluster', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr.to_csv(path_base+f'{split}_tcrs_atlas.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['functional.cluster'] = df_tcr['functional.cluster'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.53it/s]


In [11]:
path_base = f'../mvTCR/data/tessa/Borcherding/'

for split, adata in tqdm(splits_borch.items()):
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)

    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts = df_counts.transpose()
    df_counts.to_csv(path_base+f'{split}_scRNA_atlas.csv')

100%|████████████████████████████████████████████████████████████████████| 5/5 [04:31<00:00, 54.32s/it]


In [12]:
del splits_borch

## Minervina Data

In [13]:
path_min_adata = '../mvTCR/data/Minervina/01_annotated_data.h5ad'
adata = sc.read_h5ad(path_min_adata)

In [14]:
splits_min = {}

for i in tqdm(range(0, 5)):
    random_seed = i
    adata_tmp = adata.copy()
    train_val, test = group_shuffle_split(adata_tmp, group_col='IR_VDJ_1_junction_aa', val_split=0.20, random_seed=random_seed)
    train, val = group_shuffle_split(train_val, group_col='IR_VDJ_1_junction_aa', val_split=0.25, random_seed=random_seed)

    adata_tmp.obs['set'] = 'train'
    adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
    adata_tmp.obs.loc[test.obs.index, 'set'] = 'test'
    adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'test'])]

    splits_min[f'split_{i}'] = adata_tmp.copy()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.74it/s]


In [15]:
path_base = f'../mvTCR/data/tessa/Minervina/'
if not os.path.exists(path_base):
    os.mkdir(path_base)

for split, adata in tqdm(splits_min.items()):
    df_tcr = adata.obs[['IR_VDJ_1_junction_aa', 'epitope', 'set']]
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]

    df_tcr = df_tcr[['cdr3', 'epitope', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr_train = df_tcr[df_tcr['set']=='train']
    df_tcr_test = df_tcr[df_tcr['set']=='test']

    df_tcr_train.to_csv(path_base+f'{split}_tcrs_atlas.csv')
    df_tcr_test.to_csv(path_base+f'{split}_tcrs_query.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.53it/s]


In [16]:
path_base = f'../mvTCR/data/tessa/Minervina/'

for split, adata in tqdm(splits_min.items()):
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)

    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts_train = df_counts[adata.obs['set']=='train']
    df_counts_test = df_counts[adata.obs['set']=='test']


    df_counts_train = df_counts_train.transpose()
    df_counts_test = df_counts_test.transpose()

    df_counts_train.to_csv(path_base+f'{split}_scRNA_atlas.csv')
    df_counts_test.to_csv(path_base+f'{split}_scRNA_query.csv')

100%|████████████████████████████████████████████████████████████████████| 5/5 [01:02<00:00, 12.40s/it]


In [17]:
del splits_min

# 10x Dataset

In [18]:
path_10x_adata = '../mvTCR/data/10x_CD8TC/v6_supervised.h5ad'
adata = sc.read_h5ad(path_10x_adata)

adatas_per_donor = []
for i in range(1, 6):
    if i == 5:
        new_ad = adata
    else:
        new_ad = adata[adata.obs['donor']== f'donor_{i}']
    new_ad.obs['binding_name'] = new_ad.obs['binding_name'].astype(str)
    new_ad = new_ad[(new_ad.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS))]
    adatas_per_donor.append(new_ad.copy())

  new_ad.obs['binding_name'] = new_ad.obs['binding_name'].astype(str)


In [19]:
splits = {}

for j, adata_donor in enumerate(adatas_per_donor):
    splits[f'donor_{j+1}' if j < 4 else 'full'] = {}
    adata_donor.obs['group_col'] = [seq[1:-1] for seq in adata_donor.obs['IR_VDJ_1_junction_aa']]
    for i in tqdm(range(0, 5)):
        random_seed = i
        adata_tmp = adata_donor.copy()
        train_val, test = group_shuffle_split(adata_tmp, group_col='group_col', val_split=0.20, random_seed=random_seed)
        train, val = group_shuffle_split(train_val, group_col='group_col', val_split=0.25, random_seed=random_seed)
        
        adata_tmp.obs['set'] = 'train'
        adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
        adata_tmp.obs.loc[test.obs.index, 'set'] = 'test'
        
        adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'test'])]
        splits[f'donor_{j+1}' if j < 4 else 'full'][f'split_{i}'] = adata_tmp.copy()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.70it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.02s/it]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.10it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.30it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.06s/it]


Write the TCR sequences to seperate csv file for each donor:

In [20]:
for donor, ad_dict in splits.items():
    for split, adata in tqdm(ad_dict.items()):
        path_base = f'../mvTCR/data/tessa/10x/{donor}/'
        if not os.path.exists(path_base):
            os.mkdir(path_base)

        df_tcr = adata.obs[['IR_VDJ_1_junction_aa', 'binding_name', 'set']]
        df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]

        df_tcr = df_tcr[['cdr3', 'binding_name', 'set']]
        df_tcr.index.name = 'contig_id'

        df_tcr_train = df_tcr[df_tcr['set']=='train']
        df_tcr_test = df_tcr[df_tcr['set']=='test']

        df_tcr_train.to_csv(path_base+f'{split}_tcrs_atlas.csv')
        df_tcr_test.to_csv(path_base+f'{split}_tcrs_query.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.13it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.94it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  8.26it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.80it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.72it/s]


Write the scRNA matrix to seperate csv file for each donor:

In [21]:
for donor, ad_dict in splits.items():
    for split, adata in tqdm(ad_dict.items()):
        path_base = f'../mvTCR/data/tessa/10x/{donor}/'
        count_mat = adata.X.A
        df_counts = pd.DataFrame(count_mat)

        df_counts.index = adata.obs.index
        df_counts.index.name = ''
        df_counts.columns = adata.var.index

        df_counts_train = df_counts[adata.obs['set']=='train']
        df_counts_test = df_counts[adata.obs['set']=='test']


        df_counts_train = df_counts_train.transpose()
        df_counts_test = df_counts_test.transpose()

        df_counts_train.to_csv(path_base+f'{split}_scRNA_atlas.csv')
        df_counts_test.to_csv(path_base+f'{split}_scRNA_query.csv')

100%|████████████████████████████████████████████████████████████████████| 5/5 [01:14<00:00, 14.85s/it]
100%|████████████████████████████████████████████████████████████████████| 5/5 [04:25<00:00, 53.02s/it]
100%|████████████████████████████████████████████████████████████████████| 5/5 [03:00<00:00, 36.04s/it]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:30<00:00,  6.14s/it]
100%|███████████████████████████████████████████████████████████████████| 5/5 [16:42<00:00, 200.42s/it]


In [22]:
del splits

## Transform the Covid Dataset

In [23]:
def extract_reactivity(value):
    if 'un-reactive' in value:
        return 'un-reactive'
    if 'CD8' in value:
        return 'CD8 reactive'
    if 'CD4' in value:
        return 'CD4 reactive'

In [24]:
path_covid_adata = '../mvTCR/data/Covid/04_covid_highly_var_5000.h5ad'
adata = sc.read_h5ad(path_covid_adata)
adata.obs['reactivity'] = [extract_reactivity(x) for x in adata.obs['reactive_combined']]

In [25]:
splits_covid = {}

for i in tqdm(range(0, 5)):
    random_seed = i
    adata_tmp = adata.copy()
    sub, non_sub = group_shuffle_split(adata_tmp, group_col='TRB_1_cdr3', val_split=0.2, random_seed=random_seed)
    train, val = group_shuffle_split(sub, group_col='TRB_1_cdr3', val_split=0.20, random_seed=random_seed)
    adata_tmp.obs['set'] = 'train'
    adata_tmp.obs.loc[non_sub.obs.index, 'set'] = '-'
    adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
    adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'val'])]
    splits_covid[f'split_{i}'] = adata_tmp.copy()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.32it/s]


In [26]:
path_base = f'../mvTCR/data/tessa/covid/'
if not os.path.exists(path_base):
    os.mkdir(path_base)

for split, adata in tqdm(splits_covid.items()):
    df_tcr = adata.obs[['TRB_1_cdr3', 'T_cells', 'reactivity' , 'set']]
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['TRB_1_cdr3']]

    df_tcr = df_tcr[['cdr3', 'T_cells', 'reactivity', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr.to_csv(path_base+f'{split}_tcrs_atlas.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['TRB_1_cdr3']]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.80it/s]


In [27]:
path_base = f'../mvTCR/data/tessa/covid/'

for split, adata in tqdm(splits_covid.items()):
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)

    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts = df_counts.transpose()
    df_counts.to_csv(path_base+f'{split}_scRNA_atlas.csv')

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:54<00:00, 10.84s/it]


In [28]:
del splits_covid