# Data Preperation for TESSA evaluation

Here, we perpare our data to the format of TESSA from:

Zhang Z, Xiong D, Wang X, Liu H, Wang T. Mapping the functional landscape of T cell receptor repertoires by single-T cell transcriptomics. Nat Methods. 2021.

https://www.nature.com/articles/s41592-020-01020-3

Clone the Github repository from https://github.com/jcao89757/TESSA to the folder 'baseline'.

In [1]:
import scanpy as sc
import pandas as pd
import os
import sys

sys.path.append('../mvTCR')
import tcr_embedding as tcr
from tcr_embedding.utils_preprocessing import group_shuffle_split
import config.constants_10x as const

# Transform data

In [2]:
path_10x_adata = '../mvTCR/data/10x_CD8TC/v6_supervised.h5ad'
adata = sc.read_h5ad(path_10x_adata)

adatas_per_donor = []
for i in range(1, 6):
    if i == 5:
        new_ad = adata
    else:
        new_ad = adata[adata.obs['donor']== f'donor_{i}']
    new_ad.obs['binding_name'] = new_ad.obs['binding_name'].astype(str)
    new_ad = new_ad[(new_ad.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS))]
    adatas_per_donor.append(new_ad.copy())

  new_ad.obs['binding_name'] = new_ad.obs['binding_name'].astype(str)


In [3]:
len(adatas_per_donor)

5

In [4]:
splits = {}

for j, adata_donor in enumerate(adatas_per_donor):
    splits[f'donor_{j+1}' if j < 4 else 'full'] = {}
    adata_donor.obs['group_col'] = [seq[1:-1] for seq in adata_donor.obs['IR_VDJ_1_junction_aa']]
    for i in range(0, 5):
        random_seed = i
        adata_tmp = adata_donor.copy()
        train_val, test = group_shuffle_split(adata_tmp, group_col='group_col', val_split=0.20, random_seed=random_seed)
        train, val = group_shuffle_split(train_val, group_col='group_col', val_split=0.25, random_seed=random_seed)
        
        adata_tmp.obs['set'] = 'train'
        adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
        adata_tmp.obs.loc[test.obs.index, 'set'] = 'test'
        
        adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'test'])]
        splits[f'donor_{j+1}' if j < 4 else 'full'][f'split_{i}'] = adata_tmp.copy()

Write the TCR sequences to seperate csv file for each donor:

In [5]:
for donor, ad_dict in splits.items():
    for split, adata in ad_dict.items():
        path_base = f'../mvTCR/data/tessa/10x/{donor}/'
        if not os.path.exists(path_base):
            os.mkdir(path_base)

        df_tcr = adata.obs[['IR_VDJ_1_junction_aa', 'binding_name', 'set']]
        df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]

        df_tcr = df_tcr[['cdr3', 'binding_name', 'set']]
        df_tcr.index.name = 'contig_id'

        df_tcr_train = df_tcr[df_tcr['set']=='train']
        df_tcr_test = df_tcr[df_tcr['set']=='test']

        df_tcr_train.to_csv(path_base+f'{split}_tcrs_atlas.csv')
        df_tcr_test.to_csv(path_base+f'{split}_tcrs_query.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_junction_aa']]


Write the scRNA matrix to seperate csv file for each donor:

In [6]:
for donor, ad_dict in splits.items():
    for split, adata in ad_dict.items():
        path_base = f'../mvTCR/data/tessa/10x/{donor}/'
        count_mat = adata.X.A
        df_counts = pd.DataFrame(count_mat)

        df_counts.index = adata.obs.index
        df_counts.index.name = ''
        df_counts.columns = adata.var.index

        df_counts_train = df_counts[adata.obs['set']=='train']
        df_counts_test = df_counts[adata.obs['set']=='test']


        df_counts_train = df_counts_train.transpose()
        df_counts_test = df_counts_test.transpose()

        df_counts_train.to_csv(path_base+f'{split}_scRNA_atlas.csv')
        df_counts_test.to_csv(path_base+f'{split}_scRNA_query.csv')

## Transform the Covid Dataset

In [7]:
def extract_reactivity(value):
    if 'un-reactive' in value:
        return 'un-reactive'
    if 'CD8' in value:
        return 'CD8 reactive'
    if 'CD4' in value:
        return 'CD4 reactive'

In [8]:
path_covid_adata = '../mvTCR/data/Covid/04_covid_highly_var_5000.h5ad'
adata = sc.read_h5ad(path_covid_adata)
adata.obs['reactivity'] = [extract_reactivity(x) for x in adata.obs['reactive_combined']]

In [9]:
splits_covid = {}

for i in range(0, 5):
    random_seed = i
    adata_tmp = adata.copy()
    sub, non_sub = group_shuffle_split(adata_tmp, group_col='TRB_1_cdr3', val_split=0.2, random_seed=random_seed)
    train, val = group_shuffle_split(sub, group_col='TRB_1_cdr3', val_split=0.20, random_seed=random_seed)
    adata_tmp.obs['set'] = 'train'
    adata_tmp.obs.loc[non_sub.obs.index, 'set'] = '-'
    adata_tmp.obs.loc[val.obs.index, 'set'] = 'val'
    adata_tmp = adata_tmp[adata_tmp.obs['set'].isin(['train', 'val'])]
    splits_covid[f'split_{i}'] = adata_tmp.copy()

In [10]:
path_base = f'../mvTCR/data/tessa/covid/'
if not os.path.exists(path_base):
    os.mkdir(path_base)

for split, adata in splits_covid.items():
    df_tcr = adata.obs[['TRB_1_cdr3', 'T_cells', 'reactivity' , 'set']]
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['TRB_1_cdr3']]

    df_tcr = df_tcr[['cdr3', 'T_cells', 'reactivity', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr.to_csv(path_base+f'{split}_tcrs_atlas.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['TRB_1_cdr3']]


In [11]:
path_base = f'../mvTCR/data/tessa/covid/'

for split, adata in splits_covid.items():
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)

    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts = df_counts.transpose()
    df_counts.to_csv(path_base+f'{split}_scRNA_atlas.csv')