# Data Preparation for TESSA evaluation

Here, we convert our data into the input format expected by TESSA, which is described in:

Zhang Z, Xiong D, Wang X, Liu H, Wang T. Mapping the functional landscape of T cell receptor repertoires by single-T cell transcriptomics. Nat Methods. 2021.

https://www.nature.com/articles/s41592-020-01020-3

Clone the GitHub repository https://github.com/jcao89757/TESSA into the folder 'baseline'.

In [1]:
import scanpy as sc
import pandas as pd
import os
import sys

sys.path.append('..')
import tcr_embedding as tcr

# Transform data

In [2]:
# Load the annotated 10x dataset from disk.
path_10x_adata = '../data/10x_CD8TC/v6_supervised.h5ad'
adata = sc.read_h5ad(path_10x_adata)

# Build one subset per donor (donor_1 and donor_2), selected through the
# 'donor' column of `adata.obs`. Position k in the list holds donor k+1.
adatas_per_donor = [
    adata[adata.obs['donor'] == f'donor_{donor_no}']
    for donor_no in range(1, 3)
]

Write the TCR sequences to a separate CSV file for each donor:

In [29]:
# Export the TCR table of every donor as two CSV files:
#   tcrs_atlas.csv - cells whose 'set' annotation is 'train'
#   tcrs_query.csv - cells whose 'set' annotation is 'test'
# Only cells whose 'binding_name' is listed in
# tcr.constants.DONOR_SPECIFIC_ANTIGENS for that donor are kept.
#
# The loop variable is deliberately not called `adata`, so the full dataset
# loaded above is not overwritten, and the per-donor subsets are only read,
# never modified. Writing to `.obs` of an AnnData view is what triggered the
# "Trying to set attribute `.obs` of view, copying." warnings.
for idx, adata_donor in enumerate(adatas_per_donor):
    donor_no = idx + 1
    path_base = f'../data/tessa/10x/donor_{donor_no}/'
    # makedirs also creates missing parent folders and tolerates an existing
    # target; os.mkdir raised when '../data/tessa/10x/' was absent.
    os.makedirs(path_base, exist_ok=True)

    # Compare the antigen labels as plain strings.
    binding_names = adata_donor.obs['binding_name'].astype(str)
    is_donor_antigen = binding_names.isin(
        tcr.constants.DONOR_SPECIFIC_ANTIGENS[str(donor_no)]
    )

    # Work on an explicit copy so that adding a column neither touches the
    # AnnData object nor raises pandas' SettingWithCopyWarning.
    df_tcr = adata_donor.obs.loc[
        is_donor_antigen.to_numpy(),
        ['IR_VDJ_1_cdr3', 'binding_name', 'set'],
    ].copy()
    df_tcr['binding_name'] = df_tcr['binding_name'].astype(str)

    # Drop the first and the last character of every CDR3 sequence.
    # NOTE(review): presumably TESSA expects the CDR3 without its flanking
    # residues - confirm against the TESSA input specification.
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_cdr3']]

    df_tcr = df_tcr[['cdr3', 'binding_name', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr_train = df_tcr[df_tcr['set'] == 'train']
    df_tcr_test = df_tcr[df_tcr['set'] == 'test']

    df_tcr_train.to_csv(path_base + 'tcrs_atlas.csv')
    df_tcr_test.to_csv(path_base + 'tcrs_query.csv')



Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


Write the scRNA count matrix to a separate CSV file for each donor:

In [30]:
# Export the expression matrix of every donor as two CSV files:
#   scRNA_atlas.csv - cells whose 'set' annotation is 'train'
#   scRNA_query.csv - cells whose 'set' annotation is 'test'
# The tables are transposed before writing, so rows are the entries of
# `var` (genes) and columns are the entries of `obs` (cells).
#
# NOTE(review): the antigen filter used when exporting the TCR tables is not
# applied here, so these files can contain cells that are missing from the
# TCR CSVs - confirm that this is intended.
for idx, adata_donor in enumerate(adatas_per_donor):
    print(idx)
    path_base = f'../data/tessa/10x/donor_{idx+1}/'
    # Do not rely on another cell having created the folder already.
    os.makedirs(path_base, exist_ok=True)

    # `X` may be sparse or dense. `toarray()` is the documented way to
    # densify a sparse matrix; the `.A` shortcut does not exist on dense
    # arrays and is not available on every sparse container.
    expr = adata_donor.X
    count_mat = expr.toarray() if hasattr(expr, 'toarray') else expr

    # `rename` returns a new Index, so the index of `adata_donor.obs` keeps
    # its own name instead of being renamed through a shared object.
    df_counts = pd.DataFrame(
        count_mat,
        index=adata_donor.obs.index.rename(''),
        columns=adata_donor.var.index,
    )

    # Positional boolean masks: row order of df_counts equals the obs order.
    is_train = (adata_donor.obs['set'] == 'train').to_numpy()
    is_test = (adata_donor.obs['set'] == 'test').to_numpy()

    df_counts_train = df_counts.loc[is_train].transpose()
    df_counts_test = df_counts.loc[is_test].transpose()

    df_counts_train.to_csv(path_base + 'scRNA_atlas.csv')
    df_counts_test.to_csv(path_base + 'scRNA_query.csv')

0
1


Create a small test dataset (the first 100 cells of donor 1) so that the TESSA setup can be checked quickly.

In [31]:
# Folder that receives the reduced test dataset.
path_test = '../data/tessa/10x/test/'
# makedirs creates missing parent folders as well and does nothing when the
# folder already exists; os.mkdir raised FileNotFoundError when
# '../data/tessa/10x/' had not been created yet.
os.makedirs(path_test, exist_ok=True)

In [32]:
# Copy the first 100 rows of donor 1's atlas and query TCR tables into the
# test folder. The files are read without an index column and written with
# index=False, so the column layout of the source files is kept.
donor_1_dir = '../data/tessa/10x/donor_1/'
for tcr_file in ('tcrs_atlas.csv', 'tcrs_query.csv'):
    df_tcr_test = pd.read_csv(donor_1_dir + tcr_file).head(100)
    df_tcr_test.to_csv(path_test + tcr_file, index=False)

In [38]:
# Copy the first 100 columns of donor 1's atlas and query expression tables
# into the test folder; the first CSV column is used as the row index.
# NOTE(review): the columns only correspond to the 100 rows kept in the TCR
# test tables if both exports list the same cells in the same order - verify.
donor_1_dir = '../data/tessa/10x/donor_1/'
for rna_file in ('scRNA_atlas.csv', 'scRNA_query.csv'):
    full_counts = pd.read_csv(donor_1_dir + rna_file, index_col=0)
    df_scRNA_test = full_counts.iloc[:, :100]
    df_scRNA_test.to_csv(path_test + rna_file)