# Data Preperation for TESSA evaluation

Here, we perpare our data to the format of TESSA from:

Zhang Z, Xiong D, Wang X, Liu H, Wang T. Mapping the functional landscape of T cell receptor repertoires by single-T cell transcriptomics. Nat Methods. 2021.

https://www.nature.com/articles/s41592-020-01020-3

Clone the Github repository from https://github.com/jcao89757/TESSA to the folder 'baseline'.

In [1]:
import scanpy as sc
import pandas as pd
import os
import sys

sys.path.append('..')
import tcr_embedding as tcr

# Transform data

In [2]:
path_10x_adata = '../data/10x_CD8TC/v6_supervised.h5ad'
adata = sc.read_h5ad(path_10x_adata)

adatas_per_donor = []
for i in range(1, 3):
    new_ad = adata[adata.obs['donor']== f'donor_{i}']
    new_ad.obs['binding_name'] = new_ad.obs['binding_name'].astype(str)
    new_ad = new_ad[(new_ad.obs['binding_name'].isin(tcr.constants.DONOR_SPECIFIC_ANTIGENS[str(i)]))]
    adatas_per_donor.append(new_ad)

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


Write the TCR sequences to seperate csv file for each donor:

In [3]:
for idx, adata in enumerate(adatas_per_donor):
    path_base = f'../data/tessa/10x/donor_{idx+1}/'
    if not os.path.exists(path_base):
        os.mkdir(path_base)
    
    df_tcr = adata.obs[['IR_VDJ_1_cdr3', 'binding_name', 'set']]
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_cdr3']]
    
    df_tcr = df_tcr[['cdr3', 'binding_name', 'set']]
    df_tcr.index.name = 'contig_id'
    
    df_tcr_train = df_tcr[df_tcr['set']=='train']
    df_tcr_test = df_tcr[df_tcr['set']=='test']
    
    df_tcr_train.to_csv(path_base+'tcrs_atlas.csv')
    df_tcr_test.to_csv(path_base+'tcrs_query.csv')

Write the scRNA matrix to seperate csv file for each donor:

In [4]:
for idx, adata in enumerate(adatas_per_donor):
    print(idx)
    path_base = f'../data/tessa/10x/donor_{idx+1}/'
    count_mat = adata.X.A
    df_counts = pd.DataFrame(count_mat)
    
    df_counts.index = adata.obs.index
    df_counts.index.name = ''
    df_counts.columns = adata.var.index

    df_counts_train = df_counts[adata.obs['set']=='train']
    df_counts_test = df_counts[adata.obs['set']=='test']

    
    df_counts_train = df_counts_train.transpose()
    df_counts_test = df_counts_test.transpose()
    
    df_counts_train.to_csv(path_base+'scRNA_atlas.csv')
    df_counts_test.to_csv(path_base+'scRNA_query.csv')

0
1


Create a test dataset to set everything up fast.

In [5]:
path_test = '../data/tessa/10x/test/'
if not os.path.exists(path_test):
    os.mkdir(path_test)

In [6]:
df_tcr_test = pd.read_csv('../data/tessa/10x/donor_1/tcrs_atlas.csv')
df_tcr_test = df_tcr_test.head(100)
df_tcr_test.to_csv(path_test + 'tcrs_atlas.csv', index=False)
df_tcr_test = pd.read_csv('../data/tessa/10x/donor_1/tcrs_query.csv')
df_tcr_test = df_tcr_test.head(100)
df_tcr_test.to_csv(path_test + 'tcrs_query.csv', index=False)

In [7]:
df_scRNA_test = pd.read_csv('../data/tessa/10x/donor_1/scRNA_atlas.csv', index_col=0)
columns = df_scRNA_test.columns.tolist()[:100]
df_scRNA_test = df_scRNA_test[columns]
df_scRNA_test.to_csv(path_test + 'scRNA_atlas.csv')

df_scRNA_test = pd.read_csv('../data/tessa/10x/donor_1/scRNA_query.csv', index_col=0)
columns = df_scRNA_test.columns.tolist()[:100]
df_scRNA_test = df_scRNA_test[columns]
df_scRNA_test.to_csv(path_test + 'scRNA_query.csv')

In [8]:
df_scRNA_test_atlas = pd.read_csv('../data/tessa/10x/test/scRNA_atlas.csv')
df_scRNA_test_atlas

Unnamed: 0.1,Unnamed: 0,AACTTTCAGTAACCCT-1-donor_1,ACACCCTAGGAGTTTA-1-donor_1,ACGAGGAGTCAATGTC-1-donor_1,ACGATACGTATATGGA-1-donor_1,ACGCAGCTCCCAAGAT-1-donor_1,ACGCCAGGTGCGATAG-1-donor_1,ACGCCGACAGGATTGG-1-donor_1,ACTTACTGTAAGAGAG-1-donor_1,ACTTACTTCCAGATCA-1-donor_1,...,ACCAGTAAGGTGCTTT-2-donor_1,ACCAGTAGTGTAACGG-2-donor_1,ACCGTAAAGGCTCAGA-2-donor_1,ACCGTAATCAGCACAT-2-donor_1,ACCTTTAAGCGGATCA-2-donor_1,ACGATACTCTCGCATC-2-donor_1,ACGATGTAGTAGTGCG-2-donor_1,ACTGAGTGTGCAACGA-2-donor_1,ACTTTCAGTATATGAG-2-donor_1,AGCCTAATCTCGCATC-2-donor_1
0,AL627309.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,AL669831.5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.836013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.142315
2,LINC00115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,TTLL10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,TNFRSF18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,MT-ND5,4.167793,3.994534,3.208457,2.279626,3.048978,4.124494,2.885944,3.296800,3.690040,...,4.162903,3.552438,3.390612,1.576661,3.149135,3.403953,3.064637,4.596903,3.261819,4.071049
4996,MT-ND6,2.999318,3.068062,0.991687,1.983268,0.000000,2.152293,1.228824,2.198114,1.969240,...,1.836013,2.742635,1.852897,0.823996,1.784917,1.700059,1.711862,3.369035,2.315123,2.255081
4997,MT-CYB,5.542405,5.200715,4.206710,3.260624,4.151259,5.669789,4.802877,4.291446,4.169875,...,4.314828,5.320029,4.003375,3.766453,4.218738,4.643508,3.556618,5.236459,3.494736,4.908551
4998,BX004987.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [9]:
df_tcrs_test_atlas = pd.read_csv('../data/tessa/10x/test/tcrs_atlas.csv')
df_tcrs_test_atlas

Unnamed: 0,contig_id,cdr3,binding_name,set
0,AACTTTCAGTAACCCT-1-donor_1,ASSWGGGSHYGYT,A1101_IVTDFSVIK_EBNA-3B_EBV_binder,train
1,ACACCCTAGGAGTTTA-1-donor_1,ASSLAGPAADRPYNEQF,A0301_KLGGALQAK_IE-1_CMV_binder,train
2,ACGAGGAGTCAATGTC-1-donor_1,ASSIRSAYEQY,A0201_GILGFVFTL_Flu-MP_Influenza_binder,train
3,ACGATACGTATATGGA-1-donor_1,ASSWGGGSHYGYT,A1101_IVTDFSVIK_EBNA-3B_EBV_binder,train
4,ACGCAGCTCCCAAGAT-1-donor_1,ASSPRDRERGEQY,A0301_KLGGALQAK_IE-1_CMV_binder,train
...,...,...,...,...
95,ACGATACTCTCGCATC-2-donor_1,ASSPRDRERGEQY,A0301_KLGGALQAK_IE-1_CMV_binder,train
96,ACGATGTAGTAGTGCG-2-donor_1,ASSLYSATGELF,A1101_AVFDRKSDAK_EBNA-3B_EBV_binder,train
97,ACTGAGTGTGCAACGA-2-donor_1,ASSIRSSYEQY,A0201_GILGFVFTL_Flu-MP_Influenza_binder,train
98,ACTTTCAGTATATGAG-2-donor_1,ASSWGGGSHYGYT,A1101_IVTDFSVIK_EBNA-3B_EBV_binder,train


In [10]:
df_scRNA_test_atlas = pd.read_csv('../data/tessa/10x/donor_2/scRNA_atlas.csv')
df_scRNA_test_atlas

Unnamed: 0.1,Unnamed: 0,AAACGGGTCAGTACGT-1-donor_2,AAAGCAACACCGAATT-1-donor_2,AAAGTAGCACCGAATT-1-donor_2,AACACGTCATTAACCG-1-donor_2,AACCGCGAGATGCCTT-1-donor_2,AAGACCTGTTCCACAA-1-donor_2,AAGCCGCGTTGCGTTA-1-donor_2,ACACCGGCAGGGTATG-1-donor_2,ACACTGAGTCTAGCCG-1-donor_2,...,TGACTTTCAGAGTGTG-40-donor_2,TGAGGGATCTGCTGTC-40-donor_2,TGCCCTATCATGTCCC-40-donor_2,TGCGTGGGTTCCCGAG-40-donor_2,TGGGCGTGTGAAGGCT-40-donor_2,TGTATTCTCGCCTGTT-40-donor_2,TGTGGTAGTCTCCACT-40-donor_2,TTCGAAGTCGAGGTAG-40-donor_2,TTGACTTTCGACGGAA-40-donor_2,TTTCCTCGTCCGTGAC-40-donor_2
0,AL627309.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,AL669831.5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,LINC00115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.166254,0.000000,0.000000,1.315411
3,TTLL10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,TNFRSF18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,MT-ND5,3.058599,2.061024,3.015699,3.156587,1.702135,1.837787,2.077344,1.761375,2.524212,...,4.063984,2.311840,2.270379,2.006571,3.071261,3.587930,3.593445,2.885032,3.929156,3.240110
4996,MT-ND6,0.000000,0.000000,1.766585,2.216840,1.383899,0.000000,0.000000,1.761375,1.721413,...,2.282472,0.000000,1.915261,1.146107,0.000000,1.547767,1.690075,1.228141,2.951381,1.864477
4997,MT-CYB,4.229152,3.171646,3.621170,3.735081,3.825516,3.935444,3.861886,4.226631,3.985513,...,4.438033,3.558163,4.025222,3.502214,4.336244,3.730755,3.947945,3.549855,5.014575,4.700946
4998,BX004987.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [11]:
df_tcrs_test_atlas = pd.read_csv('../data/tessa/10x/donor_2/tcrs_atlas.csv')
df_tcrs_test_atlas

Unnamed: 0,contig_id,cdr3,binding_name,set
0,AAACGGGTCAGTACGT-1-donor_2,ASSFRSSETQY,A0201_GILGFVFTL_Flu-MP_Influenza_binder,train
1,AAAGCAACACCGAATT-1-donor_2,ASSLGTSGGTGELF,B0801_RAKFKQLL_BZLF1_EBV_binder,train
2,AAAGTAGCACCGAATT-1-donor_2,ASGRLSYNEQF,B0801_RAKFKQLL_BZLF1_EBV_binder,train
3,AACACGTCATTAACCG-1-donor_2,ASSFSGNTGELF,B0801_RAKFKQLL_BZLF1_EBV_binder,train
4,AACCGCGAGATGCCTT-1-donor_2,ASSIRSSYEQY,A0201_GILGFVFTL_Flu-MP_Influenza_binder,train
...,...,...,...,...
19855,TGTATTCTCGCCTGTT-40-donor_2,ASSIRSSYEQY,A0201_GILGFVFTL_Flu-MP_Influenza_binder,train
19856,TGTGGTAGTCTCCACT-40-donor_2,ASSEGGVETQY,A0301_KLGGALQAK_IE-1_CMV_binder,train
19857,TTCGAAGTCGAGGTAG-40-donor_2,ASSFSGNTGELF,B0801_RAKFKQLL_BZLF1_EBV_binder,train
19858,TTGACTTTCGACGGAA-40-donor_2,ASSRDSSANEQF,A0301_KLGGALQAK_IE-1_CMV_binder,train
