# Comparison to TESSA

Here, we evaluate the performance of TESSA from:

Zhang Z, Xiong D, Wang X, Liu H, Wang T. Mapping the functional landscape of T cell receptor repertoires by single-T cell transcriptomics. Nat Methods. 2021.

https://www.nature.com/articles/s41592-020-01020-3

Clone the Github repository from https://github.com/jcao89757/TESSA to the folder 'baseline'.

In [1]:
import scanpy as sc
import pandas as pd
import os
import sys

sys.path.append('..')
import tcr_embedding as tcr

ModuleNotFoundError: No module named 'scanpy'

# Transform data

In [28]:
# Read the AnnData file and split it into one object per donor.
path_10x_adata = '../data/10x_CD8TC/v6_supervised.h5ad'
adata = sc.read_h5ad(path_10x_adata)

# Boolean indexing returns a view of `adata`. Taking an explicit copy here
# means later in-place edits of `.obs` act on an independent object instead
# of triggering the implicit "Trying to set attribute `.obs` of view, copying."
adatas_per_donor = []
for i in range(1, 3):  # donor_1 and donor_2
    new_ad = adata[adata.obs['donor'] == f'donor_{i}'].copy()
    adatas_per_donor.append(new_ad)

Write the TCR sequences to a separate CSV file for each donor:

In [29]:
# Export one TCR table per donor, split into an atlas file (set == 'train')
# and a query file (set == 'test').
for idx, donor_adata in enumerate(adatas_per_donor):
    path_base = f'../data/tessa/10x/donor_{idx+1}/'
    # makedirs also creates missing parent folders and is a no-op on re-runs.
    os.makedirs(path_base, exist_ok=True)

    # Work on a private copy of the three needed columns so that neither the
    # AnnData object nor a pandas slice is modified in place.
    df_tcr = donor_adata.obs[['IR_VDJ_1_cdr3', 'binding_name', 'set']].copy()
    df_tcr['binding_name'] = df_tcr['binding_name'].astype(str)

    # Keep only the cells whose binding is listed for this donor.
    donor_antigens = tcr.constants.DONOR_SPECIFIC_ANTIGENS[str(idx+1)]
    df_tcr = df_tcr[df_tcr['binding_name'].isin(donor_antigens)].copy()
    # NOTE(review): the scRNA export cell of this notebook does not apply this
    # antigen filter, so the TCR and expression files can list different
    # cells -- confirm that TESSA tolerates this.

    # Drop the first and the last character of every CDR3 sequence.
    df_tcr['cdr3'] = [seq[1:-1] for seq in df_tcr['IR_VDJ_1_cdr3']]

    df_tcr = df_tcr[['cdr3', 'binding_name', 'set']]
    df_tcr.index.name = 'contig_id'

    df_tcr_train = df_tcr[df_tcr['set']=='train']
    df_tcr_test = df_tcr[df_tcr['set']=='test']

    df_tcr_train.to_csv(path_base+'tcrs_atlas.csv')
    df_tcr_test.to_csv(path_base+'tcrs_query.csv')



Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


Write the scRNA matrix to a separate CSV file for each donor:

In [30]:
# Export one expression table per donor (rows: adata.var entries, columns: cells),
# split into an atlas file (set == 'train') and a query file (set == 'test').
for idx, donor_adata in enumerate(adatas_per_donor):
    print(idx)
    path_base = f'../data/tessa/10x/donor_{idx+1}/'
    # Do not rely on an earlier cell having created the folder.
    os.makedirs(path_base, exist_ok=True)

    # The `.A` shortcut of SciPy sparse matrices is deprecated; toarray() is
    # the supported spelling. An already dense X is passed through unchanged.
    expr = donor_adata.X
    count_mat = expr.toarray() if hasattr(expr, 'toarray') else expr

    # rename('') returns a new Index, so the index stored in `.obs` keeps its
    # own name instead of being renamed as a side effect.
    df_counts = pd.DataFrame(
        count_mat,
        index=donor_adata.obs.index.rename(''),
        columns=donor_adata.var.index,
    )

    is_train = donor_adata.obs['set']=='train'
    is_test = donor_adata.obs['set']=='test'

    # Transpose so that each cell becomes a column in the written file.
    df_counts_train = df_counts[is_train].transpose()
    df_counts_test = df_counts[is_test].transpose()

    df_counts_train.to_csv(path_base+'scRNA_atlas.csv')
    df_counts_test.to_csv(path_base+'scRNA_query.csv')

0
1


Create a small test dataset so that the whole pipeline can be set up and checked quickly.

In [31]:
# Folder for the reduced test dataset. makedirs creates missing parent folders
# as well and does nothing if the folder already exists.
path_test = '../data/tessa/10x/test/'
os.makedirs(path_test, exist_ok=True)

In [32]:
# Copy the first 100 rows of donor 1's atlas and query TCR tables into the
# test folder. The row index is not written (index=False).
tcr_subset_jobs = [
    ('../data/tessa/10x/donor_1/tcrs_atlas.csv', 'tcrs_atlas.csv'),
    ('../data/tessa/10x/donor_1/tcrs_query.csv', 'tcrs_query.csv'),
]
for source_csv, target_name in tcr_subset_jobs:
    df_tcr_test = pd.read_csv(source_csv).head(100)
    df_tcr_test.to_csv(path_test + target_name, index=False)

In [38]:
# Copy the first 100 columns of donor 1's atlas and query expression tables
# into the test folder.
# NOTE(review): the TCR tables were filtered by antigen before export while the
# expression tables were not, so these 100 cells are not guaranteed to be the
# same cells as the 100 TCR rows kept in the test folder -- confirm.
scrna_subset_jobs = [
    ('../data/tessa/10x/donor_1/scRNA_atlas.csv', 'scRNA_atlas.csv'),
    ('../data/tessa/10x/donor_1/scRNA_query.csv', 'scRNA_query.csv'),
]
for source_csv, target_name in scrna_subset_jobs:
    df_scRNA_full = pd.read_csv(source_csv, index_col=0)
    columns = df_scRNA_full.columns.tolist()[:100]
    df_scRNA_test = df_scRNA_full[columns]
    df_scRNA_test.to_csv(path_test + target_name)

import numpy as np

# Synthetic expression table: n rows of uniform random values, one column per
# cell of the query subset loaded above.
n = 200
# Size the matrix from the columns that are actually available, so fewer than
# 100 cells no longer causes a shape mismatch.
cell_ids = df_scRNA_test.columns[:100]
# Fixed seed: re-running the notebook writes the same file.
rng = np.random.default_rng(0)
data = rng.random((n, len(cell_ids)))

df_scRNA2 = pd.DataFrame(data=data, columns=cell_ids)
df_scRNA2.index = [f'col_{i}' for i in range(n)]
df_scRNA2.to_csv('../data/tessa/10x/test_scRNA_donor_1.csv')

## Embed Sequences using TESSA

In [8]:
import os
import pandas as pd

In [2]:
# Options for the BriseisEncoder script. Each entry is later turned into a
# `-key value` pair on the command line, in this order.
# NOTE(review): no visible cell of this notebook writes test_tcrs_donor_1.csv;
# confirm that the file exists before running.
settings_ae = dict(
    tcr='../data/tessa/10x/test_tcrs_donor_1.csv',
    model='./TESSA/BriseisEncoder/TrainedEncoder.h5',
    embeding_vectors='./TESSA/BriseisEncoder/Atchley_factors.csv',
    output_TCR='./tmp/tessa_tcr_embedding.csv',
    output_log='./tmp/tessa_log.log',
)

In [3]:
# Assemble the encoder call: the script followed by one ` -key value` pair per
# entry of settings_ae, in dictionary order.
encoder_flags = [f' -{key} {value}' for key, value in settings_ae.items()]
command_ae = 'python ./TESSA/BriseisEncoder/BriseisEncoder.py' + ''.join(encoder_flags)
command_ae

'python ./TESSA/BriseisEncoder/BriseisEncoder.py -tcr ../data/tessa/10x/test_tcrs_donor_1.csv -model ./TESSA/BriseisEncoder/TrainedEncoder.h5 -embeding_vectors ./TESSA/BriseisEncoder/Atchley_factors.csv -output_TCR ./tmp/tessa_tcr_embedding.csv -output_log ./tmp/tessa_log.log'

In [8]:
import subprocess

# Run the encoder through the shell as before, but stop with
# CalledProcessError on a non-zero exit status instead of carrying on with
# missing or stale output files. On success the displayed value is still 0.
subprocess.run(command_ae, shell=True, check=True).returncode

0

In [21]:
# Positional arguments for the TESSA R script. Only the values are used, joined
# in this order; the keys merely label each position.
settings_tessa = dict(
    code='Rscript ./TESSA/Tessa/real_data.R',
    code_base='./TESSA/Tessa',
    gene_expression='../data/tessa/10x/test_scRNA_donor_1.csv',
    tcr_ebemddings='./tmp/tessa_tcr_embedding.csv',
    tcr_file='../data/tessa/10x/test_tcrs_donor_1.csv',
    save_tessa='tmp/res/',
    different_donors='FALSE',
    predefined_b='NA',
)

In [22]:
# The R script takes positional arguments: join the setting values with single
# spaces, in dictionary order.
tessa_args = [value for value in settings_tessa.values()]
command_tessa = ' '.join(tessa_args)
command_tessa

'Rscript ./TESSA/Tessa/real_data.R ./TESSA/Tessa ../data/tessa/10x/test_scRNA_donor_1.csv ./tmp/tessa_tcr_embedding.csv ../data/tessa/10x/test_tcrs_donor_1.csv tmp/res/ FALSE NA'

In [23]:
import subprocess

# Run the TESSA R script through the shell as before, but stop with
# CalledProcessError on a non-zero exit status instead of ignoring it.
# On success the displayed value is still 0.
subprocess.run(command_tessa, shell=True, check=True).returncode

0

In [25]:
# Options for Tessa_main.py. Each entry is later turned into a `-key value`
# pair on the command line, in this order.
settings_full = dict(
    tcr='../data/tessa/10x/test_tcrs_donor_1.csv',
    model='./TESSA/BriseisEncoder/TrainedEncoder.h5',
    embeding_vectors='./TESSA/BriseisEncoder/Atchley_factors.csv',
    output_TCR='./tmp/tessa_tcr_embedding.csv',
    output_VJ='./tmp/tessa_vj_embedding.csv',
    output_log='./tmp/tessa_log.log',
    exp='../data/tessa/10x/test_scRNA_donor_1.csv',
    output_tessa='tmp/res/',
    within_sample_networks='FALSE',
)

In [26]:
# Assemble the Tessa_main.py call: the script followed by one ` -key value`
# pair per entry of settings_full, in dictionary order.
full_flags = [f' -{key} {value}' for key, value in settings_full.items()]
command_full = 'python ./TESSA/Tessa_main.py' + ''.join(full_flags)
command_full

'python ./TESSA/Tessa_main.py -tcr ../data/tessa/10x/test_tcrs_donor_1.csv -model ./TESSA/BriseisEncoder/TrainedEncoder.h5 -embeding_vectors ./TESSA/BriseisEncoder/Atchley_factors.csv -output_TCR ./tmp/tessa_tcr_embedding.csv -output_VJ ./tmp/tessa_vj_embedding.csv -output_log ./tmp/tessa_log.log -exp ../data/tessa/10x/test_scRNA_donor_1.csv -output_tessa tmp/res/ -within_sample_networks FALSE'

In [27]:
import subprocess

# Run the complete TESSA pipeline through the shell as before, but stop with
# CalledProcessError on a non-zero exit status instead of ignoring it.
# On success the displayed value is still 0.
subprocess.run(command_full, shell=True, check=True).returncode

0

## Evaluate Embedding

### TCR Encoder

In [48]:
# Load the encoder output (first CSV column is the row index) and keep only
# the numeric values as an array.
tcr_embedding_table = pd.read_csv('./tmp/tessa_tcr_embedding.csv', index_col=0)
unweighted_dist = tcr_embedding_table.values
unweighted_dist

array([[ 0.18576838,  0.01204199, -0.20750672, ...,  0.60151404,
         0.03758714,  0.38953918],
       [-0.88813615, -0.68477386,  0.41438967, ..., -0.14791705,
         0.2573461 ,  0.07539734],
       [ 0.28786305,  0.39245352,  0.41624597, ...,  0.22307457,
        -0.8587716 ,  0.03321042],
       ...,
       [-0.79423136, -0.4949068 , -0.01769258, ..., -0.16110949,
         0.15286483, -0.07022017],
       [ 0.735304  , -0.47217894, -0.18631631, ...,  0.27535847,
         0.00402249, -0.22769676],
       [ 0.77998984, -0.7295521 , -0.4799435 , ...,  0.5050619 ,
         0.18753207,  0.12268431]])

### TESSA

In [34]:
import os
import numpy as np
import pandas as pd

# R_HOME is set before rpy2 is imported, in the same order as before.
# The R installation can now be chosen per machine through the TESSA_R_HOME
# environment variable; without it the previous hard-coded location is used.
os.environ['R_HOME'] = os.environ.get(
    'TESSA_R_HOME',
    'C:/Users/felix.drost/Anaconda3/envs/tessa/Lib/R',
)

import rpy2.robjects as rob

In [56]:
def get_weighted_tessa_embedding(path_tcr_embedding, path_results):
    """Scale a TCR embedding by the weights stored in a TESSA result file.

    Parameters
    ----------
    path_tcr_embedding : str
        CSV file holding the embedding; its first column is used as row index.
    path_results : str
        R data file that is expected to restore an object named
        ``tessa_results``. Its first element is used as the weight vector.

    Returns
    -------
    numpy.ndarray
        The embedding values multiplied element-wise (with NumPy
        broadcasting) by the weights.

    Raises
    ------
    KeyError
        If loading ``path_results`` does not restore ``tessa_results``.
    """
    # R's load() reports the names of the objects it restored. Checking them
    # prevents silently reading a stale `tessa_results` left in the R session
    # by an earlier load.
    loaded_names = rob.r['load'](path_results)
    if 'tessa_results' not in list(loaded_names):
        raise KeyError(f"'tessa_results' was not found in {path_results}")

    b = np.array(rob.r['tessa_results'][0])

    unweighted_dist = pd.read_csv(path_tcr_embedding, index_col=0).values

    weighted_dist = unweighted_dist * b
    return weighted_dist
# Weight the encoder embedding with the result file written by the TESSA run.
get_weighted_tessa_embedding(
    path_tcr_embedding='./tmp/tessa_tcr_embedding.csv',
    path_results='tmp/res/tessa_final.RData',
)

(100, 30)

In [None]:
# NOTE(review): none of the names used here (`get_imputation_scores`,
# `embedding_atlas`, `embedding_query`, `label_atlas`, `label_embedding`) is
# defined or imported in the visible part of this notebook, and the cell has no
# execution count -- it looks like an unfinished stub.
# TODO: import/define them, and confirm that `label_embedding` is the intended
# fourth argument.
get_imputation_scores(embedding_atlas, embedding_query, label_atlas, label_embedding, num_neighbors=5)