# mvTCR - MoE
We trained our model (MoE version) purely on RNA and the beta chain to enable a fair comparison to TESSA. As before, we split the data into training, validation, and test sets by separating clonotypes, but this time clonotypes were defined as cells sharing the same CDR3beta chain instead of identical CDR3beta and CDR3alpha chains.

In [1]:
import pandas as pd

In [2]:
import sys
sys.path.append('../mvTCR/')
import tcr_embedding.utils_training as utils
import config.constants_10x as const

from tcr_embedding.utils_preprocessing import stratified_group_shuffle_split, group_shuffle_split
from tcr_embedding.evaluation.Imputation import run_imputation_evaluation
from tcr_embedding.evaluation.Clustering import run_clustering_evaluation
from tcr_embedding.evaluation.kNN import run_knn_within_set_evaluation
from tcr_embedding.evaluation.WrapperFunctions import get_model_prediction_function

## Helper Functions

In [3]:
def load_model(adata, dataset, split):
    """Load the stored beta-only MoE model of one dataset and split.

    The checkpoint is looked up under
    ``saved_models/journal/beta_only/<dataset>/<dataset>_moe_split_<split>.pt``
    and handed to ``utils.load_model`` together with ``adata``.

    :param adata: data object forwarded unchanged to ``utils.load_model``
    :param dataset: dataset name used for both the folder and the file name
    :param split: split identifier that is part of the file name
    :return: whatever ``utils.load_model`` returns for that checkpoint
    """
    checkpoint = f'saved_models/journal/beta_only/{dataset}/{dataset}_moe_split_{split}.pt'
    return utils.load_model(adata, checkpoint)

In [4]:
def load_10x_data(donor, split):
    """Load the 10x data of one donor and label its train/test cells.

    Cells are restricted to the given donor and to bindings listed in
    ``const.HIGH_COUNT_ANTIGENS``. Groups for splitting are built from the
    ``IR_VDJ_1_junction_aa`` sequence with its first and last character
    removed, so cells with the same trimmed sequence share a group.

    :param donor: donor number; matched against ``donor_<donor>`` in ``obs['donor']``
    :param split: split identifier, also used as the random seed of both splits
    :return: the data restricted to cells whose ``obs['set']`` is 'train' or 'test'
             (cells assigned to 'val' are dropped)
    """
    adata = utils.load_data('10x')

    is_donor = adata.obs['donor'] == f'donor_{donor}'
    adata = adata[is_donor]
    has_selected_antigen = adata.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS)
    adata = adata[has_selected_antigen]

    # Grouping key: the junction sequence without its first and last character.
    adata.obs['group_col'] = [junction[1:-1] for junction in adata.obs['IR_VDJ_1_junction_aa']]

    # Both splits use the same grouping column and the same seed.
    split_kwargs = {'group_col': 'group_col', 'random_seed': split}
    train_val, test = group_shuffle_split(adata, val_split=0.20, **split_kwargs)
    _, val = group_shuffle_split(train_val, val_split=0.25, **split_kwargs)

    # Everything starts as 'train'; the held-out subsets overwrite that label.
    adata.obs['set'] = 'train'
    for subset, label in ((val, 'val'), (test, 'test')):
        adata.obs.loc[subset.obs.index, 'set'] = label

    return adata[adata.obs['set'].isin(['train', 'test'])]

In [5]:
def load_covid_data(split):
    """Load the Covid data and label its train/val cells.

    The data is split twice by ``TRB_1_cdr3`` groups: the second output of the
    first split is marked '-' and discarded, and the second output of the
    second split (applied to the first output) is marked 'val'.

    :param split: split identifier, also used as the random seed of both splits
    :return: the data restricted to cells whose ``obs['set']`` is 'train' or 'val'
    """
    adata = utils.load_data('covid')

    # Both splits use the same grouping column and the same seed.
    split_kwargs = {'group_col': 'TRB_1_cdr3', 'random_seed': split}
    sub, non_sub = group_shuffle_split(adata, val_split=0.2, **split_kwargs)
    _, val = group_shuffle_split(sub, val_split=0.20, **split_kwargs)

    # Everything starts as 'train'; discarded and validation cells overwrite it.
    adata.obs['set'] = 'train'
    for subset, label in ((non_sub, '-'), (val, 'val')):
        adata.obs.loc[subset.obs.index, 'set'] = label

    return adata[adata.obs['set'].isin(['train', 'val'])]

## 10x - Donor 1

In [6]:
# Evaluate the beta-only MoE model on 10x donor 1 for each of the five splits.
donor = 1
dataset = f'10x_{donor}'
metadata = ['binding_name', 'clonotype', 'donor']

model_names, splits, metrics, scores = [], [], [], []


def _record(split_id, metric_name, score):
    """Append one result row to the parallel result lists of this cell."""
    model_names.append('moe_beta')
    splits.append(split_id)
    metrics.append(metric_name)
    scores.append(score)


for split in range(5):
    data = load_10x_data(donor, split)

    print(f'split: {split}')
    model = load_model(data, dataset, split)
    test_embedding_func = get_model_prediction_function(model)

    # Prediction of 'binding_name': report the weighted-average F1 of the kNN entry.
    for source in ['test']:
        summary = run_imputation_evaluation(
            data, test_embedding_func, query_source=source, label_pred='binding_name',
        )
        result = summary['knn']['weighted avg']['f1-score']
        _record(split, f'Prediction {source}', result)

    # Clustering: keep the highest NMI over the tested resolutions.
    best_nmi = -99
    for resolution in [0.01, 0.1, 1.0]:
        cluster_params = {'resolution': resolution, 'num_neighbors': 5}
        cluster_result = run_clustering_evaluation(
            data, test_embedding_func, 'train', name_label='binding_name',
            cluster_params=cluster_params,
        )
        best_nmi = max(cluster_result['NMI'], best_nmi)
    _record(split, 'NMI', best_nmi)

# One row per (split, metric); stored as CSV and shown as the cell output.
results_10x_1 = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metrics': metrics,
    'scores': scores,
})
results_10x_1.to_csv('../results/performance_10x_donor_1_beta.csv')
results_10x_1

split: 0
split: 1
split: 2
split: 3
split: 4


Unnamed: 0,model,split,metrics,scores
0,moe_beta,0,Prediction test,0.707285
1,moe_beta,0,NMI,0.470503
2,moe_beta,1,Prediction test,0.788823
3,moe_beta,1,NMI,0.64545
4,moe_beta,2,Prediction test,0.609094
5,moe_beta,2,NMI,0.590328
6,moe_beta,3,Prediction test,0.766852
7,moe_beta,3,NMI,0.667587
8,moe_beta,4,Prediction test,0.310098
9,moe_beta,4,NMI,0.548281


## 10x - Donor 2

In [7]:
# Evaluate the beta-only MoE model on 10x donor 2 for each of the five splits.
donor = 2
dataset = f'10x_{donor}'
metadata = ['binding_name', 'clonotype', 'donor']

model_names, splits, metrics, scores = [], [], [], []


def _record(split_id, metric_name, score):
    """Append one result row to the parallel result lists of this cell."""
    model_names.append('moe_beta')
    splits.append(split_id)
    metrics.append(metric_name)
    scores.append(score)


for split in range(5):
    data = load_10x_data(donor, split)

    print(f'split: {split}')
    model = load_model(data, dataset, split)
    test_embedding_func = get_model_prediction_function(model)

    # Prediction of 'binding_name': report the weighted-average F1 of the kNN entry.
    for source in ['test']:
        summary = run_imputation_evaluation(
            data, test_embedding_func, query_source=source, label_pred='binding_name',
        )
        result = summary['knn']['weighted avg']['f1-score']
        _record(split, f'Prediction {source}', result)

    # Clustering: keep the highest NMI over the tested resolutions.
    best_nmi = -99
    for resolution in [0.01, 0.1, 1.0]:
        cluster_params = {'resolution': resolution, 'num_neighbors': 5}
        cluster_result = run_clustering_evaluation(
            data, test_embedding_func, 'train', name_label='binding_name',
            cluster_params=cluster_params,
        )
        best_nmi = max(cluster_result['NMI'], best_nmi)
    _record(split, 'NMI', best_nmi)

# One row per (split, metric); stored as CSV and shown as the cell output.
results_10x_2 = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metrics': metrics,
    'scores': scores,
})
results_10x_2.to_csv('../results/performance_10x_donor_2_beta.csv')
results_10x_2

split: 0
split: 1
split: 2
split: 3
split: 4


Unnamed: 0,model,split,metrics,scores
0,moe_beta,0,Prediction test,0.711281
1,moe_beta,0,NMI,0.320359
2,moe_beta,1,Prediction test,0.830582
3,moe_beta,1,NMI,0.381781
4,moe_beta,2,Prediction test,0.73039
5,moe_beta,2,NMI,0.341581
6,moe_beta,3,Prediction test,0.741946
7,moe_beta,3,NMI,0.353716
8,moe_beta,4,Prediction test,0.776759
9,moe_beta,4,NMI,0.435814


## Covid

In [8]:
# Evaluate the beta-only MoE model on the Covid data for each of the five splits.
metadata = ['T_cells', 'clonotype', 'responsive']

model_names, splits, metrics, scores = [], [], [], []


def _record(split_id, metric_name, score):
    """Append one result row to the parallel result lists of this cell."""
    model_names.append('moe_beta')
    splits.append(split_id)
    metrics.append(metric_name)
    scores.append(score)


for split in range(5):
    data = load_covid_data(split)

    print(f'split: {split}')
    model = load_model(data, 'Covid', split)
    test_embedding_func = get_model_prediction_function(model)

    # Two clustering scores per split: one per label column, each keeping the
    # highest NMI over the tested resolutions.
    for name_label, metric_name in [('T_cells', 'NMI_cell_type'), ('responsive', 'NMI_reactivity')]:
        best_nmi = -99
        for resolution in [0.01, 0.1, 1.0]:
            cluster_params = {'resolution': resolution, 'num_neighbors': 5}
            cluster_result = run_clustering_evaluation(
                data, test_embedding_func, 'train', name_label=name_label,
                cluster_params=cluster_params,
            )
            best_nmi = max(cluster_result['NMI'], best_nmi)
        _record(split, metric_name, best_nmi)

# One row per (split, metric); stored as CSV.
results_covid = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metrics': metrics,
    'scores': scores,
})
results_covid.to_csv('../results/performance_covid_beta.csv')

split: 0
split: 1
split: 2
split: 3
split: 4


## Write to Supplementary Material

In [9]:
path_out = '../results/supplement/S1_benchmarking.xlsx'

# Sheets are added in append mode, so the workbook at path_out is expected to
# exist already.
# NOTE(review): re-running this cell when the sheets are already present may
# fail or duplicate sheets depending on the pandas version; confirm.
sheets = (
    ('10x_beta_donor1', results_10x_1),
    ('10x_beta_donor2', results_10x_2),
    ('Covid_beta', results_covid),
)
with pd.ExcelWriter(path_out, mode='a') as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name)