In [1]:
import pandas as pd

In [2]:
import sys
sys.path.append('../mvTCR/')
import tcr_embedding.utils_training as utils
import config.constants_10x as const

from tcr_embedding.utils_preprocessing import stratified_group_shuffle_split, group_shuffle_split
from tcr_embedding.evaluation.Imputation import run_imputation_evaluation
from tcr_embedding.evaluation.Clustering import run_clustering_evaluation
from tcr_embedding.evaluation.kNN import run_knn_within_set_evaluation
from tcr_embedding.evaluation.WrapperFunctions import get_model_prediction_function

In [3]:
def load_model(adata, dataset, split, model, donor=''):
    """Build the checkpoint path for a trained model and load it.

    Path layout:
      * with a donor:    saved_models/journal/10x/splits/donor_<donor>/<model>/<model>_donor_<donor>_<suffix>.pt
      * without a donor: saved_models/journal/Fischer/<model>/<model>_<dataset>_<suffix>.pt
    where <suffix> is 'full' when ``split == 'full'`` and 'split_<split>' otherwise.

    Args:
        adata: data object handed unchanged to ``utils.load_model``.
        dataset: dataset name; only used in the path when no donor is given.
        split: 'full' or a split identifier inserted into the file name.
        model: model name (used as directory and file-name prefix).
        donor: donor identifier; the empty string (default) means "no donor".

    Returns:
        Whatever ``utils.load_model(adata, path)`` returns.
    """
    # Compare against '' explicitly (not truthiness) so that donor 0 still counts as a donor.
    if donor != '':
        prefix = f'saved_models/journal/10x/splits/donor_{donor}/{model}/{model}_donor_{donor}_'
    else:
        prefix = f'saved_models/journal/Fischer/{model}/{model}_{dataset}_'

    suffix = f'split_{split}' if split != 'full' else 'full'
    return utils.load_model(adata, f'{prefix}{suffix}.pt')
    

In [4]:
def load_10x_data(donor, split):
    """Load the '10x' dataset of one donor and label every cell with its data split.

    Cells are kept when ``obs['donor']`` equals ``'donor_<donor>'`` and
    ``obs['binding_name']`` is contained in ``const.HIGH_COUNT_ANTIGENS``.
    Unless ``split == 'full'``, ``split`` is used as the random seed of two
    ``group_shuffle_split`` calls grouped by ``'clonotype'`` and the outcome is
    written to ``obs['set']`` as 'train', 'val' or 'test'.

    Args:
        donor: donor identifier, inserted into ``'donor_<donor>'``.
        split: 'full' to skip splitting, otherwise the random seed of the split.

    Returns:
        The filtered data object (presumably an AnnData - TODO confirm against
        ``utils.load_data``); it carries an ``obs['set']`` column only when a
        split was requested.
    """
    adata = utils.load_data('10x')
    adata = adata[adata.obs['donor'] == f'donor_{donor}']
    adata = adata[adata.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS)]
    if split != 'full':
        random_seed = split

        # First hold out the test part, then carve the validation part out of the rest.
        # NOTE(review): 0.25 of the remaining 80% presumably gives a 60/20/20 split overall -
        # confirm against group_shuffle_split.
        train_val, test = group_shuffle_split(adata, group_col='clonotype', val_split=0.20, random_seed=random_seed)
        _, val = group_shuffle_split(train_val, group_col='clonotype', val_split=0.25, random_seed=random_seed)

        # `adata` is the result of boolean subsetting, i.e. a view (assuming AnnData).
        # Materialise it explicitly before writing to .obs instead of relying on the
        # implicit copy-on-write that is triggered when a view is modified.
        adata = adata.copy()
        adata.obs['set'] = 'train'
        adata.obs.loc[val.obs.index, 'set'] = 'val'
        adata.obs.loc[test.obs.index, 'set'] = 'test'
        # Every cell is labelled at this point; the filter is kept so the returned object
        # has the same form as before.
        adata = adata[adata.obs['set'].isin(['train', 'val', 'test'])]

    return adata

In [5]:
def load_covid_data(split):
    """Load the 'covid' dataset and keep only its train/validation cells for one split.

    ``split`` is used as the random seed of two ``group_shuffle_split`` calls
    grouped by ``'clonotype'``: the first separates the cells that are kept from
    those that are discarded, the second carves the validation cells out of the
    kept part. The labels are written to ``obs['set']``.

    Args:
        split: random seed of the split.

    Returns:
        The data object restricted to cells whose ``obs['set']`` is 'train' or 'val'.
    """
    adata = utils.load_data('covid')

    # Both splitting rounds share the same grouping column, fraction and seed.
    split_kwargs = {'group_col': 'clonotype', 'val_split': 0.2, 'random_seed': split}
    kept, dropped = group_shuffle_split(adata, **split_kwargs)
    _, val_part = group_shuffle_split(kept, **split_kwargs)

    # Start with everything as 'train', then overwrite the discarded and the validation cells.
    adata.obs['set'] = 'train'
    for label, part in (('-', dropped), ('val', val_part)):
        adata.obs.loc[part.obs.index, 'set'] = label

    # Cells marked '-' are removed here.
    return adata[adata.obs['set'].isin(['train', 'val'])]

In [6]:
# Load donor 1 with split seed 0 once up front.
# Note: `data` is reassigned inside the benchmark loops of the following cells.
data = load_10x_data(1, 0)

In [7]:
# Benchmark all model variants on 10x donor 1 over five splits:
#   * 'Prediction test': weighted-average F1 of the kNN imputation of 'binding_name'
#   * 'NMI': best NMI over three clustering resolutions on the 'train' set
dataset = '10x'
donor = 1
metadata = ['binding_name', 'clonotype', 'donor']

# One dict per (model, split, metric) measurement; converted to a DataFrame at the end.
records = []

for split in range(5):
    data = load_10x_data(donor, split)
    for model_name in ('concat', 'moe', 'poe', 'tcr', 'rna'):
        print(f'split: {split},  model: {model_name}')
        model = load_model(data, dataset, split, model_name, donor)
        test_embedding_func = get_model_prediction_function(model)

        # For this donor only the test set is used as query.
        for source in ('test',):
            summary = run_imputation_evaluation(data, test_embedding_func,
                                                query_source=source, label_pred='binding_name')
            records.append({
                'model': model_name,
                'split': split,
                'metrics': f'Prediction {source}',
                'scores': summary['knn']['weighted avg']['f1-score'],
            })

        # Keep the best NMI across the tested clustering resolutions.
        best_nmi = -99
        for resolution in (0.01, 0.1, 1.0):
            cluster_params = {'resolution': resolution, 'num_neighbors': 5}
            cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train',
                                                       name_label='binding_name',
                                                       cluster_params=cluster_params)
            best_nmi = max(cluster_result['NMI'], best_nmi)
        records.append({
            'model': model_name,
            'split': split,
            'metrics': 'NMI',
            'scores': best_nmi,
        })

results_10x_1 = pd.DataFrame(records)
results_10x_1.to_csv('../results/performance_10x_donor_1.csv')
results_10x_1

split: 0,  model: concat
split: 0,  model: moe
split: 0,  model: poe
split: 0,  model: tcr
split: 0,  model: rna
split: 1,  model: concat
split: 1,  model: moe
split: 1,  model: poe
split: 1,  model: tcr
split: 1,  model: rna
split: 2,  model: concat
split: 2,  model: moe
split: 2,  model: poe
split: 2,  model: tcr
split: 2,  model: rna
split: 3,  model: concat
split: 3,  model: moe
split: 3,  model: poe
split: 3,  model: tcr
split: 3,  model: rna
split: 4,  model: concat
split: 4,  model: moe
split: 4,  model: poe
split: 4,  model: tcr
split: 4,  model: rna


Unnamed: 0,model,split,metrics,scores
0,concat,0,Prediction test,0.838451
1,concat,0,NMI,0.482612
2,moe,0,Prediction test,0.844636
3,moe,0,NMI,0.503368
4,poe,0,Prediction test,0.822965
5,poe,0,NMI,0.486662
6,tcr,0,Prediction test,0.858096
7,tcr,0,NMI,0.481615
8,rna,0,Prediction test,0.443517
9,rna,0,NMI,0.4757


In [8]:
# Benchmark all model variants on 10x donor 2 over five splits:
#   * 'Prediction test' / 'Prediction val': weighted-average F1 of the kNN imputation
#     of 'binding_name', queried on the test and on the validation set
#   * 'NMI': best NMI over three clustering resolutions on the 'train' set
dataset = '10x'
donor = 2
metadata = ['binding_name', 'clonotype', 'donor']

# One dict per (model, split, metric) measurement; converted to a DataFrame at the end.
records = []

for split in range(5):
    data = load_10x_data(donor, split)
    for model_name in ('concat', 'moe', 'poe', 'tcr', 'rna'):
        print(f'split: {split},  model: {model_name}')
        model = load_model(data, dataset, split, model_name, donor)
        test_embedding_func = get_model_prediction_function(model)

        for source in ('test', 'val'):
            summary = run_imputation_evaluation(data, test_embedding_func,
                                                query_source=source, label_pred='binding_name')
            records.append({
                'model': model_name,
                'split': split,
                'metrics': f'Prediction {source}',
                'scores': summary['knn']['weighted avg']['f1-score'],
            })

        # Keep the best NMI across the tested clustering resolutions.
        best_nmi = -99
        for resolution in (0.01, 0.1, 1.0):
            cluster_params = {'resolution': resolution, 'num_neighbors': 5}
            cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train',
                                                       name_label='binding_name',
                                                       cluster_params=cluster_params)
            best_nmi = max(cluster_result['NMI'], best_nmi)
        records.append({
            'model': model_name,
            'split': split,
            'metrics': 'NMI',
            'scores': best_nmi,
        })

results_10x_2 = pd.DataFrame(records)
results_10x_2.to_csv('../results/performance_10x_donor_2.csv')
results_10x_2

split: 0,  model: concat
split: 0,  model: moe
split: 0,  model: poe
split: 0,  model: tcr
split: 0,  model: rna
split: 1,  model: concat
split: 1,  model: moe
split: 1,  model: poe
split: 1,  model: tcr
split: 1,  model: rna
split: 2,  model: concat
split: 2,  model: moe
split: 2,  model: poe
split: 2,  model: tcr
split: 2,  model: rna
split: 3,  model: concat
split: 3,  model: moe
split: 3,  model: poe
split: 3,  model: tcr
split: 3,  model: rna
split: 4,  model: concat
split: 4,  model: moe
split: 4,  model: poe
split: 4,  model: tcr
split: 4,  model: rna


Unnamed: 0,model,split,metrics,scores
0,concat,0,Prediction test,0.876378
1,concat,0,Prediction val,0.909374
2,concat,0,NMI,0.340115
3,moe,0,Prediction test,0.880487
4,moe,0,Prediction val,0.858350
...,...,...,...,...
70,tcr,4,Prediction val,0.895432
71,tcr,4,NMI,0.320011
72,rna,4,Prediction test,0.835030
73,rna,4,Prediction val,0.609624


In [9]:
# Benchmark all model variants on the covid dataset over five splits. For each model the
# best NMI over three clustering resolutions on the 'train' set is recorded twice:
# once against the 'T_cells' label and once against the 'responsive' label.
metadata = ['T_cells', 'clonotype', 'responsive']

# (label column used for clustering evaluation, metric name written to the results table)
nmi_targets = (
    ('T_cells', 'NMI_cell_type'),
    ('responsive', 'NMI_reactivity'),
)

# One dict per (model, split, metric) measurement; converted to a DataFrame at the end.
records = []

for split in range(5):
    data = load_covid_data(split)
    for model_name in ('concat', 'moe', 'poe', 'tcr', 'rna'):
        print(f'split: {split},  model: {model_name}')
        model = load_model(data, 'covid', split, model_name)
        test_embedding_func = get_model_prediction_function(model)

        for name_label, metric_name in nmi_targets:
            # Keep the best NMI across the tested clustering resolutions.
            best_nmi = -99
            for resolution in (0.01, 0.1, 1.0):
                cluster_params = {'resolution': resolution, 'num_neighbors': 5}
                cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train',
                                                           name_label=name_label,
                                                           cluster_params=cluster_params)
                best_nmi = max(cluster_result['NMI'], best_nmi)
            records.append({
                'model': model_name,
                'split': split,
                'metrics': metric_name,
                'scores': best_nmi,
            })

results_covid = pd.DataFrame(records)
results_covid.to_csv('../results/performance_covid.csv')
results_covid

split: 0,  model: concat
split: 0,  model: moe
split: 0,  model: poe
split: 0,  model: tcr
split: 0,  model: rna
split: 1,  model: concat
split: 1,  model: moe
split: 1,  model: poe
split: 1,  model: tcr
split: 1,  model: rna
split: 2,  model: concat
split: 2,  model: moe
split: 2,  model: poe
split: 2,  model: tcr
split: 2,  model: rna
split: 3,  model: concat
split: 3,  model: moe
split: 3,  model: poe
split: 3,  model: tcr
split: 3,  model: rna
split: 4,  model: concat
split: 4,  model: moe
split: 4,  model: poe
split: 4,  model: tcr
split: 4,  model: rna


Unnamed: 0,model,split,metrics,scores
0,concat,0,NMI_cell_type,0.408943
1,concat,0,NMI_reactivity,0.156583
2,moe,0,NMI_cell_type,0.437105
3,moe,0,NMI_reactivity,0.198819
4,poe,0,NMI_cell_type,0.39922
5,poe,0,NMI_reactivity,0.218664
6,tcr,0,NMI_cell_type,0.428652
7,tcr,0,NMI_reactivity,0.263353
8,rna,0,NMI_cell_type,0.511926
9,rna,0,NMI_reactivity,0.140069


## Write Supplementary Material S1

In [10]:
# Export the three benchmark tables as separate sheets of one Excel workbook.
path_out = '../results/supplement/S1_benchmarking.xlsx'

# Write every sheet through a single writer opened in (default) write mode.
# The previous approach wrote the first sheet and then reopened the file with mode='a';
# append mode is not supported by every Excel engine (e.g. xlsxwriter), so it could fail
# depending on which engine pandas selects. The resulting workbook is the same:
# any existing file is overwritten and contains exactly these three sheets.
with pd.ExcelWriter(path_out) as writer:
    results_10x_1.to_excel(writer, sheet_name='10x_donor1')
    results_10x_2.to_excel(writer, sheet_name='10x_donor2')
    results_covid.to_excel(writer, sheet_name='Covid')