In [1]:
import pandas as pd
from tqdm import tqdm
#import scanpy as sc

In [2]:
import sys
sys.path.append('../mvTCR/')
import tcr_embedding.utils_training as utils
import config.constants_10x as const

from tcr_embedding.utils_preprocessing import stratified_group_shuffle_split, group_shuffle_split
from tcr_embedding.evaluation.Imputation import run_imputation_evaluation
from tcr_embedding.evaluation.Clustering import run_clustering_evaluation
from tcr_embedding.evaluation.kNN import run_knn_within_set_evaluation
from tcr_embedding.evaluation.WrapperFunctions import get_model_prediction_function

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_model(adata, dataset, split, model, donor=''):
    """Load a saved 10x split model through ``utils.load_model``.

    Parameters
    ----------
    adata
        Data object handed unchanged to ``utils.load_model``.
    dataset
        Not used by this function; kept so existing calls stay valid.
    split
        Split identifier inserted into the checkpoint file name.
    model
        Model type; used both as sub-directory and as file-name suffix.
    donor
        Donor identifier inserted into the checkpoint file name.

    Returns
    -------
    Whatever ``utils.load_model`` returns for the resolved ``.pt`` path.
    """
    checkpoint = (
        f'saved_models/journal_2/10x/splits/{model}/'
        f'10x_donor_{donor}_split_{split}_{model}'
        '.pt'
    )
    return utils.load_model(adata, checkpoint)
    

In [4]:
from sklearn.preprocessing import OneHotEncoder
def load_10x_data(donor, split, size=False):
    """Load the 10x data, filter it, and annotate a train/val/test split.

    Parameters
    ----------
    donor
        Donor number. Cells are kept where ``obs['donor'] == f'donor_{donor}'``.
        If ``str(donor) == 'None'`` no donor filter is applied and a dense
        one-hot encoding of ``obs['donor']`` is stored in ``obsm['donor']``.
    split
        ``'full'`` skips the splitting step entirely. Any other value is used
        as ``random_seed`` for both clonotype-grouped splits.
    size
        If truthy, the training subset is subsampled to ``size`` observations
        with ``scanpy.pp.subsample`` before the labels are written.

    Returns
    -------
    The filtered data object. Unless ``split == 'full'``, ``obs['set']`` holds
    ``'train'``, ``'val'`` or ``'test'`` and cells without a label are dropped.
    """
    adata = utils.load_data('10x')
    if str(donor) != 'None':
        adata = adata[adata.obs['donor'] == f'donor_{donor}']
    else:
        donor_column = adata.obs['donor'].to_numpy().reshape(-1, 1)
        # Densify the default sparse output instead of passing `sparse=False`:
        # that keyword was renamed to `sparse_output` and later removed, so
        # this form works on old and new scikit-learn releases alike.
        adata.obsm['donor'] = OneHotEncoder().fit_transform(donor_column).toarray()
    adata = adata[adata.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS)]
    if split != 'full':
        random_seed = split

        # Two grouped splits on 'clonotype': first hold out the test set, then
        # split the remainder into train and validation.
        train_val, test = group_shuffle_split(adata, group_col='clonotype', val_split=0.20, random_seed=random_seed)
        train, val = group_shuffle_split(train_val, group_col='clonotype', val_split=0.25, random_seed=random_seed)

        if size:
            # Imported here because the notebook-level scanpy import is
            # commented out; without it this branch raised NameError on `sc`.
            import scanpy as sc
            sc.pp.subsample(train, n_obs=size)

        adata.obs['set'] = None
        adata.obs.loc[train.obs.index, 'set'] = 'train'
        adata.obs.loc[val.obs.index, 'set'] = 'val'
        adata.obs.loc[test.obs.index, 'set'] = 'test'
        # Drops every cell that received no label (e.g. training cells removed
        # by the subsampling above).
        adata = adata[adata.obs['set'].isin(['train', 'val', 'test'])]

    return adata

## 10x Specificity

In [5]:
dataset = '10x'
donor = 1
metadata = ['binding_name', 'clonotype', 'donor']

# Parallel columns of the long-format result table: every evaluated metric
# appends exactly one entry to each list.
model_names, splits, metrics, scores, donors = [], [], [], [], []

donor_ids = [*range(1, 5), 'None']
model_types = ['concat', 'moe', 'poe', 'tcr', 'rna']
query_sources = ['test']  #, 'val']
resolutions = [0.01, 0.1, 1.0]

for donor in donor_ids:
    for split in tqdm(range(5)):
        # One data object per (donor, split), shared by all model types.
        data = load_10x_data(donor, split)
        for model_name in model_types:
            model = load_model(data, dataset, split, model_name, donor)
            test_embedding_func = get_model_prediction_function(model)

            # Imputation evaluation: keep the weighted-average F1 entry of the
            # 'knn' report.
            for source in query_sources:
                summary = run_imputation_evaluation(
                    data,
                    test_embedding_func,
                    query_source=source,
                    label_pred='binding_name',
                )
                result = summary['knn']['weighted avg']['f1-score']

                donors.append(donor)
                splits.append(split)
                model_names.append(model_name)
                metrics.append(f'Prediction {source}')
                scores.append(result)

            # Clustering evaluation: keep the highest NMI over the resolutions.
            best_nmi = -99
            for resolution in resolutions:
                cluster_params = {'resolution': resolution, 'num_neighbors': 5}
                cluster_result = run_clustering_evaluation(
                    data,
                    test_embedding_func,
                    'train',
                    name_label='binding_name',
                    cluster_params=cluster_params,
                )
                best_nmi = max(cluster_result['NMI'], best_nmi)

            donors.append(donor)
            splits.append(split)
            model_names.append(model_name)
            metrics.append('NMI')
            scores.append(best_nmi)

results_10x = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': donors,
    'dataset': [dataset] * len(splits),
})
results_10x.to_csv('../results/performance_10x.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:15<00:00, 63.16s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [15:29<00:00, 185.84s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:42<00:00, 104.41s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:27<00:00, 29.58s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [42:21<00:00, 508.36s/it]


## Minervina Tests

In [6]:
def load_minervina_model(adata, dataset, split, model):
    """Load a saved Minervina split model through ``utils.load_model``.

    Parameters
    ----------
    adata
        Data object handed unchanged to ``utils.load_model``.
    dataset
        Not used by this function; kept so existing calls stay valid.
    split
        Split identifier inserted into the checkpoint file name.
    model
        Model type; used both as sub-directory and as file-name suffix.

    Returns
    -------
    Whatever ``utils.load_model`` returns for the resolved ``.pt`` path.
    """
    checkpoint = ''.join([
        f'saved_models/journal_2/minervina/splits/{model}/',
        f'minervina_split_{split}_{model}',
        '.pt',
    ])
    return utils.load_model(adata, checkpoint)

In [7]:
from sklearn.preprocessing import OneHotEncoder
def load_minervina_data(split, size=False):
    """Load the annotated Minervina data and label a train/val/test split.

    Parameters
    ----------
    split
        Used as ``random_seed`` for both clonotype-grouped splits.
    size
        If truthy, the training subset is subsampled to ``size`` observations
        with ``scanpy.pp.subsample`` before the labels are written.

    Returns
    -------
    The loaded data object with ``obs['set']`` set to ``'train'``, ``'val'`` or
    ``'test'``. Cells that end up in none of the subsets keep ``None`` and are
    not removed here.
    """
    adata = utils.load_data('minervina/01_annotated_data.h5ad')
    # subsample to get statistics
    random_seed = split
    # Two grouped splits on 'clonotype': first hold out the test set, then
    # split the remainder into train and validation.
    train_val, test = group_shuffle_split(adata, group_col='clonotype', val_split=0.20, random_seed=random_seed)
    train, val = group_shuffle_split(train_val, group_col='clonotype', val_split=0.25, random_seed=random_seed)

    if size:
        # Imported here because the notebook-level scanpy import is commented
        # out; without it this branch raised NameError on `sc`.
        import scanpy as sc
        sc.pp.subsample(train, n_obs=size)

    adata.obs['set'] = None
    adata.obs.loc[train.obs.index, 'set'] = 'train'
    adata.obs.loc[val.obs.index, 'set'] = 'val'
    adata.obs.loc[test.obs.index, 'set'] = 'test'
    return adata

In [8]:
dataset = 'minervina'
metadata = ['epitope']

# Parallel columns of the long-format result table: every evaluated metric
# appends exactly one entry to each list.
model_names = []
splits = []
metrics = []
scores = []

for split in tqdm(range(0, 5)):
    # One data object per split, shared by all model types.
    data = load_minervina_data(split)
    for model_name in ['concat', 'moe', 'poe', 'tcr', 'rna']:
        model = load_minervina_model(data, dataset, split, model_name)
        test_embedding_func = get_model_prediction_function(model)

        # Imputation evaluation: keep the weighted-average F1 entry of the
        # 'knn' report.
        for source in ['test']:  #, 'val']:
            summary = run_imputation_evaluation(data, test_embedding_func, query_source=source,
                                        label_pred='epitope')
            result = summary['knn']['weighted avg']['f1-score']

            # No donor bookkeeping here: this dataset has no donor loop, and
            # the 'donor' column below is filled with a constant placeholder.
            model_names.append(model_name)
            splits.append(split)
            metrics.append(f'Prediction {source}')
            scores.append(result)

        # Clustering evaluation: keep the highest NMI over the resolutions.
        best_nmi = -99
        for resolution in [0.01, 0.1, 1.0]:
            cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train', name_label='epitope',
                                               cluster_params={'resolution': resolution, 'num_neighbors': 5})
            best_nmi = max(cluster_result['NMI'], best_nmi)
        model_names.append(model_name)
        splits.append(split)
        metrics.append('NMI')
        scores.append(best_nmi)

results_min = {
    'model': model_names,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': ['-'] * len(splits),
    'dataset': [dataset] * len(splits)
}
results_min = pd.DataFrame(results_min)
results_min.to_csv('../results/performance_minervina.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:22<00:00, 40.59s/it]


## Contribution Test

In [9]:
# Parallel columns of the contribution table (one row per donor/dataset and split).
splits, donors, scores = [], [], []

# 10x: the 'moe' model of every donor (plus the unfiltered 'None' setting).
for donor in [*range(1, 5), 'None']:
    for split in tqdm(range(5)):
        data = load_10x_data(donor, split)
        model = load_model(data, '10x', split, 'moe', donor)

        # The column read on the next line is presumably written by this call.
        model.get_modality_contribution(data)
        result = data.obs['contribution_tcr-rna'].values.mean()

        donors.append(donor)
        splits.append(split)
        scores.append(result)

# Minervina: same procedure; rows are tagged with 'minervina' in the donor column.
for split in tqdm(range(5)):
    data = load_minervina_data(split)
    model = load_minervina_model(data, 'minervina', split, 'moe')

    model.get_modality_contribution(data)
    result = data.obs['contribution_tcr-rna'].values.mean()

    donors.append('minervina')
    splits.append(split)
    scores.append(result)

results_contributions = pd.DataFrame({
    'split': splits,
    'score': scores,
    'donor': donors,
})
results_contributions.to_csv('../results/contribution_10x_minervina.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:47<00:00,  9.56s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:07<00:00, 13.56s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:55<00:00, 11.06s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:38<00:00,  7.64s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:15<00:00, 27.02s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.01s/it]


## Dataset size tests

In [10]:
def load_model_size(adata, dataset, split, size, donor=''):
    """Load a saved 10x data-size 'moe' model through ``utils.load_model``.

    Parameters
    ----------
    adata
        Data object handed unchanged to ``utils.load_model``.
    dataset
        Not used by this function; kept so existing calls stay valid.
    split
        Split identifier inserted into the checkpoint file name.
    size
        Size tag appended to the checkpoint file name.
    donor
        Donor identifier inserted into the checkpoint file name.

    Returns
    -------
    Whatever ``utils.load_model`` returns for the resolved ``.pt`` path.
    """
    checkpoint_dir = 'saved_models/journal_2/10x/data_size/'
    checkpoint_file = f'10x_donor_{donor}_split_{split}_moe_{size}.pt'
    return utils.load_model(adata, checkpoint_dir + checkpoint_file)

In [11]:
# Sizes evaluated per 10x donor. Keys are the donor values used by the
# evaluation loops ('None' is the setting without a donor filter). Each size
# selects a saved checkpoint named `..._moe_{size}.pt`; presumably it is the
# number of training cells that model was fitted on (TODO confirm).
donor_2_size = {
    'None': [100, 500, 1000, 2500, 5000, 10000, 15000],
    1: [100, 500, 1000, 2500, 5000],
    2: [100, 500, 1000, 2500, 5000, 10000, 15000],
    3: [100, 500, 1000, 2500, 5000, 10000],
    4: [100, 500, 1000, 2500]
}

In [12]:
dataset = '10x'
metadata = ['binding_name', 'clonotype', 'donor']

# Parallel columns of the long-format result table: every evaluated metric
# appends exactly one entry to each list.
sizes = []
splits = []
metrics = []
scores = []
donors = []
for donor in list(range(1, 5)) + ['None']:
    for split in tqdm(range(0, 5)):
        # The data depends only on (donor, split), not on the model size, so it
        # is loaded once per split and shared by all size-specific models (the
        # specificity evaluation shares one data object across models the same way).
        data = load_10x_data(donor, split)
        for size in donor_2_size[donor]:
            model = load_model_size(data, dataset, split, size, donor)
            test_embedding_func = get_model_prediction_function(model)

            # Imputation evaluation: keep the weighted-average F1 entry of the
            # 'knn' report.
            for source in ['test']:  #, 'val']:
                summary = run_imputation_evaluation(data, test_embedding_func, query_source=source,
                                            label_pred='binding_name')
                result = summary['knn']['weighted avg']['f1-score']

                sizes.append(size)
                splits.append(split)
                metrics.append(f'Prediction {source}')
                scores.append(result)
                donors.append(donor)

            # Clustering evaluation: keep the highest NMI over the resolutions.
            best_nmi = -99
            for resolution in [0.01, 0.1, 1.0]:
                cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train', name_label='binding_name',
                                                   cluster_params={'resolution': resolution, 'num_neighbors': 5})
                best_nmi = max(cluster_result['NMI'], best_nmi)
            sizes.append(size)
            splits.append(split)
            metrics.append('NMI')
            scores.append(best_nmi)
            donors.append(donor)

results_10x_size = {
    'size': sizes,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': donors
}
results_10x_size = pd.DataFrame(results_10x_size)
results_10x_size.to_csv('../results/performance_10x_datasize.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:58<00:00, 95.78s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [29:42<00:00, 356.56s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [15:22<00:00, 184.59s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:52<00:00, 46.45s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 5/5 [1:09:14<00:00, 830.91s/it]


### Minervina Datasize

In [13]:
def load_minervina_model_size(adata, dataset, split, size, donor=''):
    """Load a saved Minervina data-size 'moe' model through ``utils.load_model``.

    Parameters
    ----------
    adata
        Data object handed unchanged to ``utils.load_model``.
    dataset
        Not used by this function; kept so existing calls stay valid.
    split
        Split identifier inserted into the checkpoint file name.
    size
        Size tag appended to the checkpoint file name.
    donor
        Not used by this function; kept so existing calls stay valid.

    Returns
    -------
    Whatever ``utils.load_model`` returns for the resolved ``.pt`` path.
    """
    checkpoint_dir = 'saved_models/journal_2/minervina/data_size/'
    checkpoint_file = f'minervina_split_{split}_moe_{size}.pt'
    return utils.load_model(adata, checkpoint_dir + checkpoint_file)

In [14]:
# Sizes for which a Minervina data-size checkpoint (`..._moe_{size}.pt`) is
# loaded and evaluated; presumably the training-set sizes of those models
# (TODO confirm).
minervina_sizes = [100, 500, 1000, 2500, 5000]

In [18]:
dataset = 'minervina'
metadata = ['epitope']

# Parallel columns of the long-format result table: every evaluated metric
# appends exactly one entry to each list.
sizes = []
splits = []
metrics = []
scores = []

for split in tqdm(range(0, 5)):
    # The data depends only on the split, not on the model size, so it is
    # loaded once per split and shared by all size-specific models (the
    # specificity evaluation shares one data object across models the same way).
    data = load_minervina_data(split)
    for size in minervina_sizes:
        model = load_minervina_model_size(data, dataset, split, size)
        test_embedding_func = get_model_prediction_function(model)

        # Imputation evaluation: keep the weighted-average F1 entry of the
        # 'knn' report.
        for source in ['test']:  #, 'val']:
            summary = run_imputation_evaluation(data, test_embedding_func, query_source=source,
                                        label_pred='epitope')
            result = summary['knn']['weighted avg']['f1-score']

            sizes.append(size)
            splits.append(split)
            metrics.append(f'Prediction {source}')
            scores.append(result)

        # Clustering evaluation: keep the highest NMI over the resolutions.
        best_nmi = -99
        for resolution in [0.01, 0.1, 1.0]:
            cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train', name_label='epitope',
                                               cluster_params={'resolution': resolution, 'num_neighbors': 5})
            best_nmi = max(cluster_result['NMI'], best_nmi)
        sizes.append(size)
        splits.append(split)
        metrics.append('NMI')
        scores.append(best_nmi)

results_min_size = {
    'size': sizes,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': [dataset] * len(splits)
}
results_min_size = pd.DataFrame(results_min_size)
results_min_size.to_csv('../results/performance_minervina_datasize.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:40<00:00, 56.08s/it]


## Write Supplementary Material S1

In [20]:
# Write the workbook from scratch with the 10x specificity results as its
# first sheet.
path_out = '../results/supplement/S1_benchmarking.xlsx'
results_10x.to_excel(excel_writer=path_out, sheet_name='Specificity_10x')

In [21]:
# Remaining result tables keyed by sheet name; insertion order is the order in
# which the sheets are appended to the workbook at `path_out`.
extra_sheets = {
    'Specificity_Minervina': results_min,
    'Datasize_10x': results_10x_size,
    'Datasize_Minervina': results_min_size,
    'TCR-Contribution_10x_Minervina': results_contributions,
}
with pd.ExcelWriter(path_out, mode='a') as writer:
    for sheet_title, frame in extra_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_title)