In [1]:
import pandas as pd
from tqdm import tqdm
import scanpy as sc

In [2]:
import sys
sys.path.append('../mvTCR/')
import tcr_embedding.utils_training as utils
import config.constants_10x as const

from tcr_embedding.utils_preprocessing import stratified_group_shuffle_split, group_shuffle_split
from tcr_embedding.evaluation.Imputation import run_imputation_evaluation
from tcr_embedding.evaluation.Clustering import run_clustering_evaluation
from tcr_embedding.evaluation.kNN import run_knn_within_set_evaluation
from tcr_embedding.evaluation.WrapperFunctions import get_model_prediction_function

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_model(adata, dataset, split, donor=''):
    """Load the saved beta-chain MoE model for one 10x donor/split.

    `dataset` is accepted but not used: the checkpoint path is always built
    for the 10x models.
    """
    checkpoint = f'saved_models/journal_2/10x/beta/10x_donor_{donor}_split_{split}_moe_beta.pt'
    return utils.load_model(adata, checkpoint)

In [4]:
from sklearn.preprocessing import OneHotEncoder
def load_10x_data(donor, split):
    """Load the 10x data and label every cell as train, val or test.

    Parameters
    ----------
    donor
        Donor selector. If ``str(donor)`` is ``'None'`` all donors are kept
        and a dense one-hot encoding of ``obs['donor']`` is stored in
        ``obsm['donor']``. Otherwise only cells whose ``obs['donor']`` equals
        ``f'donor_{donor}'`` are kept.
    split
        Split index; it is also used as the random seed of both group splits.

    Returns
    -------
    The data restricted to cells whose ``obs['binding_name']`` is listed in
    ``const.HIGH_COUNT_ANTIGENS``, with the columns ``obs['group_col']`` and
    ``obs['set']`` (``'train'`` / ``'val'`` / ``'test'``) added.
    """
    adata = utils.load_data('10x')
    if str(donor) != 'None':
        adata = adata[adata.obs['donor'] == f'donor_{donor}']
    else:
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword, so try the new spelling first.
        try:
            enc = OneHotEncoder(sparse_output=False)
        except TypeError:
            enc = OneHotEncoder(sparse=False)
        donor_column = adata.obs['donor'].to_numpy().reshape(-1, 1)
        enc.fit(donor_column)
        adata.obsm['donor'] = enc.transform(donor_column)

    # Subsetting yields a view; take an explicit copy because new ``obs``
    # columns are written below.
    adata = adata[adata.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS)].copy()

    # subsample to get statistics
    random_seed = split

    # Group cells by the junction sequence without its first and last residue.
    adata.obs['group_col'] = [seq[1:-1] for seq in adata.obs['IR_VDJ_1_junction_aa']]
    # Two-stage split: hold out the test part first, then carve the validation
    # part out of the remainder (presumably whole groups stay together --
    # see group_shuffle_split).
    train_val, test = group_shuffle_split(adata, group_col='group_col', val_split=0.20, random_seed=random_seed)
    train, val = group_shuffle_split(train_val, group_col='group_col', val_split=0.25, random_seed=random_seed)

    # Everything not assigned to val/test stays 'train'.
    adata.obs['set'] = 'train'
    adata.obs.loc[val.obs.index, 'set'] = 'val'
    adata.obs.loc[test.obs.index, 'set'] = 'test'
    return adata

In [5]:
# Evaluate the saved beta-chain MoE models on the 10x data: for every donor
# setting and split, record the weighted-average F1 of the kNN prediction of
# `binding_name` and the best NMI over a small sweep of clustering resolutions.
dataset = '10x'
donor = 1  # overwritten by the loop below before it is read
metadata = ['binding_name', 'clonotype', 'donor']  # not referenced in this cell

# Parallel lists; entry i of each list describes one result row.
splits, metrics, scores, donors = [], [], [], []

for donor in ['None', 1, 2, 3, 4]:  # 'None' = all donors pooled
    for split in tqdm(range(5)):
        data = load_10x_data(donor, split)
        model = load_model(data, dataset, split, donor)
        test_embedding_func = get_model_prediction_function(model)

        # Prediction score on the held-out set(s); 'val' could be added here.
        for source in ['test']:
            summary = run_imputation_evaluation(
                data, test_embedding_func,
                query_source=source, label_pred='binding_name',
            )
            result = summary['knn']['weighted avg']['f1-score']

            splits.append(split)
            metrics.append(f'Prediction {source}')
            scores.append(result)
            donors.append(donor)

        # Keep the best NMI across resolutions; -99 is a sentinel lower bound.
        best_nmi = -99
        for resolution in [0.01, 0.1, 1.0]:
            cluster_params = {'resolution': resolution, 'num_neighbors': 5}
            cluster_result = run_clustering_evaluation(
                data, test_embedding_func, 'train',
                name_label='binding_name', cluster_params=cluster_params,
            )
            best_nmi = max(cluster_result['NMI'], best_nmi)

        splits.append(split)
        metrics.append('NMI')
        scores.append(best_nmi)
        donors.append(donor)

n_rows = len(splits)
results_10x = pd.DataFrame({
    'model': ['moe_beta'] * n_rows,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': donors,
    'dataset': [dataset] * n_rows,
})
results_10x.to_csv('../results/performance_10x_beta.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [10:18<00:00, 123.77s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:29<00:00, 18.00s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:18<00:00, 39.79s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:26<00:00, 29.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:53<00:00, 10.61s/it]


In [6]:
# Mean score over the splits for every donor / metric / model combination.
results_10x.groupby(['donor', 'metric', 'model'])['score'].mean()

donor  metric           model   
1      NMI              moe_beta    0.592967
       Prediction test  moe_beta    0.636430
2      NMI              moe_beta    0.366572
       Prediction test  moe_beta    0.758192
3      NMI              moe_beta    0.004608
       Prediction test  moe_beta    0.890114
4      NMI              moe_beta    0.722175
       Prediction test  moe_beta    0.744325
None   NMI              moe_beta    0.501714
       Prediction test  moe_beta    0.695971
Name: score, dtype: float64

## Minervina Tests

In [7]:
def load_minervina_model(adata, dataset, split):
    """Load the saved beta-chain MoE model for one Minervina split.

    `dataset` is accepted but not used when building the checkpoint path.
    """
    checkpoint = f'saved_models/journal_2/minervina/beta/minervina_split_{split}_moe_beta.pt'
    return utils.load_model(adata, checkpoint)

In [8]:
from sklearn.preprocessing import OneHotEncoder
def load_minervina_data(split):
    """Load the annotated Minervina data and label cells as train/val/test.

    `split` doubles as the random seed of both group splits. The returned
    object gains the columns ``obs['group_col']`` and ``obs['set']``.
    """
    adata = utils.load_data('minervina/01_annotated_data.h5ad')

    # subsample to get statistics
    seed = split
    # Group key: junction sequence without its first and last residue.
    adata.obs['group_col'] = [junction[1:-1] for junction in adata.obs['IR_VDJ_1_junction_aa']]
    train_val, test = group_shuffle_split(adata, group_col='group_col', val_split=0.20, random_seed=seed)
    train, val = group_shuffle_split(train_val, group_col='group_col', val_split=0.25, random_seed=seed)

    # Start unlabeled, then stamp each subset with its name.
    adata.obs['set'] = None
    for label, subset in (('train', train), ('val', val), ('test', test)):
        adata.obs.loc[subset.obs.index, 'set'] = label
    return adata

In [9]:
# Evaluate the saved beta-chain MoE models on the Minervina data: for every
# split, record the weighted-average F1 of the kNN prediction of `epitope`
# and the best NMI over a small sweep of clustering resolutions.
dataset = 'minervina'
metadata = ['epitope']  # not referenced in this cell

# Parallel lists; entry i of each list describes one result row.
splits = []
metrics = []
scores = []

for split in tqdm(range(0, 5)):
    data = load_minervina_data(split)
    model = load_minervina_model(data, dataset, split)
    test_embedding_func = get_model_prediction_function(model)

    # Prediction score on the held-out set(s); 'val' could be added here.
    for source in ['test']:
        summary = run_imputation_evaluation(data, test_embedding_func, query_source=source,
                                            label_pred='epitope')
        result = summary['knn']['weighted avg']['f1-score']

        # No per-donor bookkeeping here: this cell has no donor loop, and the
        # 'donor' column of the result table is the constant '-' set below.
        splits.append(split)
        metrics.append(f'Prediction {source}')
        scores.append(result)

    # Keep the best NMI across resolutions; -99 is a sentinel lower bound.
    best_nmi = -99
    for resolution in [0.01, 0.1, 1.0]:
        cluster_result = run_clustering_evaluation(data, test_embedding_func, 'train', name_label='epitope',
                                                   cluster_params={'resolution': resolution, 'num_neighbors': 5})
        best_nmi = max(cluster_result['NMI'], best_nmi)
    splits.append(split)
    metrics.append('NMI')
    scores.append(best_nmi)

results_min = {
    'model': ['moe_beta'] * len(splits),
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': ['-'] * len(splits),
    'dataset': [dataset] * len(splits)
}
results_min = pd.DataFrame(results_min)
results_min.to_csv('../results/performance_minervina_beta.csv')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:41<00:00,  8.24s/it]


In [10]:
# Mean score over the splits for every metric (dataset, donor and model are constant here).
results_min.groupby(['metric', 'dataset', 'donor', 'model'])['score'].mean()

metric           dataset    donor  model   
NMI              minervina  -      moe_beta    0.442462
Prediction test  minervina  -      moe_beta    0.778035
Name: score, dtype: float64

## Preservation of Cell Type and Clonotype

In [11]:
def load_model(adata, dataset, split, model, donor=''):
    """Load a saved beta-chain model for the given dataset, split and model name.

    Checkpoints of the 'covid' dataset are stored under the 'Fischer' folder;
    every other dataset uses a folder named after the dataset. `donor` is
    accepted but not used.

    NOTE(review): this definition replaces the earlier `load_model` of this
    notebook, which takes different arguments -- re-running the 10x cell after
    this one would call the wrong function.
    """
    folder = 'Fischer' if dataset == 'covid' else dataset
    checkpoint = f'saved_models/journal_2/{folder}/beta/{dataset}_beta_split_{split}_{model}.pt'
    return utils.load_model(adata, checkpoint)

In [12]:
def load_data(split, dataset):
    """Load `dataset` and keep only the cells labeled train or val.

    `split` doubles as the random seed for subsampling and group splitting.
    'haniffa' and 'borcherding' are first subsampled in place to 20000 cells
    and then split once; any other dataset is split twice, and the part set
    aside by the first split is dropped from the result.
    """
    adata = utils.load_data(dataset)
    # subsample to get statistics
    seed = split

    # Column holding the clonotype-defining sequence; it differs per dataset.
    clone_col = {
        'haniffa': 'cdr3_beta',
        'borcherding': 'IR_VDJ_1_junction_aa',
    }.get(dataset, 'TRB_1_cdr3')

    if dataset in ('haniffa', 'borcherding'):
        sc.pp.subsample(adata, n_obs=20000, random_state=seed)
        _, val = group_shuffle_split(adata, group_col=clone_col, val_split=0.25, random_seed=seed)
        adata.obs['set'] = 'train'
    else:
        sub, non_sub = group_shuffle_split(adata, group_col=clone_col, val_split=0.2, random_seed=seed)
        _, val = group_shuffle_split(sub, group_col=clone_col, val_split=0.20, random_seed=seed)
        adata.obs['set'] = 'train'
        # Cells set aside by the first split are marked '-' and filtered out below.
        adata.obs.loc[non_sub.obs.index, 'set'] = '-'

    adata.obs.loc[val.obs.index, 'set'] = 'val'
    return adata[adata.obs['set'].isin(['train', 'val'])]

In [13]:
# Label passed as `name_label` to the cell-type clustering evaluation for each
# dataset (presumably the obs column holding the cell-type annotation).
dataset_2_celltype = dict(
    haniffa='full_clustering',
    covid='cell_type',
    borcherding='functional.cluster',
)

In [14]:
# For every dataset and split, measure how well clusters of the model
# embedding agree with (a) the cell-type label and (b) the clonotype, keeping
# the best NMI over a sweep of clustering resolutions for each of the two.
# Parallel lists; entry i of each list describes one result row.
model_names, splits, metrics, scores, datasets = [], [], [], [], []

for dataset in ['borcherding', 'covid', 'haniffa']:
    for split in tqdm(range(5)):
        data = load_data(split, dataset)
        for model_name in ['moe']:
            model = load_model(data, dataset, split, model_name)
            test_embedding_func = get_model_prediction_function(model)

            # (metric name, label to compare clusters against, resolutions to try)
            nmi_tasks = [
                ('NMI_cell_type', dataset_2_celltype[dataset], [0.01, 0.1, 1.0, 10]),
                ('NMI_clonotype', 'clonotype', [0.01, 0.1, 1.0, 10, 25]),
            ]
            for metric_name, label, resolutions in nmi_tasks:
                best_nmi = -99  # sentinel lower bound
                for resolution in resolutions:
                    cluster_params = {'resolution': resolution, 'num_neighbors': 5}
                    cluster_result = run_clustering_evaluation(
                        data, test_embedding_func, 'train',
                        name_label=label, cluster_params=cluster_params,
                    )
                    best_nmi = max(cluster_result['NMI'], best_nmi)

                model_names.append(model_name)
                splits.append(split)
                metrics.append(metric_name)
                scores.append(best_nmi)
                datasets.append(dataset)

            # Not read anywhere in this cell; kept in case a later cell uses it.
            embedding = model


results_perservance = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'dataset': datasets,
})
results_perservance.to_csv('../results/performance_perservance_beta.csv')
results_perservance

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:04<00:00, 72.95s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:27<00:00, 17.54s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:17<00:00, 87.46s/it]


Unnamed: 0,model,split,metric,score,dataset
0,moe,0,NMI_cell_type,0.157455,borcherding
1,moe,0,NMI_clonotype,0.748353,borcherding
2,moe,1,NMI_cell_type,0.147046,borcherding
3,moe,1,NMI_clonotype,0.755544,borcherding
4,moe,2,NMI_cell_type,0.147931,borcherding
5,moe,2,NMI_clonotype,0.767079,borcherding
6,moe,3,NMI_cell_type,0.158303,borcherding
7,moe,3,NMI_clonotype,0.752436,borcherding
8,moe,4,NMI_cell_type,0.148364,borcherding
9,moe,4,NMI_clonotype,0.754446,borcherding


In [15]:
# Mean score over the splits for every dataset / metric combination.
results_perservance.groupby(['dataset', 'metric'])['score'].mean()

dataset      metric       
borcherding  NMI_cell_type    0.151820
             NMI_clonotype    0.755572
covid        NMI_cell_type    0.405606
             NMI_clonotype    0.811042
haniffa      NMI_cell_type    0.353981
             NMI_clonotype    0.772651
Name: score, dtype: float64

## Write Supplementary Material S1

In [16]:
from pathlib import Path

path_out = '../results/supplement/S1_benchmarking.xlsx'

# Add the three result tables as sheets of the supplementary workbook.
# - If the workbook exists, append to it so that sheets already in the file
#   are kept, and replace same-named sheets so this cell can be re-run.
# - If it does not exist yet, create it (append mode would raise).
# `if_sheet_exists` is only accepted in append mode, hence the two variants.
if Path(path_out).exists():
    writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    Path(path_out).parent.mkdir(parents=True, exist_ok=True)
    writer_kwargs = {'mode': 'w'}

# Append mode is only supported by the openpyxl engine, so request it explicitly.
with pd.ExcelWriter(path_out, engine='openpyxl', **writer_kwargs) as writer:
    results_10x.to_excel(writer, sheet_name='Specificity_beta_10x')
    results_min.to_excel(writer, sheet_name='Minervina_beta_10x')
    results_perservance.to_excel(writer, sheet_name='CellCharacteristics_beta')