In [1]:
import pandas as pd
from tqdm import tqdm
import scanpy as sc
import os
import shutil

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('../mvTCR/')
import tcr_embedding.utils_training as utils
import config.constants_10x as const
from sklearn.metrics import f1_score, classification_report

from tcr_embedding.utils_preprocessing import stratified_group_shuffle_split, group_shuffle_split
from tcr_embedding.evaluation.Imputation import run_imputation_evaluation
from tcr_embedding.evaluation.Clustering import run_clustering_evaluation
from tcr_embedding.evaluation.kNN import run_knn_within_set_evaluation
from tcr_embedding.evaluation.WrapperFunctions import get_model_prediction_function

  from .autonotebook import tqdm as notebook_tqdm


# 10x

In [4]:
def load_model(adata, dataset, split, model, donor):
    """Load the stored 10x checkpoint for one donor / split / model type.

    Parameters
    ----------
    adata
        Data object forwarded unchanged to ``utils.load_model``.
    dataset
        Not used in the body; the checkpoint path is hard-wired to the
        10x directory.
    split
        Split identifier inserted into the checkpoint file name.
    model
        Model-type name; used both as sub-directory and as file-name suffix.
    donor
        Donor identifier inserted into the checkpoint file name.

    Returns
    -------
    The object returned by ``utils.load_model`` for the resolved path.
    """
    checkpoint_path = f'../mvTCR/saved_models/journal_2/10x/splits/{model}/10x_donor_{donor}_split_{split}_{model}.pt'
    return utils.load_model(adata, checkpoint_path)

In [5]:
def load_10x_data(donor, split, size=False):
    """Load the 10x data, filter it, and annotate every cell with its split.

    Parameters
    ----------
    donor
        Donor identifier (e.g. ``'1'``). The string ``'None'`` disables the
        donor filter so that all donors are kept.
    split
        Split index; it is also used as the random seed of both
        ``group_shuffle_split`` calls.
    size
        Not used in this function; kept so the signature stays unchanged.

    Returns
    -------
    The filtered data object with two added ``obs`` columns:
    ``'binding_label'`` (``'binding_name'`` mapped through
    ``adata.uns['specificity_to_label']`` and cast to int) and ``'set'``
    (one of ``'train'``, ``'val'``, ``'test'``).
    """
    adata = utils.load_data('10x')
    # Compare by value. The former `is not 'None'` tested object identity,
    # which only works when the interpreter happens to intern both strings
    # and raises a SyntaxWarning on Python >= 3.8.
    if donor != 'None':
        adata = adata[adata.obs['donor'] == f'donor_{donor}']
    adata = adata[adata.obs['binding_name'].isin(const.HIGH_COUNT_ANTIGENS)]

    # Split by clonotype group: first carve out the test set, then split the
    # remainder again to obtain the validation set.
    random_seed = split
    train_val, test = group_shuffle_split(adata, group_col='clonotype', val_split=0.20, random_seed=random_seed)
    # The train part of the second split is not needed explicitly because
    # 'train' is the default value of the 'set' column below.
    _, val = group_shuffle_split(train_val, group_col='clonotype', val_split=0.25, random_seed=random_seed)

    adata.obs['binding_label'] = adata.obs['binding_name'].map(adata.uns['specificity_to_label']).astype(int)
    adata.obs['set'] = 'train'
    adata.obs.loc[val.obs.index, 'set'] = 'val'
    adata.obs.loc[test.obs.index, 'set'] = 'test'

    return adata

## 10x Specificity

In [6]:
dataset = '10x'

# One tuple per logged score: (model name, split, metric, score, donor).
records = []

for donor in tqdm(['1', '2', '3', '4', 'None'], 'donor'):
    for split in tqdm(range(5), 'split'):
        data = load_10x_data(donor, split)
        model = load_model(data, '10x', split, 'supervised', donor)

        # NOTE(review): the joint model is moved to 'cuda' while the
        # supervised head is moved to model.device - confirm both always
        # refer to the same device.
        model.model = model.model.to('cuda')
        model.supervised_model = model.supervised_model.to(model.device)
        model.model.eval()
        model.supervised_model.eval()

        for source in ['test']:
            metric_name = f'Prediction {source}'
            data_sub = data[data.obs['set'] == source]

            # Direct prediction of the supervised head: weighted F1.
            y_true = data_sub.obs['binding_name'].map(data.uns['specificity_to_label'])
            y_pred = model.predict_label(data_sub, use_mean=True)
            sv_score = classification_report(
                y_true, y_pred.argmax(1).detach().cpu(), output_dict=True
            )['weighted avg']['f1-score']
            records.append(('mvTCR supervised', split, metric_name, sv_score, donor))

            # kNN prediction on the embedding: weighted F1.
            test_embedding_func = get_model_prediction_function(model)
            summary = run_imputation_evaluation(
                data, test_embedding_func, query_source=source, label_pred='binding_name'
            )
            knn_score = summary['knn']['weighted avg']['f1-score']
            records.append(('mvTCR supervised + kNN', split, metric_name, knn_score, donor))

            # Clustering quality: keep the best NMI over the tested resolutions.
            best_nmi = -99
            for resolution in [0.01, 0.1, 1.0]:
                cluster_params = {'resolution': resolution, 'num_neighbors': 5}
                cluster_result = run_clustering_evaluation(
                    data, test_embedding_func, 'train',
                    name_label='binding_name', cluster_params=cluster_params
                )
                best_nmi = max(cluster_result['NMI'], best_nmi)
            records.append(('mvTCR supervised', split, 'NMI', best_nmi, donor))

# Unpack the row tuples into the per-column lists used for the result table.
model_names, splits, metrics, scores, donors = (
    [record[field] for record in records] for field in range(5)
)

results_10x = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': donors,
    'dataset': [dataset] * len(splits)
})
results_10x.to_csv(f'../results/performance_10x_supervised.csv')

donor:   0%|                                                                                                                                                                                                                                                      | 0/5 [00:00<?, ?it/s]
split:   0%|                                                                                                                                                                                                                                                      | 0/5 [00:00<?, ?it/s][A
split:  20%|███████████████████████████████████████████████▌                                                                                                                                                                                              | 1/5 [01:22<05:28, 82.01s/it][A
split:  40%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                           

In [7]:
# Average the numeric columns per model / donor / metric. numeric_only=True
# keeps the string column 'dataset' out of the mean, which would otherwise
# raise a TypeError on pandas >= 2.0.
results_10x.groupby(['model', 'donor', 'metric']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,split,score
model,donor,metric,Unnamed: 3_level_1,Unnamed: 4_level_1
mvTCR supervised,1.0,NMI,2,0.629854
mvTCR supervised,1.0,Prediction test,2,0.433624
mvTCR supervised,2.0,NMI,2,0.541553
mvTCR supervised,2.0,Prediction test,2,0.619305
mvTCR supervised,3.0,NMI,2,0.052228
mvTCR supervised,3.0,Prediction test,2,0.502285
mvTCR supervised,4.0,NMI,2,0.633032
mvTCR supervised,4.0,Prediction test,2,0.281554
mvTCR supervised,,NMI,2,0.609132
mvTCR supervised,,Prediction test,2,0.511023


# Minervina Tests

In [8]:
def load_minervina_model(adata, dataset, split, model):
    """Load the stored Minervina checkpoint for one split / model type.

    Parameters
    ----------
    adata
        Data object forwarded unchanged to ``utils.load_model``.
    dataset
        Not used in the body; the checkpoint path is hard-wired to the
        minervina directory.
    split
        Split identifier inserted into the checkpoint file name.
    model
        Model-type name; used both as sub-directory and as file-name suffix.

    Returns
    -------
    The object returned by ``utils.load_model`` for the resolved path.
    """
    checkpoint_path = f'../mvTCR/saved_models/journal_2/minervina/splits/{model}/minervina_donor_split_{split}_{model}.pt'
    return utils.load_model(adata, checkpoint_path)

In [9]:
from sklearn.preprocessing import OneHotEncoder

def load_minervina_data(split, size=False):
    """Load the annotated Minervina data and tag every cell with its split.

    Parameters
    ----------
    split
        Split index; also used as the random seed of both
        ``group_shuffle_split`` calls.
    size
        When truthy, ``sc.pp.subsample`` is called on the training part with
        ``n_obs=size`` before the split labels are written.

    Returns
    -------
    The data object with two added ``obs`` columns: ``'epitope_label'``
    (integer codes from factorizing ``'epitope'``) and ``'set'``
    (``'train'``, ``'val'``, ``'test'``, or ``None`` for cells in no part).
    """
    adata = utils.load_data('minervina/01_annotated_data.h5ad')
    epitope_codes = adata.obs['epitope'].factorize()[0]
    adata.obs['epitope_label'] = epitope_codes

    # Split by clonotype group: test set first, then validation from the rest.
    seed = split
    train_val, test = group_shuffle_split(adata, group_col='clonotype', val_split=0.20, random_seed=seed)
    train, val = group_shuffle_split(train_val, group_col='clonotype', val_split=0.25, random_seed=seed)

    if size:
        sc.pp.subsample(train, n_obs=size)

    # Start unlabeled, then write the labels in train -> val -> test order.
    adata.obs['set'] = None
    for set_name, subset in (('train', train), ('val', val), ('test', test)):
        adata.obs.loc[subset.obs.index, 'set'] = set_name

    return adata

In [10]:
dataset = 'minervina'

# One tuple per logged score: (model name, split, metric, score, donor).
records = []

for donor in tqdm(['None'], 'donor'):
    for split in tqdm(range(5), 'split'):
        data = load_minervina_data(split)
        model = load_minervina_model(data, 'minervina', split, 'supervised')

        # NOTE(review): the joint model is moved to 'cuda' while the
        # supervised head is moved to model.device - confirm both always
        # refer to the same device.
        model.model = model.model.to('cuda')
        model.supervised_model = model.supervised_model.to(model.device)
        model.model.eval()
        model.supervised_model.eval()

        for source in ['test']:
            metric_name = f'Prediction {source}'
            data_sub = data[data.obs['set'] == source]

            # Direct prediction of the supervised head: weighted F1.
            y_true = data_sub.obs['epitope_label']
            y_pred = model.predict_label(data_sub, use_mean=True)
            sv_score = classification_report(
                y_true, y_pred.argmax(1).detach().cpu(), output_dict=True
            )['weighted avg']['f1-score']
            records.append(('mvTCR supervised', split, metric_name, sv_score, donor))

            # kNN prediction on the embedding: weighted F1.
            test_embedding_func = get_model_prediction_function(model)
            summary = run_imputation_evaluation(
                data, test_embedding_func, query_source=source, label_pred='epitope_label'
            )
            knn_score = summary['knn']['weighted avg']['f1-score']
            records.append(('mvTCR supervised + kNN', split, metric_name, knn_score, donor))

            # Clustering quality: keep the best NMI over the tested resolutions.
            best_nmi = -99
            for resolution in [0.01, 0.1, 1.0]:
                cluster_params = {'resolution': resolution, 'num_neighbors': 5}
                cluster_result = run_clustering_evaluation(
                    data, test_embedding_func, 'train',
                    name_label='epitope_label', cluster_params=cluster_params
                )
                best_nmi = max(cluster_result['NMI'], best_nmi)
            records.append(('mvTCR supervised', split, 'NMI', best_nmi, donor))

# Unpack the row tuples into the per-column lists used for the result table.
model_names, splits, metrics, scores, donors = (
    [record[field] for record in records] for field in range(5)
)

results_min = pd.DataFrame({
    'model': model_names,
    'split': splits,
    'metric': metrics,
    'score': scores,
    'donor': donors,
    'dataset': [dataset] * len(splits)
})
results_min.to_csv(f'../results/performance_minervina_supervised.csv')

donor:   0%|                                                                                                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]
split:   0%|                                                                                                                                                                                                                                                      | 0/5 [00:00<?, ?it/s][A
split:  20%|███████████████████████████████████████████████▌                                                                                                                                                                                              | 1/5 [00:08<00:34,  8.52s/it][A
split:  40%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                           

In [11]:
# Average the numeric columns per model / metric. numeric_only=True keeps the
# string columns 'donor' and 'dataset' out of the mean, which would otherwise
# raise a TypeError on pandas >= 2.0.
results_min.groupby(['model', 'metric']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,split,score
model,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
mvTCR supervised,NMI,2,0.720579
mvTCR supervised,Prediction test,2,0.782709
mvTCR supervised + kNN,Prediction test,2,0.8294
