# How to add a new test-case to this benchmark.

Adding a new test is a bit more complicated, as the implementation depends on the metrics chosen by the developer. Therefore, we showcase here the construction of the viral benchmark, which developers can then adapt to their own datasets.

## The AbstractTest-class
The custom test class below builds on the AbstractTest class. Feel free to override any element that does not suit your purpose, but generally you can simply build a test that inherits from this one.

In [1]:
import os
import abc
import warnings
import pandas as pd
import numpy as np

import tcr_benchmark.utils.config as config
from tcr_benchmark.pp.datasets import get_dataset


class AbstractTest(abc.ABC):
    """
    Base class of a benchmark test case.

    Subclasses implement `run_prediction` and register their evaluation functions in `self.test_settings`
    as {metric type name: function}. Each function receives the prediction DataFrame and returns a
    DataFrame providing the columns "Group", "Support", "Metric", "Value", and "Dataset", which are
    the columns `run_tests` reads when aggregating and saving the results.
    """

    def __init__(self, name, path_out):
        """
        Abstract test class for automatic testing.
        :param name: str, name of the test
        :param path_out: str, folder where results will be stored. If None, `config.path_results` is used
        """
        self.ds_name = name  # Name of the dataset (e.g. viral)

        # Load the base data for this test (see also tcr_benchmark.pp.datasets)
        self.df_base_data = get_dataset(self.ds_name)

        # Fall back to the configured results folder if no output path is provided
        self.path_out = f"{config.path_results}" if path_out is None else path_out
        os.makedirs(self.path_out, exist_ok=True)

        # {metric type name: test function}, to be filled by the subclass
        self.test_settings = {}

    def run_tests(self, predictor, name=None, config_predictor=None):
        """
        Runs the prediction once, evaluates all functions registered in `self.test_settings`, and saves the output.
        :param predictor: forwarded to `run_prediction`
        :param name: str, name of the evaluated method; used in the output file names and the "Method" column
        :param config_predictor: forwarded to `run_prediction`
        :return: pd.DataFrame with the columns Method, Dataset, Group, Support, Metric_Type, Metric, Value
        :raises ValueError: if no test function is registered in `self.test_settings`
        """
        # Run the prediction based on the function you will provide (e.g. forming TCR-epitope pairs)
        # and save the raw output, including rows with NaN scores
        prediction = self.run_prediction(predictor, config_predictor)
        prediction.to_csv(f"{self.path_out}/predictions_{name}_{self.ds_name}.csv")

        # Warn if there are NaNs in the prediction. This occurs when the predictor cannot handle the input
        # correctly, e.g. too long sequences, missing information, unknown categories
        mask_nans = prediction["Score"].isna()
        n_nans = int(mask_nans.sum())
        if n_nans > 0:
            # stacklevel=2 attributes the warning to the caller of run_tests
            warnings.warn(f"Filtering {n_nans} elements for {name} due to NaN prediction.", stacklevel=2)
            prediction = prediction[~mask_nans]

        # All tests specified in self.test_settings are run on the same prediction
        if not self.test_settings:
            raise ValueError("No tests registered in self.test_settings, nothing to evaluate.")
        results = []
        for metric_type, test_func in self.test_settings.items():
            df_tmp = test_func(prediction)
            df_tmp["Metric_Type"] = metric_type
            results.append(df_tmp)
        results = pd.concat(results)

        # Append the average and weighted average (by test support) of the metrics
        results = pd.concat([results, self._compute_averages(results)])

        # Last formatting and saving the output
        results = results.reset_index(drop=True)
        results["Method"] = name
        results = results[["Method", "Dataset", "Group", "Support", "Metric_Type", "Metric", "Value"]]
        results.to_csv(f"{self.path_out}/results_{name}_{self.ds_name}.csv")
        return results

    @staticmethod
    def _compute_averages(results):
        """
        Computes the plain and the support-weighted average over all groups per metric and metric type.
        :param results: pd.DataFrame with the columns Group, Support, Metric, Value, Metric_Type
        :return: pd.DataFrame with one "Average" and one "WeightedAverage" row per (Metric, Metric_Type)
        """
        columns = ["Group", "Support", "Metric", "Value", "Dataset", "Metric_Type"]
        # Rows of the group "full_data" are excluded from the averages
        is_group_row = results["Group"] != "full_data"

        rows = []
        for _, key in results[["Metric", "Metric_Type"]].drop_duplicates().iterrows():
            metric = key["Metric"]
            metric_type = key["Metric_Type"]
            # Restrict to the metric AND its metric type, so that a metric name shared by several
            # test settings is not averaged across them
            df_tmp = results[(results["Metric"] == metric)
                             & (results["Metric_Type"] == metric_type)
                             & is_group_row]
            average = df_tmp["Value"].mean()
            weighted = (df_tmp["Value"] * df_tmp["Support"] / df_tmp["Support"].sum()).sum()
            for group, value in (("Average", average), ("WeightedAverage", weighted)):
                rows.append({
                    "Group": group,
                    "Support": len(df_tmp),  # number of groups that entered the average
                    "Metric": metric,
                    "Value": value,
                    "Dataset": "All",
                    "Metric_Type": metric_type,
                })
        return pd.DataFrame(rows, columns=columns)

    @abc.abstractmethod
    def run_prediction(self, prediction_func, config_predictor):
        """
        To reduce computational load, predictions are conducted once in this function covering all test settings.
        :param config_predictor: configuration handed over from `run_tests`
        :param prediction_func: function, that receives a pd.DataFrame, and returns the dataframe with a binding score
        :return: pd.DataFrame, containing TCR-epitope pairs, binding label, and prediction score (column "Score")
        """
        # This function, you will need to implement. This mainly takes care of preprocessing the data into the
        # right format, forming all required TCR-Epitope pairs, and running the prediction
        raise NotImplementedError

    def save_results(self, results):
        """
        Stores the results to disk.
        :param results: dict {name, results as pd.DataFrame} containing the results of the individual tests
        :return: writes results to "{path_out}/{dataset_name}_{test_name}.csv"
        """
        for test_name, result in results.items():
            path_res = f"{self.path_out}/{self.ds_name}_{test_name}.csv"
            result.to_csv(path_res)

## The actual test
The viral test case. This is just an example; you will need to adapt it for your own custom test. Note that, depending on your test scenario, you may want to define negatives differently (in `run_prediction`) or use different metrics.

In [2]:
import warnings
import pandas as pd
import numpy as np
from tcr_benchmark.eval.abstractTests import AbstractTest
import tcr_benchmark.eval.metrics as metrics

class ViralTest(AbstractTest):
    """
    Example test case on the "viral" dataset with two test settings:
    multiple peptide selection ("MPS") and TCR-peptide pairing ("TTP").
    """

    def __init__(self, path_out):
        """
        :param path_out: Path to the output folder in which the results will be stored. If None, the parent class chooses the path.
        """
        super().__init__("viral", path_out)  # Choose a name for your benchmark here

        # Specify the different test functions to run and a name for them
        self.test_settings = {
            # In this test setting, you have 1 TCR and X options of epitopes from which you want to choose the
            # highest one. Exactly one of the epitopes should be annotated as binding (Label = 1)
            "MPS": self.run_multiple_peptide_selection_test,
            # In this test setting, you have pairs of TCRs and epitopes annotated as binders (Label=1) or
            # non-binders (Label=0). Note, in order to calculate classification metrics such as AUC you will
            # need at least one positive and one negative pair
            "TTP": self.run_tcr_peptide_pairing_test,
        }

        # Not used by the methods of this class
        self.test_data = None

    def run_prediction(self, predictor, config_predictor):
        """
        This function performs the prediction for the whole data. It is useful to not run the prediction separately for each test to save computational resources
        :param predictor: a function that follows the interface of these test (see tutorials/01_ne_method.ipynb)
        :param config_predictor: dict of kwargs passed to the predictor (e.g. to select different model choices), None is treated as no kwargs
        :return: the output of the predictor for all TCR x (Epitope, MHC) combinations, including the column "Label"
        """
        # run_tests defaults config_predictor to None, which cannot be unpacked with **
        config_predictor = {} if config_predictor is None else config_predictor

        # For MPS we want to predict between every combination of TCR and epitopes. We therefore construct a
        # dataframe with all combinations. The input data are TCR-Epitope pairs that bind
        epitope_mhcs = self.df_base_data[["Epitope", "MHC"]].drop_duplicates().values  # Unique MHC-Epitope combinations
        # For all TCRs of the dataframe, add each specific MHC-Epitope
        data_full = pd.concat([
            self.df_base_data.assign(Epitope=epitope, MHC=mhc)
            for epitope, mhc in epitope_mhcs
        ])
        # Will have Label == both, if the columns of the new data also occur in the positive binding dataframe
        data_full = pd.merge(data_full, self.df_base_data, how="left", indicator="Label")
        # If this is the case, the label will be 1 (== binders) otherwise 0 (== non-binders)
        data_full["Label"] = np.where(data_full.Label == "both", 1, 0)

        # We now use the predictor to provide a "Score" for all these combinations
        prediction = predictor(data_full, **config_predictor)
        return prediction

    def run_multiple_peptide_selection_test(self, prediction):
        """
        Calculates rank metrics per TCR over all Epitope-MHC options.
        :param prediction: pd.DataFrame in long format (TCR-epitope pairs) with the columns "Score" and "Label"
        :return: output of `metrics.calculated_rank_metrics` with the additional column "Dataset"
        """
        tcr_columns = ["CDR3_alpha", "V_alpha", "J_alpha", "CDR3_beta", "V_beta", "J_beta"]

        # Warning due to NaN predictions. This could occur if information required for the predictors are
        # missing, have wrong sequence length, or are not allowed.
        has_score = prediction["Score"].notna()
        if not has_score.all():
            warnings.warn(f"Filter out {np.sum(prediction['Score'].isna())} Predictions due to NaN values. "
                          f"Metrics invalid")
            # Keep only the rows that do have a score
            prediction = prediction[has_score]

        # Currently, the prediction results are in long-format (TCR-epitope pairs), but the metrics work on
        # broad format (TCR - epitope1, epitope2, ..., epitopeN).
        # So let's pivot the table, and create labels, which of these predictions is correct.
        # assign() returns a new frame, so the DataFrame shared with the other test settings is not modified
        prediction = prediction.assign(Epitope_MHC=prediction["Epitope"] + "_" + prediction["MHC"])
        prediction = prediction.drop(columns=["Epitope", "MHC"])
        labels = prediction.pivot_table(index=tcr_columns, columns=["Epitope_MHC"], values="Label")
        prediction = prediction.pivot_table(index=tcr_columns, columns=["Epitope_MHC"], values="Score")

        epitopes = prediction.columns
        labels = labels[epitopes]  # Same column order as the scores
        # Per TCR, the name of the Epitope-MHC with Label 1 (the name is repeated "Label" times, i.e. dropped for 0)
        labels = labels.apply(lambda x: "".join([x[el] * el for el in epitopes]), axis=1)

        # Rank metrics are based on the ordering of prediction scores, the list indicates what Ks to choose for R@K
        scores = metrics.calculated_rank_metrics(labels, prediction, labels, [1, 3, 5, 8])
        scores["Dataset"] = "Viral"  # Providing a name makes it easier to track from which dataset the csv file originates
        return scores

    def run_tcr_peptide_pairing_test(self, prediction):
        """
        Calculates classification metrics on the TCR-epitope pairs, grouped by epitope.
        :param prediction: pd.DataFrame with the columns "Label", "Score", and "Epitope"
        :return: concatenated outputs of the score and classification metrics with the additional column "Dataset"
        """
        # This is a bit more straight forward. Here, we calculate classical classification metrics on the
        # dataset. We just need Label and Score, and the name of the epitope to form groups.
        scores = metrics.calculate_score_metrics(prediction["Label"], prediction["Score"],
                                                 prediction["Epitope"])  # AUC and APS
        scores_class = metrics.calculate_classification_metrics(prediction["Label"], prediction["Score"],
                                                                prediction["Epitope"])  # F1-Score, Accuracy, Precision, Recall
        scores = pd.concat([scores, scores_class])
        scores["Dataset"] = "Viral"
        return scores

## 03. Running a test
You can now run the test on individual or all predictors.

In [3]:
import numpy as np

def prediction_func(df_input, **kwargs):
    """
    Dummy predictor assigning a reproducible random "Score" to every row (see notebook 01_new_method).
    :param df_input: pd.DataFrame with one row per TCR-epitope pair, it is left unchanged
    :param kwargs: accepted for interface compatibility, ignored here
    :return: copy of df_input with the column "Score"
    """
    # Fixed seed: every call yields the same scores for the same number of rows
    np.random.seed(0)
    random_scores = np.random.rand(len(df_input))
    return df_input.assign(Score=random_scores)

In [4]:
# No output folder given (None), so the test class chooses the results folder itself
test = ViralTest(path_out=None)
# Evaluate the dummy predictor without extra predictor settings; the name ends up in the output file names
results_dataset = test.run_tests(
    prediction_func,
    name='dummy_predictor',
    config_predictor={},
)

To conduct the test on all ePytope-TCR predictors, you could loop over all methods, or use the provided configs to adapt `tcr_benchmark.study.benchmark_alternatives`.

In [5]:
from epytope.TCRSpecificityPrediction import TCRSpecificityPredictorFactory
# Print every method name returned by the factory, followed by its comma-joined version entries
for name, version in TCRSpecificityPredictorFactory.available_methods().items():
    joined_versions = ",".join(version)
    print(name, joined_versions)

imrex 
titan 1.0.0
tcellmatch 
stapler 
ergo-ii 
pmtnet 
epitcr 
atm-tcr 
attntap 
teim 
bertrand 
ergo-i 
teinet 
panpep 
dlptcr 
tulip-tcr 
itcep 
nettcr 2.2
mixtcrpred 
tcrgp 
