# P-TEAM for novel mutations
In this tutorial, we will demonstrate how to use P-TEAM, trained on a repertoire of TCRs, to predict the effect of all epitope mutations for a novel TCR.

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
import sys
sys.path.append('../activation-prediction/')
from preprocessing import (add_activation_thresholds, full_aa_features,
                           get_aa_features, get_complete_dataset)

## Load the data
How the data is loaded will highly depend on your input data format. Here, we use the preprocessing functions that we used on our data. However, we will show how to format a minimal input required for training and prediction. Most steps are already explained in `01_within_tcr.ipynb`, so we will skip their explanations here.

Contrary to before, we select 4 TCRs (3 for training, 1 for prediction) and additionally provide the CDR3-alpha and -beta sequences (`cdr3a_aligned`, `cdr3b_aligned`). These sequences were aligned to have the same length across all TCRs via the MUSCLE algorithm, using `-` as the padding symbol.

In [3]:
epitope = 'SIINFEKL'
base_activation = 83.52

# Load the dataset for the base epitope through the project's preprocessing helpers.
df_data = add_activation_thresholds(get_complete_dataset(epitope), epitope=epitope)

# Restrict the data to 4 TCRs and a single experimental setting
# (one normalization, one threshold), then renumber the rows.
selected_tcrs = ['OTI', 'B11', 'B13', 'ED8']
keep_rows = (
    df_data['tcr'].isin(selected_tcrs)
    & (df_data['normalization'] == 'AS')
    & (df_data['threshold'] == '46.9')
)
df_data = df_data[keep_rows].reset_index(drop=True)

# The base epitope itself must not be contained in the dataframe.
df_data = df_data[df_data['epitope'] != epitope]

# Keep only the columns that form the minimal input for training and prediction.
minimal_columns = [
    'epitope',
    'mut_ami',
    'mut_pos',
    'orig_ami',
    'tcr',
    'cdr3a_aligned',
    'cdr3b_aligned',
    'activation',
    'residual',
    'is_activated',
]
df_data = df_data[minimal_columns]
df_data.head(5)

Unnamed: 0,epitope,mut_ami,mut_pos,orig_ami,tcr,cdr3a_aligned,cdr3b_aligned,activation,residual,is_activated
0,AIINFEKL,A,0,S,ED8,CAVSP--GSGGKLTL-,CASS----PRASNYTF,101.391637,22.261637,True
1,CIINFEKL,C,0,S,ED8,CAVSP--GSGGKLTL-,CASS----PRASNYTF,81.356164,2.226164,True
2,DIINFEKL,D,0,S,ED8,CAVSP--GSGGKLTL-,CASS----PRASNYTF,88.844168,9.714168,True
3,EIINFEKL,E,0,S,ED8,CAVSP--GSGGKLTL-,CASS----PRASNYTF,101.594015,22.464015,True
4,FIINFEKL,F,0,S,ED8,CAVSP--GSGGKLTL-,CASS----PRASNYTF,98.659526,19.529526,True


Finally, we select `B13` (chosen arbitrarily) as the novel TCR for prediction while training on the remaining TCRs.

In [4]:
# Seed NumPy's global random number generator.
np.random.seed(42)

# Every row that does not belong to B13 is used for training;
# the B13 rows are held out for prediction.
mask_train = df_data['tcr'].ne('B13')
df_data_train = df_data.loc[mask_train].copy()
df_data_pred = df_data.loc[~mask_train].copy()

print('Training data: ', len(df_data_train))
print('Prediction data: ', len(df_data_pred))

Training data:  456
Prediction data:  152


## Calculate the features
If you provide the `df_data_train` and `df_data_pred` as described above, you can proceed with the remaining tutorial without adjustments.

Note that, contrary to before, we also encode the TCR sequences as model input.

In [5]:
# Amino-acid descriptors: only the 'factors' group is used.
aa_features = get_aa_features()[['factors']]

# Both splits are encoded with identical settings; include_tcr=True adds the
# TCR sequences to the model input.
feature_kwargs = dict(include_tcr=True, base_peptide=epitope)
df_features_train = full_aa_features(df_data_train, aa_features, **feature_kwargs)
df_features_pred = full_aa_features(df_data_pred, aa_features, **feature_kwargs)
df_features_pred.head()

Unnamed: 0,mut_pos,mut_ami$factors$factor0,mut_ami$factors$factor1,mut_ami$factors$factor2,mut_ami$factors$factor3,mut_ami$factors$factor4,orig_ami$factors$factor0,orig_ami$factors$factor1,orig_ami$factors$factor2,orig_ami$factors$factor3,...,cdr3b_14$factors$factor0,cdr3b_14$factors$factor1,cdr3b_14$factors$factor2,cdr3b_14$factors$factor3,cdr3b_14$factors$factor4,cdr3b_15$factors$factor0,cdr3b_15$factors$factor1,cdr3b_15$factors$factor2,cdr3b_15$factors$factor3,cdr3b_15$factors$factor4
306,0.0,-0.591,-1.302,-0.733,1.57,-0.146,-0.228,1.399,-4.76,0.67,...,0.26,0.83,3.097,-0.838,1.512,-1.006,-0.59,1.891,-0.397,0.412
307,0.0,-1.343,0.465,-0.862,-1.02,-0.255,-0.228,1.399,-4.76,0.67,...,0.26,0.83,3.097,-0.838,1.512,-1.006,-0.59,1.891,-0.397,0.412
308,0.0,1.05,0.302,-3.656,-0.259,-3.242,-0.228,1.399,-4.76,0.67,...,0.26,0.83,3.097,-0.838,1.512,-1.006,-0.59,1.891,-0.397,0.412
309,0.0,1.357,-1.453,1.477,0.113,-0.837,-0.228,1.399,-4.76,0.67,...,0.26,0.83,3.097,-0.838,1.512,-1.006,-0.59,1.891,-0.397,0.412
310,0.0,-1.006,-0.59,1.891,-0.397,0.412,-0.228,1.399,-4.76,0.67,...,0.26,0.83,3.097,-0.838,1.512,-1.006,-0.59,1.891,-0.397,0.412


## Prediction
We perform prediction for all mutations of the novel TCR. As we trained on 3 TCRs only, the results might not be optimal.
### Classification

In [6]:
def predict_classification_score(x_train, y_train, x_pred, y_pred):
    """Score the samples in ``x_pred`` with a random-forest activation classifier.

    A ``RandomForestClassifier`` (1000 trees, ``random_state=42``) is fitted on
    ``x_train`` against ``y_train['is_activated']``. The predicted probability
    of the positive class is written to ``y_pred['class_score']``.

    Parameters
    ----------
    x_train : features of the training samples.
    y_train : dataframe (or mapping) providing the ``'is_activated'`` labels.
    x_pred : features of the samples to score.
    y_pred : dataframe that receives the ``'class_score'`` column; it is
        modified in place.

    Returns
    -------
    The same ``y_pred`` object, now carrying the ``'class_score'`` column.
    """
    clf = RandomForestClassifier(n_estimators=1000,
                                 random_state=42)
    clf = clf.fit(x_train, y_train['is_activated'])

    proba = clf.predict_proba(x_pred)
    if proba.shape[1] == 2:
        # Two classes seen during training: the columns follow ``clf.classes_``
        # (sorted), so the last column belongs to the positive class.
        scores = proba[:, 1]
    elif proba.shape[1] == 1 and not clf.classes_[0]:
        # Only the negative class was seen during training. The single column
        # is P(negative), so the positive-class score is its complement (0),
        # not the column itself.
        scores = 1.0 - proba[:, 0]
    else:
        # Only the positive class was seen: its probability is the score.
        scores = proba[:, 0]

    y_pred['class_score'] = scores
    return y_pred

In [7]:
# Fit on the training TCRs and score every mutation of the held-out TCR.
df_pred = predict_classification_score(
    x_train=df_features_train,
    y_train=df_data_train,
    x_pred=df_features_pred,
    y_pred=df_data_pred,
)
df_pred.head()

Unnamed: 0,epitope,mut_ami,mut_pos,orig_ami,tcr,cdr3a_aligned,cdr3b_aligned,activation,residual,is_activated,class_score
306,AIINFEKL,A,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,26.326591,-5.193409,False,0.355
307,CIINFEKL,C,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,1.957516,-29.562484,False,0.358
308,DIINFEKL,D,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,3.555488,-27.964512,False,0.68
309,EIINFEKL,E,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,3.555488,-27.964512,False,0.355
310,FIINFEKL,F,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,0.731072,-30.788928,False,0.352


### Regression

In [8]:
def predict_regression_score(x_train, y_train, x_pred, y_pred):
    """Predict the residual activation of the samples in ``x_pred``.

    A ``RandomForestRegressor`` (250 trees, ``max_features='sqrt'``, mean
    absolute error as split criterion) is fitted on ``x_train`` against
    ``y_train['residual']``. Its predictions are written to
    ``y_pred['reg_score']``.

    No ``random_state`` is passed to the regressor; per scikit-learn's
    documentation it then draws from NumPy's global RNG, so seed that
    beforehand if reproducible scores are needed.

    Parameters
    ----------
    x_train : features of the training samples.
    y_train : dataframe (or mapping) providing the ``'residual'`` targets.
    x_pred : features of the samples to score.
    y_pred : dataframe that receives the ``'reg_score'`` column; it is
        modified in place.

    Returns
    -------
    The same ``y_pred`` object, now carrying the ``'reg_score'`` column.
    """
    # Local import: only needed to pick the criterion spelling that the
    # installed scikit-learn accepts.
    import sklearn

    # scikit-learn 1.0 renamed 'mae' to 'absolute_error' and 1.2 removed the
    # old name; releases before 1.0 only know 'mae'.
    sklearn_major = int(sklearn.__version__.split('.')[0])
    mae_criterion = 'absolute_error' if sklearn_major >= 1 else 'mae'

    rfreg = RandomForestRegressor(n_estimators=250,
                                  max_features='sqrt',
                                  criterion=mae_criterion)
    rfreg = rfreg.fit(x_train, y_train['residual'])
    preds = rfreg.predict(x_pred)
    y_pred['reg_score'] = preds
    return y_pred

In [9]:
# df_data_pred already carries 'class_score' (the classification helper wrote
# it in place), so the result shows both scores side by side.
df_pred = predict_regression_score(
    x_train=df_features_train,
    y_train=df_data_train,
    x_pred=df_features_pred,
    y_pred=df_data_pred,
)
df_pred.head()

Unnamed: 0,epitope,mut_ami,mut_pos,orig_ami,tcr,cdr3a_aligned,cdr3b_aligned,activation,residual,is_activated,class_score,reg_score
306,AIINFEKL,A,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,26.326591,-5.193409,False,0.355,-34.668892
307,CIINFEKL,C,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,1.957516,-29.562484,False,0.358,-43.259701
308,DIINFEKL,D,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,3.555488,-27.964512,False,0.68,-27.421946
309,EIINFEKL,E,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,3.555488,-27.964512,False,0.355,-39.151389
310,FIINFEKL,F,0,S,B13,CAMRE-GTGGYKVVF-,CASSD---GTGHEQYF,0.731072,-30.788928,False,0.352,-41.923624
