**Language models for TCR specificity prediction**

In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from transformers import TFBertModel, BertTokenizer

file_path = "vdjdb_full.txt"
df = pd.read_csv(file_path, delimiter='\t')
df = df.drop_duplicates()
print(df.shape)

  df = pd.read_csv(file_path, delimiter='\t')


(61636, 34)


In [3]:
df = df[(df['vdjdb.score'] > 0)]
df_slim = df[['cdr3.alpha','cdr3.beta','species','antigen.epitope','antigen.gene','vdjdb.score']]
print(df_slim.head(5))
print(df_slim.shape)

       cdr3.alpha             cdr3.beta      species antigen.epitope  \
0   CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF  HomoSapiens        FLKEKGGL   
1             NaN   CASSFEAGQGFFSNQPQHF  HomoSapiens        FLKEKGGL   
2  CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF  HomoSapiens        FLKEKGGL   
3     CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF  HomoSapiens        FLKEKGGL   
4   CAYRPPGTYKYIF        CASSALASLNEQFF  HomoSapiens        FLKEKGGL   

  antigen.gene  vdjdb.score  
0          Nef            2  
1          Nef            2  
2          Nef            2  
3          Nef            2  
4          Nef            2  
(9300, 6)


In [4]:
#Stick CRD3 together if both alpha and beta present if not leave alone
df_slim['cdr3combined'] = df_slim['cdr3.alpha'].fillna('') + df_slim['cdr3.beta'].fillna('')
cdr3comb = df_slim['cdr3combined'].dropna()
cdr3comb = cdr3comb.reset_index(drop=True)
df_slim = df_slim.sort_values('antigen.epitope')
print(df_slim.head(3))
print(cdr3comb[0])

            cdr3.alpha          cdr3.beta      species antigen.epitope  \
19746    CAGAIPRDDKIIF   CASSLNPGRSDSPLHF  HomoSapiens       AAFKRSCLK   
19745  CALATHTGTASKLTF   CASSQDPGSSYNEQFF  HomoSapiens       AAFKRSCLK   
19744     CAGARNDYKLSF  CATSRDGAGLVNQPQHF  HomoSapiens       AAFKRSCLK   

      antigen.gene  vdjdb.score                     cdr3combined  
19746         T-Ag            3    CAGAIPRDDKIIFCASSLNPGRSDSPLHF  
19745         T-Ag            3  CALATHTGTASKLTFCASSQDPGSSYNEQFF  
19744         T-Ag            3    CAGARNDYKLSFCATSRDGAGLVNQPQHF  
CIVRAPGRADMRFCASSYLPGQGDHYSNQPQHF


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slim['cdr3combined'] = df_slim['cdr3.alpha'].fillna('') + df_slim['cdr3.beta'].fillna('')


In [5]:
max_length = max(cdr3comb, key=lambda x: len(x))
print(len(max_length))


38


In [6]:
class TCRBertModel(tf.keras.Model):
    def __init__(self, bert_model_name='bert-base-uncased'): #uncased produces 728 dimensional embeddings
        super(TCRBertModel, self).__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)

    def call(self, inputs):
        outputs = self.bert(inputs)
        sequence_output = outputs[0]
        return sequence_output

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #load the tokenizer for the BERT model

#tokenize the TCR sequences
tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38) #1500 is max before ResourceExhaustedError on local machine

#convert the tokenized sequences to tensors
input_ids = tf.constant(tokenized_inputs['input_ids'])
attention_mask = tf.constant(tokenized_inputs['attention_mask']) #decides what should not be attended to as some are tokens are padded to 38 length

#dictionary mapping input names to their values
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

model = TCRBertModel()

#get 768-dimensional embeddings for each token in the input sequences
outputs = model(inputs)

  from .autonotebook import tqdm as notebook_tqdm
  tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38) #1500 is max before ResourceExhaustedError on local machine
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, 

In [17]:
print(outputs)

tf.Tensor(
[[[-6.74676895e-01 -9.55889672e-02  2.27257103e-01 ... -2.54138976e-01
    3.10018569e-01  5.83854079e-01]
  [ 2.77730018e-01 -9.40572396e-02  8.77284884e-01 ... -8.00789297e-01
   -1.50567651e-01  4.35288161e-01]
  [ 2.92913169e-01  4.59451556e-01  1.08082390e+00 ... -8.12520683e-01
   -2.80620456e-01  3.17230560e-02]
  ...
  [-3.54200393e-01  2.29425773e-01  7.44987309e-01 ... -1.03679508e-01
   -2.60174066e-01  1.34258354e-02]
  [-4.16917026e-01  4.15420309e-02  8.02270353e-01 ... -1.34316921e-01
   -1.87624186e-01  1.06884927e-01]
  [-4.90339339e-01  3.35844398e-01  7.81793892e-01 ... -1.80011958e-01
   -1.80875748e-01 -9.65448283e-03]]

 [[-7.20090151e-01 -1.10861301e-01  3.37439865e-01 ... -1.00468241e-01
    2.06924498e-01  7.44393647e-01]
  [-8.84186774e-02 -2.29257658e-01  8.88929188e-01 ... -3.72530460e-01
   -7.12003186e-02  4.31933343e-01]
  [-5.37012279e-01  3.01813960e-01  1.07349384e+00 ... -2.14626536e-01
    1.74486801e-01  2.32736960e-01]
  ...
  [-7.676709

**Reduce to 50 dim and use SVM to classify**

In [33]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#flatten the outputs to 2D
flattened_outputs = tf.reshape(outputs, [outputs.shape[0], -1])
pca = PCA(n_components=50)
reduced_outputs = pca.fit_transform(flattened_outputs.numpy())  #tensor to numpy array before passing to PCA

X_train, X_test, y_train, y_test = train_test_split(reduced_outputs, df_slim['antigen.epitope'][0:1500], test_size=0.2, random_state=111)

svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

  X_train, X_test, y_train, y_test = train_test_split(reduced_outputs, df_slim['antigen.epitope'][0:1500], test_size=0.2, random_state=111)


Accuracy: 0.47333333333333333


**THIS IS BASIC IMPLEMENTATION ABOVE**

**BELOW IS FOLLOWING PAPER https://www.biorxiv.org/content/10.1101/2021.11.18.469186v1.full**

Next step is to implement masked amino acid modelling "hide, or “mask” 15% of the amino acids in each TCR amino acid sequence in the training set, and train TCR-BERT to predict these masked amino acids"




In [36]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

class TCRBertModel(tf.keras.Model):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=None): 
        super(TCRBertModel, self).__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)
        self.maa_head = tf.keras.layers.Dense(20, activation='softmax')  #MAA head for predicting masked amino acids 20 as thats how many amino acids there are
        self.classification_head = tf.keras.layers.Dense(num_labels, activation='softmax') if num_labels else None  #classification head for downstream tasks

    def call(self, inputs, training=False):
        outputs = self.bert(inputs, training=training)
        sequence_output = outputs.last_hidden_state
        maa_predictions = self.maa_head(sequence_output) #predict masked amino acids from output of BERT model 768 output to 20 amino acids
        if training:
            return maa_predictions
        else:
            classification_logits = self.classification_head(tf.reduce_mean(sequence_output, axis=1)) #takes the mean of the sequence output and passes it through the classification head 
            return maa_predictions, classification_logits

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38)


input_ids = tf.constant(tokenized_inputs['input_ids'])
attention_mask = tf.constant(tokenized_inputs['attention_mask'])

#inputs dictionary
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

model = TCRBertModel()
maa_predictions = model(inputs, training=True)


  tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [37]:
print(maa_predictions)

tf.Tensor(
[[[0.01330471 0.03060793 0.03945042 ... 0.03930859 0.02659958 0.05257806]
  [0.03285696 0.09712847 0.02395576 ... 0.05039798 0.0591586  0.0274514 ]
  [0.04078352 0.0518089  0.06029619 ... 0.04855753 0.01902218 0.01815997]
  ...
  [0.02975798 0.05448749 0.05213316 ... 0.03754457 0.03323095 0.03513494]
  [0.02713712 0.07107564 0.05515148 ... 0.03899817 0.03625135 0.03669605]
  [0.03008739 0.06630372 0.04762534 ... 0.03806898 0.03547477 0.04136902]]

 [[0.0197041  0.03878022 0.05492344 ... 0.0444821  0.04129329 0.10465682]
  [0.03690844 0.04299334 0.03430608 ... 0.03004827 0.04652827 0.02210557]
  [0.08066549 0.0574407  0.0546074  ... 0.04275035 0.0210504  0.02547678]
  ...
  [0.03700403 0.05686714 0.02856055 ... 0.02641634 0.02939809 0.050872  ]
  [0.03169116 0.05300458 0.03902589 ... 0.03016533 0.0240878  0.03085233]
  [0.03559051 0.05108725 0.04177984 ... 0.02913353 0.02373109 0.03068066]]

 [[0.01923585 0.05690232 0.04598095 ... 0.02798671 0.03595887 0.04879451]
  [0.026465

In [44]:
pca = PCA(n_components=50)  

maa_predictions_np = maa_predictions.numpy()  
flattened_maa_predictions_np = maa_predictions_np.reshape(maa_predictions_np.shape[0], -1)
pca_outputs = pca.fit_transform(flattened_maa_predictions_np)

labels = df_slim['antigen.epitope'][0:1500]
X_train, X_test, y_train, y_test = train_test_split(pca_outputs, labels, test_size=0.2, random_state=42)

svm_classifier = SVC(kernel='rbf', C=1.0)
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


  labels = df_slim['antigen.epitope'][0:1500]


Accuracy: 0.31


*Somehow worse*

In [58]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

class TCRBertModel(tf.keras.Model):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=None): 
        super(TCRBertModel, self).__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)
        self.maa_head = tf.keras.layers.Dense(20, activation='softmax')  #MAA head for predicting masked amino acids
        self.classification_head = tf.keras.layers.Dense(num_labels, activation='softmax') if num_labels else None  #classification head for downstream tasks

    def call(self, inputs, training=False):
        outputs = self.bert(inputs, training=training)
        sequence_output = outputs.last_hidden_state
        maa_predictions = self.maa_head(sequence_output) #predict masked amino acids from the output of the BERT model
        if training:
            return maa_predictions
        else:
            classification_logits = self.classification_head(tf.reduce_mean(sequence_output, axis=1)) #take the mean of the sequence output and pass it through the classification head 
            return maa_predictions, classification_logits


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38)

#convert the tokenized sequences to tensors
input_ids = tf.constant(tokenized_inputs['input_ids'])
attention_mask = tf.constant(tokenized_inputs['attention_mask'])


inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
model = TCRBertModel()

#MAA pre-training objective
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.CategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss_fn)
model.fit(inputs, maa_predictions, epochs=3, batch_size=128)


  tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20154f93910>

In [60]:
maa_predictions = model(inputs, training=True)

In [61]:
pca = PCA(n_components=50)  

maa_predictions_np = maa_predictions.numpy()  
flattened_maa_predictions_np = maa_predictions_np.reshape(maa_predictions_np.shape[0], -1)
pca_outputs = pca.fit_transform(flattened_maa_predictions_np)

labels = df_slim['antigen.epitope'][0:1500]
X_train, X_test, y_train, y_test = train_test_split(pca_outputs, labels, test_size=0.2, random_state=42)

svm_classifier = SVC(kernel='rbf', C=1.0)
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


  labels = df_slim['antigen.epitope'][0:1500]


Accuracy: 0.2966666666666667
