**Language models for TCR specificity prediction**

In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from transformers import TFBertModel, BertTokenizer

# Load the tab-separated VDJdb export and drop rows that are exact duplicates.
file_path = "vdjdb_full.txt"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types
# (the usual cause of a warning being raised on this read_csv call).
df = pd.read_csv(file_path, sep='\t', low_memory=False)
df = df.drop_duplicates()
print(df.shape)

  df = pd.read_csv(file_path, delimiter='\t')


(61636, 34)


In [3]:
# Keep only the records whose vdjdb.score is greater than 0.
df = df[df['vdjdb.score'] > 0]

# Columns carried forward into the slimmed-down working frame.
slim_columns = ['cdr3.alpha', 'cdr3.beta', 'species', 'antigen.epitope', 'antigen.gene', 'vdjdb.score']
# .copy() gives df_slim its own data instead of a slice of df, so columns can
# be added to it later without pandas raising SettingWithCopyWarning.
df_slim = df[slim_columns].copy()
print(df_slim.head(5))
print(df_slim.shape)

       cdr3.alpha             cdr3.beta      species antigen.epitope  \
0   CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF  HomoSapiens        FLKEKGGL   
1             NaN   CASSFEAGQGFFSNQPQHF  HomoSapiens        FLKEKGGL   
2  CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF  HomoSapiens        FLKEKGGL   
3     CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF  HomoSapiens        FLKEKGGL   
4   CAYRPPGTYKYIF        CASSALASLNEQFF  HomoSapiens        FLKEKGGL   

  antigen.gene  vdjdb.score  
0          Nef            2  
1          Nef            2  
2          Nef            2  
3          Nef            2  
4          Nef            2  
(9300, 6)


In [4]:
# Concatenate the CDR3 alpha and beta sequences. A missing chain contributes an
# empty string, so single-chain records keep just the chain that is present.
cdr3_alpha = df_slim['cdr3.alpha'].fillna('')
cdr3_beta = df_slim['cdr3.beta'].fillna('')
# assign() returns a new frame instead of writing into df_slim in place, which
# avoids SettingWithCopyWarning when df_slim was sliced out of another frame.
df_slim = df_slim.assign(cdr3combined=cdr3_alpha + cdr3_beta)

# After fillna('') the combined column contains no NaN, so dropna() could never
# remove anything; drop empty strings (records with neither chain) instead.
has_sequence = df_slim['cdr3combined'] != ''
cdr3comb = df_slim.loc[has_sequence, 'cdr3combined'].reset_index(drop=True)

# cdr3comb is taken before sorting, so it keeps the pre-sort row order.
df_slim = df_slim.sort_values('antigen.epitope')
print(df_slim.head(3))
print(cdr3comb[0])

            cdr3.alpha          cdr3.beta      species antigen.epitope  \
19746    CAGAIPRDDKIIF   CASSLNPGRSDSPLHF  HomoSapiens       AAFKRSCLK   
19745  CALATHTGTASKLTF   CASSQDPGSSYNEQFF  HomoSapiens       AAFKRSCLK   
19744     CAGARNDYKLSF  CATSRDGAGLVNQPQHF  HomoSapiens       AAFKRSCLK   

      antigen.gene  vdjdb.score                     cdr3combined  
19746         T-Ag            3    CAGAIPRDDKIIFCASSLNPGRSDSPLHF  
19745         T-Ag            3  CALATHTGTASKLTFCASSQDPGSSYNEQFF  
19744         T-Ag            3    CAGARNDYKLSFCATSRDGAGLVNQPQHF  
CIVRAPGRADMRFCASSYLPGQGDHYSNQPQHF


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slim['cdr3combined'] = df_slim['cdr3.alpha'].fillna('') + df_slim['cdr3.beta'].fillna('')


In [5]:
# Pick the longest combined CDR3 string. Note that max_length holds the
# sequence itself, not a number; its character count is what gets printed.
max_length = max(cdr3comb, key=len)
print(len(max_length))


38


In [6]:
class TCRBertModel(tf.keras.Model):
    """Keras model wrapping a pretrained BERT encoder.

    Args:
        bert_model_name: checkpoint name handed to TFBertModel.from_pretrained.
            The default, 'bert-base-uncased', produces 768-dimensional embeddings.
    """

    def __init__(self, bert_model_name='bert-base-uncased'):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)

    def call(self, inputs):
        """Run the wrapped BERT model on `inputs` and return the first element of its output.

        The notebook treats that element as the per-token sequence embeddings.
        """
        return self.bert(inputs)[0]

# Load the tokenizer that belongs to the BERT checkpoint used by TCRBertModel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Take the first 1500 combined CDR3 sequences (1500 is the most that fit before
# a ResourceExhaustedError on the local machine).
# .iloc makes the slice explicitly positional: after sort_values the Series
# still carries its original integer labels, so a bare [0:1500] is ambiguous
# between labels and positions and triggers a pandas warning on some versions.
tcr_sequences = df_slim['cdr3combined'].iloc[:1500].tolist()

# Tokenize, padding/truncating every sequence to the same length.
# NOTE(review): 38 is the longest combined sequence measured in characters, but
# max_length here is a token budget that also has to hold the tokenizer's
# special tokens - confirm that 38 is the intended limit.
tokenized_inputs = tokenizer(tcr_sequences, padding='max_length', truncation=True, max_length=38)

# Convert the tokenized sequences to tensors.
input_ids = tf.constant(tokenized_inputs['input_ids'])
attention_mask = tf.constant(tokenized_inputs['attention_mask'])

# Dictionary mapping input names to their values.
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

model = TCRBertModel()

# Get 768-dimensional embeddings for each token in the input sequences.
outputs = model(inputs)

  from .autonotebook import tqdm as notebook_tqdm
  tokenized_inputs = tokenizer(df_slim['cdr3combined'][0:1500].tolist(), padding='max_length', truncation=True, max_length=38) #1500 is max before ResourceExhaustedError on local machine
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, 

In [17]:
# Print the tensor returned by model(inputs) in the previous cell.
print(outputs)

tf.Tensor(
[[[-6.74676895e-01 -9.55889672e-02  2.27257103e-01 ... -2.54138976e-01
    3.10018569e-01  5.83854079e-01]
  [ 2.77730018e-01 -9.40572396e-02  8.77284884e-01 ... -8.00789297e-01
   -1.50567651e-01  4.35288161e-01]
  [ 2.92913169e-01  4.59451556e-01  1.08082390e+00 ... -8.12520683e-01
   -2.80620456e-01  3.17230560e-02]
  ...
  [-3.54200393e-01  2.29425773e-01  7.44987309e-01 ... -1.03679508e-01
   -2.60174066e-01  1.34258354e-02]
  [-4.16917026e-01  4.15420309e-02  8.02270353e-01 ... -1.34316921e-01
   -1.87624186e-01  1.06884927e-01]
  [-4.90339339e-01  3.35844398e-01  7.81793892e-01 ... -1.80011958e-01
   -1.80875748e-01 -9.65448283e-03]]

 [[-7.20090151e-01 -1.10861301e-01  3.37439865e-01 ... -1.00468241e-01
    2.06924498e-01  7.44393647e-01]
  [-8.84186774e-02 -2.29257658e-01  8.88929188e-01 ... -3.72530460e-01
   -7.12003186e-02  4.31933343e-01]
  [-5.37012279e-01  3.01813960e-01  1.07349384e+00 ... -2.14626536e-01
    1.74486801e-01  2.32736960e-01]
  ...
  [-7.676709