**Language models for TCR specificity prediction**

In [47]:
import pandas as pd

# Load the raw VDJdb export (tab-separated) into a DataFrame.
file_path = "vdjdb_full.txt"
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output shows a warning
# raised on this read_csv line -- presumably the mixed-dtype DtypeWarning that
# chunked inference produces; confirm it is gone after re-running.
df = pd.read_csv(file_path, sep='\t', low_memory=False)
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()
print(df.shape)

(61636, 34)


  df = pd.read_csv(file_path, delimiter='\t')


In [48]:
# Keep only the rows whose 'vdjdb.score' is greater than 0.
df = df[df['vdjdb.score'] > 0]
# Narrow to the columns used further down. The explicit .copy() gives df_slim
# its own data, so adding a column to it later does not raise
# SettingWithCopyWarning.
df_slim = df[[
    'cdr3.alpha',
    'cdr3.beta',
    'species',
    'antigen.epitope',
    'antigen.gene',
    'vdjdb.score',
]].copy()
print(df_slim.head(5))
print(df_slim.shape)

       cdr3.alpha             cdr3.beta      species antigen.epitope  \
0   CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF  HomoSapiens        FLKEKGGL   
1             NaN   CASSFEAGQGFFSNQPQHF  HomoSapiens        FLKEKGGL   
2  CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF  HomoSapiens        FLKEKGGL   
3     CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF  HomoSapiens        FLKEKGGL   
4   CAYRPPGTYKYIF        CASSALASLNEQFF  HomoSapiens        FLKEKGGL   

  antigen.gene  vdjdb.score  
0          Nef            2  
1          Nef            2  
2          Nef            2  
3          Nef            2  
4          Nef            2  
(9300, 6)


In [49]:
# Concatenate the CDR3 alpha and beta sequences. A missing chain is replaced by
# an empty string, so rows with only one chain keep just that chain.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# assigning a column on a slice of `df` triggers.
df_slim = df_slim.assign(
    cdr3combined=df_slim['cdr3.alpha'].fillna('') + df_slim['cdr3.beta'].fillna('')
)
# Combined sequences in the current (pre-sort) row order, re-indexed from 0.
# No dropna() is needed: after fillna('') the concatenation is never NaN.
cdr3comb = df_slim['cdr3combined'].reset_index(drop=True)
# NOTE: df_slim is sorted only after cdr3comb was captured, so cdr3comb[i] does
# NOT correspond to the i-th row of the sorted df_slim.
df_slim = df_slim.sort_values('antigen.epitope')
print(df_slim.head(3))
print(cdr3comb[0])

            cdr3.alpha          cdr3.beta      species antigen.epitope  \
19746    CAGAIPRDDKIIF   CASSLNPGRSDSPLHF  HomoSapiens       AAFKRSCLK   
19745  CALATHTGTASKLTF   CASSQDPGSSYNEQFF  HomoSapiens       AAFKRSCLK   
19744     CAGARNDYKLSF  CATSRDGAGLVNQPQHF  HomoSapiens       AAFKRSCLK   

      antigen.gene  vdjdb.score                     cdr3combined  
19746         T-Ag            3    CAGAIPRDDKIIFCASSLNPGRSDSPLHF  
19745         T-Ag            3  CALATHTGTASKLTFCASSQDPGSSYNEQFF  
19744         T-Ag            3    CAGARNDYKLSFCATSRDGAGLVNQPQHF  
CIVRAPGRADMRFCASSYLPGQGDHYSNQPQHF


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slim['cdr3combined'] = df_slim['cdr3.alpha'].fillna('') + df_slim['cdr3.beta'].fillna('')


In [50]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer

class TCRBERT(tf.keras.Model):
    """Pretrained BERT encoder, extra transformer blocks, and a per-token softmax head.

    Args:
        hidden_dim: Width handed to every TransformerBlock. The blocks add their
            input to their own output, so this must equal the width of the BERT
            sequence output (presumably 768 for 'bert-base-uncased' -- confirm).
        intermediate_dim: Width of the feed-forward layer inside each block.
        num_attention_heads: Number of attention heads in each block.
        num_transformer_layers: How many TransformerBlock layers are stacked on
            top of the BERT encoder.
        num_amino_acids: Number of output classes of the head (default 20,
            matching the previously hard-coded value).
    """

    def __init__(self, hidden_dim=768, intermediate_dim=3072, num_attention_heads=12,
                 num_transformer_layers=12, num_amino_acids=20):
        super().__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-uncased')
        self.transformer_blocks = [
            TransformerBlock(hidden_dim, intermediate_dim, num_attention_heads)
            for _ in range(num_transformer_layers)
        ]
        # Softmax head: one probability per class for every token position.
        self.maa_head = tf.keras.layers.Dense(num_amino_acids, activation='softmax')

    def call(self, inputs, training=False):
        """Run the encoder, the stacked blocks and the head.

        Args:
            inputs: Whatever the wrapped TFBertModel accepts (e.g. the tokenizer
                output).
            training: Forwarded to BERT and to every block so dropout is only
                active while training.

        Returns:
            The softmax output of the head for every token position. These are
            probabilities, not raw logits, because the head applies softmax.
        """
        # Index [0] is the per-token sequence output. Indexing works whether the
        # model returns a plain tuple or a ModelOutput; tuple-unpacking a
        # ModelOutput would yield its keys instead of the tensors.
        sequence_output = self.bert(inputs, training=training)[0]
        for block in self.transformer_blocks:
            sequence_output = block(sequence_output, training=training)
        return self.maa_head(sequence_output)

# Transformer Block
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        hidden_dim: Output width of the feed-forward network. It has to match
            the last dimension of the block input because of the residual sums.
        intermediate_dim: Width of the inner (ReLU) feed-forward layer.
        num_attention_heads: Number of attention heads.
    """

    def __init__(self, hidden_dim, intermediate_dim, num_attention_heads):
        super().__init__()
        # key_dim is the size of EACH head, so the model width is divided across
        # the heads. Passing hidden_dim directly would make every head as wide
        # as the whole model.
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_attention_heads,
            key_dim=hidden_dim // num_attention_heads,
        )
        self.feed_forward = tf.keras.Sequential([
            tf.keras.layers.Dense(intermediate_dim, activation='relu'),
            tf.keras.layers.Dense(hidden_dim)
        ])
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)

    def call(self, inputs, training=False):
        """Apply the block to `inputs` and return a tensor of the same width.

        Args:
            inputs: Sequence tensor; used as both query and value for attention.
            training: Enables the dropout layers when True.
        """
        # Self-attention sub-layer: attend, dropout, residual add, normalize.
        attention_output = self.attention(inputs, inputs, training=training)
        attention_output = self.dropout1(attention_output, training=training)
        attention_output = self.layer_norm1(inputs + attention_output)
        # Feed-forward sub-layer with the same dropout / residual / norm pattern.
        feed_forward_output = self.feed_forward(attention_output)
        feed_forward_output = self.dropout2(feed_forward_output, training=training)
        sequence_output = self.layer_norm2(attention_output + feed_forward_output)
        return sequence_output



In [51]:
# Features: the combined CDR3 sequence of each row in df_slim.
# Targets: the epitope recorded in the same row.
X, y = df_slim['cdr3combined'], df_slim['antigen.epitope']

In [57]:
from transformers import BertTokenizer


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Take the sequences and the labels from the same frame so that row i of the
# encoded batch belongs to labels[i]. `cdr3comb` was captured before df_slim
# was sorted and is therefore in a different row order than the labels.
sequences = df_slim['cdr3combined'].tolist()
labels = df_slim['antigen.epitope']

# Tokenize the sequences.
# NOTE(review): 'bert-base-uncased' uses an English WordPiece vocabulary, so an
# un-spaced sequence is split into multi-letter sub-word pieces rather than
# single residues -- confirm this is the intended tokenization.
inputs = tokenizer(
    sequences,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_attention_mask=True,
    padding='longest',  # Pad every sequence to the length of the longest one
    truncation=True,
    max_length=512,  # Upper bound used for truncation (padding follows 'longest')
    return_tensors='tf',  # TensorFlow tensors, so .numpy() is available downstream
)

# Input IDs (tokenized sequences) and attention masks for the model.
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']

In [68]:
# Show the first encoded example of every field returned by the tokenizer.
for key, tensor in inputs.items():
    first_row = tensor[0]
    # Framework tensors expose .numpy(); a plain Python list (tokenizer called
    # without return_tensors) does not, so fall back to the row itself.
    first_value = first_row.numpy() if hasattr(first_row, "numpy") else first_row
    print(f"The first value of '{key}' is: {first_value}")

The first value of 'input_ids' is: [  101  6187 23805 18098 14141  3211 10128 15671 14540 16275 16523 16150
 13102  2140  2232  2546   102     0     0     0     0     0     0     0]
The first value of 'token_type_ids' is: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
The first value of 'attention_mask' is: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
