In [1]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train.columns


Index(['id', 'epitope_seq', 'antigen_seq', 'antigen_code', 'start_position',
       'end_position', 'number_of_tested', 'number_of_responses',
       'assay_method_technique', 'assay_group', 'disease_type',
       'disease_state', 'reference_date', 'reference_journal',
       'reference_title', 'reference_IRI', 'qualitative_label', 'label'],
      dtype='object')

In [2]:
import os
import torch
from transformers import T5EncoderModel, T5Tokenizer


# # Select visible gpus
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

DEVICE = 'cuda:0'
MODEL_NAME = 'AutoML/prot_t5_xl_uniref50_finetune'

model = T5EncoderModel.from_pretrained(MODEL_NAME, use_auth_token=True).to(DEVICE)
print('model.device:', model.device)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False,use_auth_token=True)

Some weights of the model checkpoint at AutoML/prot_t5_xl_uniref50_finetune were not used when initializing T5EncoderModel: ['decoder.block.17.layer.0.SelfAttention.k.weight', 'decoder.block.10.layer.1.EncDecAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.6.layer.2.layer_norm.weight', 'decoder.block.3.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.2.layer_norm.weight', 'decoder.block.8.layer.0.layer_norm.weight', 'decoder.block.7.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.8.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.8.layer.2.DenseReluDense.wo.weight', 'decoder.block.9.layer.1.EncDec

model.device: cuda:0


In [3]:
def get_T5_emedding(model: T5EncoderModel, tokenizer: T5Tokenizer, batch_i: list) -> torch.Tensor:
    WINDOW_SIZE = 50



    # print(batch_i)
    batch_input_seq = []
    for i, row  in batch_i.iterrows():

        # print(type(df))
        # print(df)

        epi = row.epitope_seq
        ant = row.antigen_seq
        s_p = row.start_position
        e_p = row.end_position
 
        start_position = s_p-WINDOW_SIZE-1
        end_position = e_p+WINDOW_SIZE
        if start_position < 0:
            start_position = 0
        if end_position > len(ant):
            end_position = len(ant)
        
        left_antigen = ant[max(0,int(s_p) -WINDOW_SIZE-1): int(s_p)-1]
        right_antigen = ant[int(e_p) : int(min(end_position, len(ant)))]
        
        epitope = ' '.join(list(epi))

        left_antigen = ' '.join(list(left_antigen))

        right_antigen = ' '.join(list(right_antigen))

        input_seq = left_antigen+ "<epitope_left>" + epitope+'<epitope_right>'+ right_antigen
        
        batch_input_seq.append(input_seq)

    input = tokenizer(batch_input_seq, return_tensors='pt',padding = True)

    for k in input:

        input[k] = input[k].to(model.device)
    output = model(**input)

    for i, batch in enumerate(batch_input_seq):
        if i == 0:
            sequence_representations = output.last_hidden_state[i, 0: len(batch) -30,:].mean(0).unsqueeze(0)
        else:
            sequence_representations = torch.cat((sequence_representations,output.last_hidden_state[i, 0: len(batch) -30,:].mean(0).unsqueeze(0)),axis=0)
    return sequence_representations

# sequence embedding test
#get_T5_emedding(model, tokenizer, ['ADEEE','AADEEEAEAEAEAEAEAE'], ['AD','ADAEADAEAEEAE'] ,[2,3], [4,5])


In [4]:
from tqdm.auto import tqdm
import numpy as np


BATCH_SIZE = 8
batches = [df_train[i: i+BATCH_SIZE] for i in range(0, len(df_train), BATCH_SIZE)]


for i, batch in tqdm(enumerate(batches),total=len(batches)):
    if i ==0:
        epi_ant_emb = get_T5_emedding(model, tokenizer, batch).detach().cpu().numpy()
    else:
        epi_ant_emb = np.concatenate((epi_ant_emb, get_T5_emedding(model, tokenizer, batch).detach().cpu().numpy()),axis = 0)

np.save("./train_epitope_emb",epi_ant_emb)

  0%|          | 0/23852 [00:00<?, ?it/s]

In [None]:

BATCH_SIZE = 8
test_batches = [df_test[i: i+BATCH_SIZE] for i in range(0, len(df_test), BATCH_SIZE)]


for i, test_batch in tqdm(enumerate(test_batches),total=len(test_batches)):
    if i ==0:
        test_epitope_emb = get_T5_emedding(model, tokenizer, test_batch).detach().cpu().numpy()
    else:
        test_epitope_emb = np.concatenate((test_epitope_emb, get_T5_emedding(model, tokenizer, test_batch).detach().cpu().numpy()),axis = 0)

np.save("./test_epitope_emb",test_epitope_emb) ## 불러올 때 .npy 확장자 넣고 np.load

  0%|          | 0/15118 [00:00<?, ?it/s]

(8, 1024)
[[ 0.18906914  0.14560845 -0.1528612  ...  0.20469028 -0.11918849
   0.10591899]
 [ 0.4266055  -0.11792589  0.25343758 ... -0.01323998 -0.2479534
   0.4321695 ]
 [ 0.14318624  0.05829605  0.10749895 ...  0.09346101 -0.07912702
   0.12361471]
 ...
 [ 0.08996121  0.04125139  0.0521966  ...  0.05086026  0.05875093
   0.01759267]
 [ 0.10291646  0.07919775  0.10019596 ...  0.13095094 -0.04844337
   0.03152746]
 [ 0.2034303   0.03479502  0.10286329 ...  0.12990475 -0.13777411
   0.07409032]]
