In [1]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [2]:
import os
import torch
from transformers import T5EncoderModel, T5Tokenizer
import numpy as np

torch.set_default_dtype(torch.half)
torch.set_default_tensor_type(torch.HalfTensor)

# # Select visible gpus
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

DEVICE = 'cuda:0'
MODEL_NAME = 'Rostlab/prot_t5_xl_half_uniref50-enc'

model = T5EncoderModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(DEVICE)
print('model.device:', model.device)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

model.device: cuda:0


In [3]:
def get_T5_emedding(model: T5EncoderModel, tokenizer: T5Tokenizer, seq: str) -> torch.Tensor:
    input = tokenizer(' '.join(seq), return_tensors='pt')
    for k in input:
        input[k] = input[k].to(model.device)
    output = model(**input)
    return output.last_hidden_state.mean(1).view(-1)

# sequence embedding test
get_T5_emedding(model, tokenizer, 'ADE')

tensor([ 0.1368, -0.1732, -0.0472,  ...,  0.0454,  0.0054,  0.0166],
       device='cuda:0', grad_fn=<ViewBackward>)

In [4]:
from tqdm.auto import tqdm
epitope_emb = []
for epitope_seq in tqdm(df_train.epitope_seq):
    epitope_emb.append(get_T5_emedding(model, tokenizer, epitope_seq).cpu().tonumpy())


np.save("./epitope_emb",np.array(epitope_emb)) ## 불러올 때 .npy 확장자 넣고 np.load

  0%|          | 0/190811 [00:00<?, ?it/s]

In [6]:
test_epitope_emb = []
for epitope_seq in tqdm(df_test.epitope_seq):
    test_epitope_emb.append(get_T5_emedding(model, tokenizer, epitope_seq).cpu().tonumpy())

np.save("./epitope_emb",np.array(test_epitope_emb)) ## 불러올 때 .npy 확장자 넣고 np.load

  0%|          | 0/120944 [00:00<?, ?it/s]