In [1]:
from transformers import T5Tokenizer, T5EncoderModel
import re
import torch
import pandas as pd

def generate_protein_embeddings(sequence_list, model_name, batch_size=None):
    """Compute one mean-pooled ProtT5 embedding per protein sequence.

    Each sequence has the residues U, Z, O and B replaced by "X", is split
    into space-separated residues, tokenized, and passed through the
    ``Rostlab/<model_name>`` T5 encoder. The hidden states of the first
    ``len(sequence)`` positions are averaged into a single vector.

    Args:
        sequence_list: Iterable of amino-acid sequence strings (a list or a
            pandas Series both work; it is iterated exactly once).
        model_name: Either ``'prot_t5_xl_bfd'`` or ``'prot_t5_xl_uniref50'``.
        batch_size: Maximum number of sequences per forward pass. ``None``
            (the default) processes all sequences in one batch. A smaller
            value bounds peak memory, because padding is then only to the
            longest sequence within each batch.

    Returns:
        pandas.DataFrame with one row per input sequence, in input order,
        and one column per hidden dimension. The values carry the model's
        precision: the model is cast with ``.half()`` on CUDA and with
        ``.float()`` on CPU. An empty input yields an empty DataFrame and
        no model is loaded.

    Raises:
        ValueError: If ``model_name`` is not one of the two supported
            checkpoints, or if ``batch_size`` is given and smaller than 1.
    """
    if model_name not in ['prot_t5_xl_bfd', 'prot_t5_xl_uniref50']:
        raise ValueError("Invalid model name. Please choose either 'prot_t5_xl_bfd' or 'prot_t5_xl_uniref50'.")
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None.")

    # Preprocess: map the residues U/Z/O/B to the placeholder residue X.
    # Materialising a list here also makes the emptiness check below safe
    # for pandas Series inputs.
    sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequence_list]

    # Nothing to embed: return before loading the (multi-gigabyte) checkpoint.
    if not sequences:
        return pd.DataFrame()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load the tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(f'Rostlab/{model_name}', do_lower_case=False)
    model = T5EncoderModel.from_pretrained(f'Rostlab/{model_name}').to(device)

    # Set model precision based on the device
    if device.type == 'cpu':
        model.float()
    else:
        model.half()

    # With batch_size=None a single chunk spans the whole input.
    chunk_size = len(sequences) if batch_size is None else batch_size

    per_protein_embeddings = []
    with torch.no_grad():
        for start in range(0, len(sequences), chunk_size):
            batch = sequences[start:start + chunk_size]
            batch_lengths = [len(sequence) for sequence in batch]

            # The tokenizer is fed space-separated residues; pad only up to
            # the longest sequence of this batch.
            spaced = [" ".join(sequence) for sequence in batch]
            ids = tokenizer(spaced, add_special_tokens=True, padding="longest")
            input_ids = torch.tensor(ids['input_ids']).to(device)
            attention_mask = torch.tensor(ids['attention_mask']).to(device)

            hidden = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

            # Keep the first len(sequence) positions of each row (dropping
            # padding and anything the tokenizer appended after the
            # residues), then average them into one vector per protein.
            # NOTE(review): an empty sequence string averages zero rows.
            for row, length in zip(hidden, batch_lengths):
                per_protein_embeddings.append(row[:length].mean(dim=0).cpu().numpy())

    # Convert embeddings to a DataFrame
    return pd.DataFrame(per_protein_embeddings)



In [2]:
# Read the peptide table with pandas' missing-value detection switched off.
dataset = pd.read_excel(
    'MRSA-25-3-2024.xlsx',
    na_filter=False,
)
# Column 'seq' supplies the sequences that get embedded.
sequence_list = dataset['seq']
peptide_sequence_list = []  # initialised here; not used within this cell

# The second argument selects the ProtT5 checkpoint:
# 'prot_t5_xl_uniref50' or 'prot_t5_xl_bfd'.
embeddings_df = generate_protein_embeddings(
    sequence_list,
    'prot_t5_xl_uniref50',
)

# Last expression of the cell: render the ProtT5 feature table
embeddings_df

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/11.3G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
