This analysis builds upon the foundational work of @sergeifironov from the CAFA 5 competition, who published the acclaimed notebook titled "T5embeds calculation [only few samples]". I want to extend my gratitude for his insightful contribution.

This is the code used to calculate the embeddings in the T5 dataset for CAFA 6: https://www.kaggle.com/datasets/ahsuna123/t5-embedding-cafa-6. Unfortunately, it is impossible to run it on Kaggle resources; even with a batch size of 1 you need an A100 for evaluation.

In [None]:
!pip install obonet
!pip install pyvis

In [None]:
!pip install Bio

In [None]:
pip install transformers==4.39.3 --quiet

In [None]:
import os
import json
from typing import Dict
from collections import Counter

import random
import obonet
import pandas as pd
import numpy as np
from Bio import SeqIO
import torch
from transformers import T5Tokenizer, T5EncoderModel

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# do_lower_case=False keeps the upper-case amino-acid letters as they are.
tokenizer = T5Tokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc",
    do_lower_case=False
)

model = T5EncoderModel.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc"
)

# Half precision is only used on the GPU. Cast *before* moving the model so
# that the GPU never has to hold the larger float32 copy of the weights; the
# resulting float16 weights are the same as when casting after the transfer.
if device.type == "cuda":
    model = model.half()
model = model.to(device)

print("✅ ProtT5 loaded successfully!")


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the competition input directory and print the full path of every file
# found in it (directories themselves are not printed).
all_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input/cafa-6-protein-function-prediction')
    for name in names
)
for file_path in all_file_paths:
    print(file_path)

In [None]:
from pathlib import Path
path = Path('/kaggle/input/cafa-6-protein-function-prediction')
!head {path}/'Test (Targets)/testsuperset-taxon-list.tsv'

In [None]:
!head {path}/'Test (Targets)/testsuperset.fasta'

In [None]:
import re
def get_embeddings(seq):
    """Return the per-protein ProtT5 embedding of one amino-acid sequence.

    The sequence is tokenized one residue per token, passed through the
    module-level ``model`` and the residue embeddings of the last hidden
    layer are averaged. The end-of-sequence token that the tokenizer appends
    is excluded from the average, so only real residues contribute.

    Args:
        seq: Amino-acid sequence as a plain string (no spaces).

    Returns:
        A 1-D tensor on ``device`` holding the mean residue embedding.
    """
    # ProtT5 expects residues separated by spaces; the amino-acid codes
    # U, Z, O and B are mapped to X before tokenization.
    sequence_examples = [" ".join(re.sub(r"[UZOB]", "X", seq))]

    ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")

    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    # generate embeddings
    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids,
                               attention_mask=attention_mask)

    # The attention mask marks every real token of the (single) sequence:
    # its residues followed by one end-of-sequence token. Drop that last token
    # so the special token does not leak into the per-protein average.
    # For an empty input only the special token exists; keep it in that case
    # so the mean is still defined instead of producing NaNs.
    n_tokens = sum(ids['attention_mask'][0])
    n_residues = max(n_tokens - 1, 1)

    # residue embeddings of the first (and only) sequence in the batch
    emb_0 = embedding_repr.last_hidden_state[0, :n_residues]
    emb_0_per_protein = emb_0.mean(dim=0)

    return emb_0_per_protein

# Quick check: embed one short example sequence. As the last expression of
# the cell, the returned tensor is shown as the cell output.
get_embeddings('MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS')


In [None]:
fn = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

# Show the first record of the training FASTA as an example.
first_record = next(iter(SeqIO.parse(fn, "fasta")))
print("Sequence example:\n\n", first_record)

# Parse the file again from the start and count its records without keeping
# them in memory; num_sequences is reused later to size the embedding matrix.
sequences = SeqIO.parse(fn, "fasta")
num_sequences = sum(map(lambda _record: 1, sequences))

print()
print("Number of sequences in train:", num_sequences)

In [None]:
import tqdm
fn = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

sequences = SeqIO.parse(fn, "fasta")

# ids[k] is the identifier of the protein whose embedding is stored in row k.
ids = []
# One row per training sequence (num_sequences was counted beforehand),
# 1024 values per protein embedding.
embeds = np.zeros((num_sequences, 1024))
for i, seq in enumerate(tqdm.tqdm(sequences, total=num_sequences)):
    ids.append(seq.id)
    embeds[i] = get_embeddings(str(seq.seq)).detach().cpu().numpy()
    break #remove it for full calculation

# Keep only the rows that were actually filled, so the saved embeddings and
# ids always have the same length (this matters while the `break` above is
# active; for a full run nothing is trimmed).
embeds = embeds[:len(ids)]

np.save('train_embeds.npy', embeds)
np.save('train_ids.npy', np.array(ids))

In [None]:
fn = '/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta'

# First pass: count the records so the embedding matrix can be pre-allocated.
sequences = SeqIO.parse(fn, "fasta")
num_sequences = sum(1 for seq in sequences)
print("Number of sequences in test:", num_sequences)
# The parser was consumed by the count above; open the file again.
sequences = SeqIO.parse(fn, "fasta")


# ids[k] is the identifier of the protein whose embedding is stored in row k.
ids = []
# One row per test sequence, 1024 values per protein embedding.
embeds = np.zeros((num_sequences, 1024))
for i, seq in enumerate(tqdm.tqdm(sequences, total=num_sequences)):
    ids.append(seq.id)
    embeds[i] = get_embeddings(str(seq.seq)).detach().cpu().numpy()
    break #remove it for full calculation

# Keep only the rows that were actually filled, so the saved embeddings and
# ids always have the same length (this matters while the `break` above is
# active; for a full run nothing is trimmed).
embeds = embeds[:len(ids)]

np.save('test_embeds.npy', embeds)
np.save('test_ids.npy', np.array(ids))