In [1]:
import torch
from transformers import AutoTokenizer, EsmModel
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm import tqdm
import os

# Hugging Face checkpoint identifier used for both the tokenizer and the model.
model_name = "facebook/esm2_t6_8M_UR50D"

# Use the GPU when CUDA reports one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights, move the module to the selected device and switch it to
# evaluation mode; the model is only used for inference below.
model = EsmModel.from_pretrained(model_name)
model.to(device)
model.eval()

def _masked_mean_pool(hidden_states, attention_mask):
    """Average token vectors over the token axis, ignoring padded positions.

    Args:
        hidden_states: Tensor indexed as (batch, token, feature).
        attention_mask: Tensor indexed as (batch, token); non-zero entries
            mark real tokens, zeros mark padding.

    Returns:
        Tensor indexed as (batch, feature) holding one pooled vector per row.
    """
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for a row with no real tokens.
    token_counts = weights.sum(dim=1).clamp(min=1)
    return (hidden_states * weights).sum(dim=1) / token_counts


def generate_embeddings(fasta_path, output_npy, ids_output_npy,
                        batch_size=32, max_length=1024):
    """Embed every sequence of a FASTA file and save embeddings and IDs.

    Each record is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` on ``device``, and reduced to a single vector
    by averaging the last hidden state over the positions the attention mask
    marks as real tokens. Padding positions are excluded from the average, so
    a sequence's embedding does not depend on which other sequences share its
    batch.

    Args:
        fasta_path: Path of the FASTA file to read.
        output_npy: Destination passed to ``np.save`` for the stacked
            embedding matrix (one row per record, in file order).
        ids_output_npy: Destination passed to ``np.save`` for the record IDs,
            in the same order as the embedding rows.
        batch_size: Number of sequences sent through the model per step.
        max_length: Upper bound handed to the tokenizer; longer inputs are
            truncated by the tokenizer.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or the FASTA file
            yields no records.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # For IDs containing '|', keep only the second '|'-separated field;
    # otherwise keep the ID unchanged.
    records = [{"ID": r.id.split('|')[1] if '|' in r.id else r.id, "Seq": str(r.seq)}
               for r in SeqIO.parse(fasta_path, "fasta")]
    if not records:
        # Fail early with a clear message instead of an opaque np.vstack error.
        raise ValueError(f"No FASTA records found in {fasta_path!r}")
    df = pd.DataFrame(records)

    all_embs = []
    for i in tqdm(range(0, len(df), batch_size)):
        batch_seqs = df['Seq'].iloc[i : i + batch_size].tolist()
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True,
                           truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            pooled = _masked_mean_pool(outputs.last_hidden_state,
                                       inputs["attention_mask"])
        all_embs.append(pooled.cpu().numpy())

    np.save(output_npy, np.vstack(all_embs))
    np.save(ids_output_npy, df['ID'].values)

# Embed the training sequences; writes the embedding matrix and the matching IDs.
print("🧬 Generowanie embeddingów TRAIN...")
generate_embeddings("../data/bronze/Train/train_sequences.fasta", "../data/gold/X_train_esm2.npy", "../data/gold/train_ids.npy")

# Embed the test superset the same way.
print("🧬 Generowanie embeddingów TEST...")
generate_embeddings("../data/bronze/Test/testsuperset.fasta", "../data/gold/X_test_esm2.npy", "../data/gold/test_protein_ids.npy")
print("✅ Gotowe! Masz pliki .npy na dysku.")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧬 Generowanie embeddingów TRAIN...


100%|██████████| 2576/2576 [19:52<00:00,  2.16it/s]


🧬 Generowanie embeddingów TEST...


100%|██████████| 7010/7010 [50:38<00:00,  2.31it/s]


✅ Gotowe! Masz pliki .npy na dysku.
