# **🧩 What You’re Predicting**

The competition aims to build a model that learns patterns from protein sequences and predicts their correct class or property using advanced models like ESM.

✅ Type of Task: Multi-label classification  
✅ Input: Protein amino acid sequence  
✅ Output: A list of GO terms for each protein

In [None]:
!pip install biopython


In [None]:
from Bio import SeqIO
import pandas as pd

In [None]:
# Path to the training sequences; every FASTA record becomes one table row.
fasta_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
sequences = []
for record in SeqIO.parse(fasta_path, "fasta"):
    # The accession (e.g., P9WHI7) is expected to be the second
    # pipe-delimited field of the record id. A header without pipes would
    # make a blind `[1]` lookup raise IndexError, so fall back to the whole
    # record id in that case.
    id_fields = record.id.split("|")
    protein_id = id_fields[1] if len(id_fields) > 1 else record.id
    sequences.append({
        "protein_id": protein_id,
        "sequence": str(record.seq),
    })


# Naming the columns explicitly keeps the schema stable even when the FASTA
# file yields no records (an empty list would otherwise give a column-less
# frame and break the later merge on "protein_id").
df_seq = pd.DataFrame(sequences, columns=["protein_id", "sequence"])

In [None]:
# Preview the first rows of the parsed sequence table.
df_seq.head()

In [None]:
# (rows, columns) of the sequence table, i.e. how many proteins were parsed.
df_seq.shape

In [None]:
# Tab-separated table of term annotations for the training proteins.
labels_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv"
# read_table uses a tab as its default delimiter.
df_lab = pd.read_table(labels_path)


In [None]:
# Map the raw label column names onto the names used by the rest of the
# notebook ("protein_id" matches the key column of the sequence table).
column_aliases = {
    "EntryID": "protein_id",
    "term": "go_term",
    "aspect": "ontology",
}
df_lab.rename(columns=column_aliases, inplace=True)


In [None]:
# Preview the label table to confirm the renamed columns.
df_lab.head()

In [None]:
# Group GO terms for each protein
# Collapse the long-format labels into one list of GO terms per protein.
protein_go = (
    df_lab
    .groupby("protein_id")["go_term"]
    .apply(list)
    .reset_index()
)

# Inner join: keep only proteins present in both the sequence table and the
# grouped label table.
df = pd.merge(df_seq, protein_go, on="protein_id", how="inner")

In [None]:
# Preview the merged table (protein_id, sequence, list of go_term values).
df.head()

In [None]:
# Count missing values per column of the merged table.
df.isnull().sum()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib
import os
import gc

# Load Model
# Tokenizer and weights are read from a local directory;
# local_files_only=True stops transformers from trying to download anything.
model_path = "/kaggle/input/esm2-t6-8m-ur50d"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)
model.eval()  # switch to evaluation mode; the model is only used for inference

# Prefer the GPU when one is available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# use less data
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so clamp the request to the table length.
# With 50000 or more rows the fixed seed selects exactly the same subset as
# an unclamped n=50000 would.
sample_size = min(50000, len(df))
df_sampled = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
sequences = df_sampled["sequence"].tolist()
# Helper Function
def embed_full_sequence(seq, tokenizer, model, device, max_length=256):
    """Embed a sequence of any length by averaging per-chunk embeddings.

    The sequence is cut into consecutive, non-overlapping chunks of at most
    ``max_length`` characters. Each chunk is tokenized and run through the
    model without gradient tracking; its ``last_hidden_state`` is averaged
    over axis 1, and the per-chunk vectors are then averaged with equal
    weight (a short trailing chunk counts as much as a full one).

    An empty ``seq`` is embedded as one empty chunk, so the result is a real
    array from the model rather than the scalar NaN that averaging zero
    chunks would produce.

    Args:
        seq: Sequence string to embed.
        tokenizer: Callable invoked as
            ``tokenizer(chunk, return_tensors="pt", truncation=True,
            padding=True)``; its result must support ``.to(device)`` and
            ``**`` unpacking.
        model: Callable invoked as ``model(**inputs)``; its result must
            expose ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A NumPy array with the same shape as a single chunk's mean
        ``last_hidden_state.mean(1)``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    chunk_embeddings = []
    # max(len(seq), 1) guarantees exactly one pass for an empty string.
    for start in range(0, max(len(seq), 1), max_length):
        chunk = seq[start:start + max_length]
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        chunk_embeddings.append(outputs.last_hidden_state.mean(1).cpu().numpy())
        # Release cached GPU blocks between chunks.
        torch.cuda.empty_cache()
    return np.mean(chunk_embeddings, axis=0)

# Save Progressively
# Embeds every entry of `sequences` in order and checkpoints the growing list
# so an interrupted run can resume. Invariant: embeddings[i] always belongs
# to sequences[i], which is what makes `len(embeddings)` a valid resume index.
save_path = "/kaggle/working/esm2_50k_full_embeds.pkl"
embeddings = []
start_idx = 0


def _save_checkpoint(items, path):
    """Write ``items`` to ``path`` without leaving a half-written file."""
    # Dump to a sibling temp file, then swap it in with os.replace, so an
    # interruption mid-write cannot corrupt the checkpoint used for resuming.
    tmp_path = f"{path}.tmp"
    joblib.dump(items, tmp_path)
    os.replace(tmp_path, path)


if os.path.exists(save_path):
    # NOTE: joblib.load unpickles the file; only load checkpoints written by
    # this notebook.
    embeddings = joblib.load(save_path)
    start_idx = len(embeddings)
    print(f"Resuming from index {start_idx} / {len(sequences)}")

# initial=start_idx makes the progress bar start where the checkpoint left
# off; with only total= set, a resumed run could never reach 100%.
for idx in tqdm(range(start_idx, len(sequences)), total=len(sequences), initial=start_idx):
    seq = sequences[idx]
    try:
        emb = embed_full_sequence(seq, tokenizer, model, device, max_length=256)
    except RuntimeError as e:
        print(f"Skipped {idx} due to memory error: {e}")
        torch.cuda.empty_cache()
        gc.collect()
        # Store an all-NaN placeholder instead of dropping the entry.
        # Appending nothing would shift every later embedding by one position
        # relative to `sequences` and make the resume index point at the
        # wrong sequence. Filter NaN rows downstream.
        # NOTE(review): shape (1, hidden_size) / float32 is assumed to match
        # what embed_full_sequence returns for this model — confirm.
        emb = np.full((1, model.config.hidden_size), np.nan, dtype=np.float32)
    embeddings.append(emb)

    # Save progress every 100 sequences
    if idx % 100 == 0:
        _save_checkpoint(embeddings, save_path)
        print(f"Saved up to index {idx}")

# Final save
_save_checkpoint(embeddings, save_path)
print("embeddings saved safely.")
