List the Data Files

In [None]:
import os

# Root of the competition dataset as mounted inside a Kaggle kernel.
data_dir = '/kaggle/input/cafa-6-protein-function-prediction'

# os.listdir returns entries in arbitrary order; sorting keeps the printed
# inventory identical from one run to the next.
files = sorted(os.listdir(data_dir))
print(files)

Load the Training Labels

In [None]:
import pandas as pd

# Competition data root.
data_dir = '/kaggle/input/cafa-6-protein-function-prediction'

# Training files sit in the 'Train' subdirectory; the labels are the
# tab-separated 'train_terms.tsv' inside it.
train_dir = os.path.join(data_dir, 'Train')
train_terms_path = os.path.join(train_dir, 'train_terms.tsv')
print(f"Attempting to load file from: {train_terms_path}")

# Read the TSV into a DataFrame (`delimiter` is read_csv's alias for `sep`).
train_df = pd.read_csv(train_terms_path, delimiter='\t')

# Report the shape and preview the first rows as a loading sanity check.
print(f"\nSuccess! Shape of the training labels DataFrame: {train_df.shape}")
display(train_df.head())

Load the Protein Sequences

In [None]:
def load_sequences_corrected(filepath):
    """
    Read a FASTA file and return a dictionary mapping protein IDs to sequences.

    Header handling:
      * Pipe-delimited headers such as '>sp|Q9Y2X8|CD3E_HUMAN' map to the
        second pipe-separated field ('Q9Y2X8').
      * Headers without any '|' (e.g. '>Q9Y2X8 some description') fall back
        to the first whitespace-delimited token after '>'.

    Blank lines anywhere in the file are ignored. If the same ID occurs in
    more than one record, the last record wins.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        dict mapping each protein ID (str) to its full sequence (str), with
        the record's sequence lines concatenated in file order.

    Raises:
        ValueError: If a header yields no ID, or if sequence data appears
            before the first header line.
    """
    chunks_by_id = {}
    current_chunks = None  # list collecting the lines of the record being read
    with open(filepath, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; skipping them also avoids touching
                # the dictionary before any header has been seen.
                continue
            if line.startswith('>'):
                header = line[1:]
                fields = header.split('|')
                if len(fields) > 1:
                    # e.g. 'sp|Q9Y2X8|CD3E_HUMAN' -> 'Q9Y2X8'
                    protein_id = fields[1]
                else:
                    # No pipes: use the first token of the header as the ID.
                    tokens = header.split()
                    protein_id = tokens[0] if tokens else ""
                if not protein_id:
                    raise ValueError(
                        f"{filepath}: line {line_number}: FASTA header has no protein ID"
                    )
                current_chunks = []
                chunks_by_id[protein_id] = current_chunks
            else:
                if current_chunks is None:
                    raise ValueError(
                        f"{filepath}: line {line_number}: sequence data before first FASTA header"
                    )
                current_chunks.append(line)
    # Join once per record instead of growing a string line by line.
    return {protein_id: "".join(chunks) for protein_id, chunks in chunks_by_id.items()}

# --- RERUN THE LOADING AND ANALYSIS ---

# Parse the training FASTA with the pipe-aware loader defined above.
train_seq_path = os.path.join(train_dir, 'train_sequences.fasta')
train_sequences = load_sequences_corrected(train_seq_path)

# Build the ID set from each source and keep only the IDs present in both.
label_protein_ids = set(train_df['EntryID'].unique())
sequence_protein_ids = set(train_sequences)
common_protein_ids = label_protein_ids & sequence_protein_ids

print(f"Number of proteins with labels: {len(label_protein_ids)}")
print(f"Number of proteins with sequences: {len(sequence_protein_ids)}")
print(f"Number of proteins we can use for training (in both sets): {len(common_protein_ids)}")

# --- Show one overlapping protein, or warn when the overlap is empty ---
if not common_protein_ids:
    print("\nWarning: Still no common proteins found. The header format might be different.")
else:
    # Any member will do; take whichever the set yields first.
    example_protein_id = next(iter(common_protein_ids))

    print(f"\nExample of a common protein ID: '{example_protein_id}'")
    print("Sequence for this protein:")
    print(train_sequences[example_protein_id])

In [None]:
import numpy as np

# --- Step 1: Keep only labels whose protein also has a sequence ---
train_df_filtered = train_df.loc[train_df['EntryID'].isin(common_protein_ids)].copy()

print(f"Original number of labels: {len(train_df)}")
print(f"Number of labels for common proteins: {len(train_df_filtered)}")

# --- Step 2: Pick the 1,500 most frequent GO terms ---
# value_counts() sorts by descending frequency, so the first 1500 index
# entries are the most common terms.
term_counts = train_df_filtered['term'].value_counts()
top_terms = list(term_counts.index[:1500])

print(f"\nTotal unique GO terms: {len(term_counts)}")
print(f"We will predict the top {len(top_terms)} most frequent terms.")

# --- Steps 3 & 4: Restrict to those terms and pivot into the wide matrix ---
# assign() returns a new frame carrying a constant 'value' column of 1s,
# which is the quantity the pivot aggregates.
train_df_top = train_df_filtered.loc[train_df_filtered['term'].isin(top_terms)].assign(value=1)

# Rows: proteins (EntryID). Columns: GO terms. Protein/term pairs that never
# occur together are filled with 0.
labels_df = pd.pivot_table(
    train_df_top,
    index='EntryID',
    columns='term',
    values='value',
    fill_value=0,
)

# --- Final Check ---
print(f"\nShape of our final labels DataFrame: {labels_df.shape}")
print("This means we have labels for", labels_df.shape[0], "proteins and", labels_df.shape[1], "unique functions.")
display(labels_df.head())

Prepare the Labels for the Model

In [None]:
import numpy as np

# --- Step 4.1: Filter the DataFrame ---
# Labels are only usable when the protein also has a sequence, so drop the rest.
in_both_sources = train_df['EntryID'].isin(common_protein_ids)
train_df_filtered = train_df.loc[in_both_sources].copy()

print(f"Original number of labels: {len(train_df)}")
print(f"Number of labels for common proteins: {len(train_df_filtered)}")


# --- Step 4.2: Identify the Top 1,500 GO Terms ---
# value_counts() orders terms from most to least frequent; the first 1500
# index entries are therefore the terms we keep.
term_counts = train_df_filtered['term'].value_counts()
top_terms = list(term_counts.head(1500).index)

print(f"\nTotal unique GO terms: {len(term_counts)}")
print(f"We will focus on predicting the top {len(top_terms)} most frequent terms.")


# --- Steps 4.3 / 4.4: Keep top-term rows and build the binary label matrix ---
# assign() hands back a fresh frame with a constant 'value' column of 1s;
# pivot_table aggregates that column per (protein, term) pair.
is_top_term = train_df_filtered['term'].isin(top_terms)
train_df_top = train_df_filtered.loc[is_top_term].assign(value=1)

# Proteins become the rows, terms the columns, and absent pairs are filled with 0.
labels_df = train_df_top.pivot_table(
    values='value',
    index='EntryID',
    columns='term',
    fill_value=0,
)


# --- Step 4.5: Final Check ---
print(f"\nShape of our final labels DataFrame: {labels_df.shape}")
print(f"This matrix has {labels_df.shape[0]} proteins and {labels_df.shape[1]} functions.")
print("\nHere's a preview of the final label matrix:")
display(labels_df.head())

In [None]:
# Rebuilds the label matrix and the length-filtered protein ID list in a single
# cell, so variables that were lost from the session (the earlier NameError)
# are defined again.

# 1. Label matrix: labels for proteins with sequences, restricted to the 1500
#    most frequent terms, pivoted to one row per protein.
train_df_filtered = train_df.loc[train_df['EntryID'].isin(common_protein_ids)].copy()
term_counts = train_df_filtered['term'].value_counts()
top_terms = list(term_counts.index[:1500])
train_df_top = train_df_filtered.loc[train_df_filtered['term'].isin(top_terms)].assign(value=1)
labels_df = train_df_top.pivot_table(values='value', index='EntryID', columns='term', fill_value=0)

# 2. Keep only proteins that have a non-empty sequence no longer than the limit.
MAX_SEQUENCE_LENGTH = 1000 # Using the established safe limit

ids_to_process_filtered = []
for pid in labels_df.index.tolist():
    sequence = train_sequences.get(pid)
    # Skip IDs with a missing/empty sequence or one exceeding the limit.
    if not sequence or len(sequence) > MAX_SEQUENCE_LENGTH:
        continue
    ids_to_process_filtered.append(pid)

final_protein_ids = ids_to_process_filtered
# Align the label matrix with the surviving IDs (same rows, same order).
labels_df = labels_df.loc[final_protein_ids]

print("final_protein_ids has been redefined successfully. You can now run CELL 4.")

In [None]:
# --- Step 5.1: Install and Import Libraries ---
# We need the 'transformers' library from Hugging Face to load the ESM-2 model.
!pip install transformers

import torch
from transformers import EsmModel, EsmTokenizer
from tqdm import tqdm
import numpy as np

# --- Step 5.2: Set Up the Model ---
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Pre-trained ESM-2 checkpoint and its matching tokenizer; a smaller variant
# is chosen to go easy on the GPU quota.
model_name = "facebook/esm2_t30_150M_UR50D"
tokenizer = EsmTokenizer.from_pretrained(model_name)
# .eval() returns the module itself, so it can be chained after .to(device).
model = EsmModel.from_pretrained(model_name).to(device).eval()

# --- Step 5.3: Generate Embeddings ---
# protein ID -> pooled embedding (NumPy array)
protein_embeddings = {}

# Only proteins present in the label matrix need an embedding.
ids_to_process = list(labels_df.index)

# Small trial run first; raise this to len(ids_to_process) for the full dataset.
sample_size = 100 

print(f"\nGenerating embeddings for the first {sample_size} proteins...")

for protein_id in tqdm(ids_to_process[:sample_size]):
    sequence = train_sequences[protein_id]

    # One sequence per forward pass, tokenized without special tokens.
    inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=False)
    inputs = inputs.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average the per-token vectors along dim 1 to get one vector per protein,
    # then drop the singleton dimension and move the result to host memory.
    pooled = outputs.last_hidden_state.mean(dim=1)
    embedding = pooled.squeeze().cpu().numpy()
    protein_embeddings[protein_id] = embedding

# --- Step 5.4: Final Check ---
print(f"\nSuccessfully generated {len(protein_embeddings)} embeddings.")
example_id = next(iter(protein_embeddings))
print(f"The embedding for protein '{example_id}' has a shape of: {protein_embeddings[example_id].shape}")


In [None]:
import os
import torch
from transformers import EsmModel, EsmTokenizer
from tqdm import tqdm
import numpy as np
import gc # Garbage Collector

# --- Step 6.1: Set Up the Model ---
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model_name = "facebook/esm2_t30_150M_UR50D"
tokenizer = EsmTokenizer.from_pretrained(model_name)
model = EsmModel.from_pretrained(model_name).to(device)
model.eval()  # inference mode (e.g. disables dropout)

# --- Step 6.2: Make the Process Resumable ---
embeddings_file = '/kaggle/working/protein_embeddings.npy'
protein_embeddings = {}


def _save_embeddings(embeddings, path):
    """Persist the embeddings dict to `path` without leaving a half-written file.

    The data is first written to a sibling temporary file and then moved over
    `path` with os.replace, so an interrupted save cannot corrupt the
    checkpoint that a later run resumes from.
    """
    # The temp name ends in '.npy' so np.save does not append another suffix.
    tmp_path = path + '.tmp.npy'
    np.save(tmp_path, embeddings)
    os.replace(tmp_path, path)


if os.path.exists(embeddings_file):
    print("Found existing embeddings file. Loading to resume progress...")
    # allow_pickle=True is necessary for loading a dictionary. Unpickling can
    # execute arbitrary code, so only load checkpoints this notebook wrote.
    # NOTE: a checkpoint produced before padding-aware pooling was introduced
    # contains padding-biased vectors; delete it to regenerate them.
    protein_embeddings = np.load(embeddings_file, allow_pickle=True).item()
    print(f"Resuming with {len(protein_embeddings)} embeddings already generated.")

# Get the list of protein IDs we need to process
all_ids = labels_df.index.tolist()
# Filter out the IDs we've already processed
ids_to_process = [pid for pid in all_ids if pid not in protein_embeddings]
print(f"Number of new proteins to process: {len(ids_to_process)}")


# --- Step 6.3: Generate Embeddings in Batches ---
batch_size = 16 # Process 16 proteins at a time
print(f"Generating embeddings with a batch size of {batch_size}...")

for i in tqdm(range(0, len(ids_to_process), batch_size)):
    batch_ids = ids_to_process[i:i+batch_size]
    sequences = [train_sequences[pid] for pid in batch_ids]

    # Tokenize with truncation to prevent out-of-memory errors
    inputs = tokenizer(
        sequences,
        return_tensors="pt",
        padding=True,          # Pad sequences to the same length within a batch
        truncation=True,       # Truncate sequences longer than max_length
        max_length=1022,       # A safe max length for ESM-2
        add_special_tokens=False
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

        # Padding-aware mean pooling. Shorter sequences are padded up to the
        # longest one in the batch; a plain mean over the sequence axis would
        # average those padding positions in, so a protein's embedding would
        # depend on its batch neighbours. Weighting by the attention mask
        # (1 = real token, 0 = padding) averages real residues only.
        hidden = outputs.last_hidden_state                        # (batch, seq, hidden)
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        residue_counts = mask.sum(dim=1).clamp(min=1)             # guard against divide-by-zero
        batch_embeddings = ((hidden * mask).sum(dim=1) / residue_counts).cpu().numpy()

    # Store one pooled vector per protein
    for protein_id, embedding in zip(batch_ids, batch_embeddings):
        protein_embeddings[protein_id] = embedding

    # Periodically save our progress
    if i % (batch_size * 100) == 0: # Save every 100 batches
        _save_embeddings(protein_embeddings, embeddings_file)

    # Clean up GPU memory
    del inputs, outputs, hidden, mask
    gc.collect()
    torch.cuda.empty_cache()


# --- Step 6.4: Final Save ---
print(f"\nSuccessfully generated embeddings for {len(protein_embeddings)} total proteins.")
_save_embeddings(protein_embeddings, embeddings_file)
print(f"\nAll embeddings have been saved to '{embeddings_file}'.")
