# Baseline: ESM2 Embeddings + MLP Classifier

This notebook implements:
1. ESM2 embedding generation for train/test proteins
2. One MLP classifier per GO aspect (MF, BP, CC — coded as F, P, C in the data and code below)
3. GO term propagation
4. Submission generation

In [None]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import os
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Verify GPU
# The device-name / device-properties queries raise when no CUDA device is
# visible, so they are only issued after the availability check succeeds.
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
else:
    # Later cells call .cuda() unconditionally, so warn early and clearly.
    print('WARNING: no CUDA device found; cells that call .cuda() will fail.')

# Paths
DATA_DIR = Path('/home/data')
TRAIN_DIR = DATA_DIR / 'Train'
TEST_DIR = DATA_DIR / 'Test'
# All artefacts of this experiment (cached embeddings, ...) are written here.
OUTPUT_DIR = Path('/home/code/experiments/001_baseline')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load training data
def parse_fasta(fasta_path):
    """Read a FASTA file into a ``{protein_id: sequence}`` dictionary.

    The protein ID is the first whitespace-delimited token after ``>``. When
    that token contains ``|`` (e.g. ``sp|XXX|YYY``), the second ``|``-separated
    field is used as the ID. Sequence lines are stripped and concatenated.
    If an ID occurs more than once, the last record wins.
    """
    records = {}
    active_id = None
    chunks = []

    with open(fasta_path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                # Sequence (or blank) line: collect it for the open record.
                chunks.append(text)
                continue

            # A new header closes whatever record was being collected.
            if active_id is not None:
                records[active_id] = ''.join(chunks)

            token = text[1:].split()[0]
            fields = token.split('|')
            active_id = fields[1] if len(fields) > 1 else token
            chunks = []

    # Flush the final record, which has no following header to close it.
    if active_id is not None:
        records[active_id] = ''.join(chunks)

    return records

# Parse the training FASTA into a {protein_id: sequence} dict via parse_fasta.
print('Loading train sequences...')
train_sequences = parse_fasta(TRAIN_DIR / 'train_sequences.fasta')
print(f'Loaded {len(train_sequences)} train sequences')

# Same for the test proteins, which are read from the "superset" FASTA file.
print('Loading test sequences...')
test_sequences = parse_fasta(TEST_DIR / 'testsuperset.fasta')
print(f'Loaded {len(test_sequences)} test sequences')

In [None]:
# Load training labels
# The TSV must provide at least the 'EntryID', 'term' and 'aspect' columns:
# those are the columns read here and by the label-matrix cells further down.
train_terms = pd.read_csv(TRAIN_DIR / 'train_terms.tsv', sep='\t')
print(f'Train terms shape: {train_terms.shape}')
print(f'Unique proteins: {train_terms["EntryID"].nunique()}')
print(f'Unique GO terms: {train_terms["term"].nunique()}')
# Number of annotation rows per aspect code.
print(f'\nAspect distribution:')
print(train_terms['aspect'].value_counts())

In [None]:
# Load IA weights
# IA.tsv is read without a header row: the first column becomes 'term',
# the second 'ia_weight'.
ia_df = pd.read_csv(DATA_DIR / 'IA.tsv', sep='\t', header=None, names=['term', 'ia_weight'])
# term -> weight lookup.
# NOTE(review): no later cell visible in this notebook reads ia_weights.
ia_weights = dict(zip(ia_df['term'], ia_df['ia_weight']))
print(f'Loaded {len(ia_weights)} IA weights')

In [None]:
# Parse GO ontology for propagation
def parse_go_obo(obo_path):
    """Parse a GO OBO file into parent links and aspect codes.

    Only ``[Term]`` stanzas are read. Each stanza is buffered and committed
    once it is complete, because ``is_obsolete: true`` appears near the end of
    a stanza: deciding line by line would record an obsolete term's aspect and
    parents before the obsolete flag is ever seen. Other stanza types (e.g.
    ``[Typedef]``) are skipped so their lines cannot be attributed to the
    preceding term.

    Args:
        obo_path: Path to the ``.obo`` file.

    Returns:
        A ``(go_parents, go_aspect)`` tuple:
        * ``go_parents``: ``defaultdict(set)`` mapping child term -> set of
          parent terms reached through ``is_a`` or ``part_of``. Only terms
          with at least one parent get an entry.
        * ``go_aspect``: dict mapping term -> ``'P'``, ``'F'`` or ``'C'`` for
          non-obsolete terms with a recognised namespace.
    """
    namespace_codes = {
        'biological_process': 'P',
        'molecular_function': 'F',
        'cellular_component': 'C',
    }
    go_parents = defaultdict(set)  # child -> set of parents
    go_aspect = {}  # term -> aspect (P, F, C)

    def commit(stanza):
        """Record a finished [Term] stanza unless it is unusable or obsolete."""
        if stanza is None or stanza['id'] is None or stanza['obsolete']:
            return
        if stanza['aspect'] is not None:
            go_aspect[stanza['id']] = stanza['aspect']
        if stanza['parents']:
            go_parents[stanza['id']].update(stanza['parents'])

    stanza = None  # None while outside a [Term] stanza (file header, [Typedef], ...)
    with open(obo_path, 'r') as f:
        for line in f:
            line = line.strip()

            # Any "[...]" line starts a new stanza and therefore ends the current one.
            if line.startswith('[') and line.endswith(']'):
                commit(stanza)
                if line == '[Term]':
                    stanza = {'id': None, 'aspect': None, 'obsolete': False, 'parents': set()}
                else:
                    stanza = None
                continue
            if stanza is None:
                continue

            if line.startswith('id: GO:'):
                stanza['id'] = line[len('id: '):]
            elif line.startswith('namespace:'):
                stanza['aspect'] = namespace_codes.get(line[len('namespace:'):].strip())
            elif line.startswith('is_obsolete: true'):
                stanza['obsolete'] = True
            elif line.startswith('is_a: GO:'):
                # "is_a: GO:xxxxxxx ! name" -> the second token is the parent ID.
                stanza['parents'].add(line.split()[1])
            elif line.startswith('relationship: part_of GO:'):
                # "relationship: part_of GO:xxxxxxx ! name" -> third token.
                stanza['parents'].add(line.split()[2])

    # The last stanza has no following "[...]" line to trigger its commit.
    commit(stanza)

    return go_parents, go_aspect

# Build the child -> parents map and the term -> aspect map from the GO file
# that sits next to the training data.
print('Parsing GO ontology...')
go_parents, go_aspect = parse_go_obo(TRAIN_DIR / 'go-basic.obo')
# go_parents only has entries for terms that have at least one parent.
print(f'Loaded {len(go_parents)} terms with parents')
print(f'Loaded {len(go_aspect)} terms with aspects')

In [None]:
# Get all ancestors for a term
def get_all_ancestors(term, go_parents):
    """Return the set of every term reachable from ``term`` via parent links.

    ``go_parents`` maps a term to its direct parents. ``term`` itself is only
    part of the result if a cycle leads back to it. Unknown terms yield an
    empty set.
    """
    reached = set()
    frontier = [term]

    while frontier:
        node = frontier.pop()
        for parent in go_parents.get(node, ()):
            if parent in reached:
                continue
            reached.add(parent)
            frontier.append(parent)

    return reached

# Root terms
# Maps the root GO term of each of the three ontologies to its one-letter
# aspect code.
# NOTE(review): propagate_predictions below uses its own aspect -> root literal
# rather than this constant.
ROOT_TERMS = {
    'GO:0003674': 'F',  # molecular_function
    'GO:0008150': 'P',  # biological_process  
    'GO:0005575': 'C'   # cellular_component
}

print('Root terms:', ROOT_TERMS)

In [None]:
# Load ESM2 model for embeddings
print('Loading ESM2 model...')
# `esm` is imported here rather than in the first cell, so this cell must run
# before any cell that uses the model.
import esm

# Use esm2_t33_650M_UR50D for good balance of speed and quality
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# Inference only: switch to eval mode and move the model to the GPU.
model = model.eval().cuda()
# Converts a list of (id, sequence) pairs into the token tensor the model takes.
batch_converter = alphabet.get_batch_converter()

print('ESM2 model loaded successfully')

In [None]:
# Generate embeddings in batches
def generate_embeddings(sequences_dict, model, batch_converter, batch_size=8, max_len=1022, repr_layer=33):
    """Generate mean-pooled ESM2 embeddings for a set of protein sequences.

    Args:
        sequences_dict: Mapping of protein ID -> amino-acid sequence string.
        model: ESM2 model, called as
            ``model(tokens, repr_layers=[repr_layer], return_contacts=False)``.
        batch_converter: Callable turning a list of ``(id, sequence)`` pairs
            into a ``(labels, strings, tokens)`` triple.
        batch_size: Number of sequences per forward pass.
        max_len: Sequences are truncated to their first ``max_len`` residues.
        repr_layer: Index of the layer whose token representations are
            pooled. Defaults to 33, the layer this notebook has always used.

    Returns:
        Dict mapping protein ID -> 1-D numpy array (mean over the residue
        positions of the chosen layer). A protein whose sequence is empty gets
        an all-zero vector, because the mean over zero residues would be NaN
        and would propagate into every downstream loss.
    """
    embeddings = {}

    # Truncate once so the tokenised input and the pooling length cannot disagree.
    truncated = {pid: seq[:max_len] for pid, seq in sequences_dict.items()}

    # Sort by length for efficient batching (less padding per batch)
    protein_ids_sorted = sorted(truncated, key=lambda pid: len(sequences_dict[pid]))

    # Feed tokens to whichever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    with torch.no_grad():
        for i in tqdm(range(0, len(protein_ids_sorted), batch_size)):
            batch_ids = protein_ids_sorted[i:i+batch_size]
            batch_data = [(pid, truncated[pid]) for pid in batch_ids]

            _, _, batch_tokens = batch_converter(batch_data)
            batch_tokens = batch_tokens.to(device)

            results = model(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
            token_representations = results['representations'][repr_layer]

            # Mean pooling over sequence length (excluding BOS and EOS tokens):
            # residues occupy positions 1..seq_len, padding comes after.
            for j, pid in enumerate(batch_ids):
                seq_len = len(truncated[pid])
                if seq_len == 0:
                    pooled = token_representations.new_zeros(token_representations.shape[-1])
                else:
                    pooled = token_representations[j, 1:seq_len+1].mean(dim=0)
                embeddings[pid] = pooled.cpu().numpy()

    return embeddings

# Generate train embeddings
# Runs the ESM2 model over every training sequence; the result is a
# {protein_id: embedding vector} dict.
print('Generating train embeddings...')
train_embeddings = generate_embeddings(train_sequences, model, batch_converter, batch_size=8)
print(f'Generated {len(train_embeddings)} train embeddings')

In [None]:
# Save train embeddings
# Stored as two parallel arrays: ids[i] is the protein whose vector is
# embeddings[i] (both follow the dict's iteration order).
train_emb_path = OUTPUT_DIR / 'train_embeddings.npz'
np.savez_compressed(train_emb_path, 
                    ids=np.array(list(train_embeddings.keys())),
                    embeddings=np.array(list(train_embeddings.values())))
print(f'Saved train embeddings to {train_emb_path}')

In [None]:
# Generate test embeddings
# Same procedure as for the training proteins, applied to the test sequences.
print('Generating test embeddings...')
test_embeddings = generate_embeddings(test_sequences, model, batch_converter, batch_size=8)
print(f'Generated {len(test_embeddings)} test embeddings')

# Save test embeddings
# Same layout as the train file: parallel `ids` and `embeddings` arrays.
test_emb_path = OUTPUT_DIR / 'test_embeddings.npz'
np.savez_compressed(test_emb_path,
                    ids=np.array(list(test_embeddings.keys())),
                    embeddings=np.array(list(test_embeddings.values())))
print(f'Saved test embeddings to {test_emb_path}')

In [None]:
# Free GPU memory
# Drop the notebook's reference to the ESM2 model before the MLP cells run,
# then ask PyTorch to release its cached CUDA memory.
# NOTE: this cell is not re-runnable on its own; once `model` is deleted, a
# second run raises NameError until the model-loading cell is run again.
del model
torch.cuda.empty_cache()
print('Freed GPU memory')

In [None]:
# Prepare data for training
# Collect, for every aspect, the alphabetically sorted list of GO terms that
# occur in the training annotations.
aspect_terms = {}
for aspect in ['P', 'F', 'C']:
    terms = sorted(train_terms.loc[train_terms['aspect'] == aspect, 'term'].unique())
    aspect_terms[aspect] = terms
    print(f'Aspect {aspect}: {len(terms)} unique terms')

# Per-aspect lookups between a GO term and its column position in the label /
# prediction matrices (position = rank in the sorted term list).
term_to_idx = {
    aspect: {term: i for i, term in enumerate(terms)}
    for aspect, terms in aspect_terms.items()
}
idx_to_term = {
    aspect: dict(enumerate(terms))
    for aspect, terms in aspect_terms.items()
}

In [None]:
# Create training labels matrix per aspect
def create_label_matrix(train_terms, train_embeddings, term_to_idx, aspect):
    """Create the binary protein x term label matrix for one aspect.

    Args:
        train_terms: DataFrame with 'EntryID', 'term' and 'aspect' columns.
        train_embeddings: Dict keyed by protein ID; its key order defines the
            row order of the returned matrix.
        term_to_idx: Dict ``aspect -> {term: column index}``.
        aspect: Aspect code selecting both the annotation rows and the
            term-index mapping.

    Returns:
        ``(labels, protein_ids)`` where ``labels`` is a float32 array of shape
        ``(len(protein_ids), len(term_to_idx[aspect]))`` with 1.0 for every
        annotated (protein, term) pair, and ``protein_ids`` is the row order.
        Annotations whose protein or term is unknown are ignored.
    """
    protein_ids = list(train_embeddings.keys())
    n_proteins = len(protein_ids)
    n_terms = len(term_to_idx[aspect])

    # Create protein to index mapping
    protein_to_idx = {pid: i for i, pid in enumerate(protein_ids)}

    # Create label matrix
    labels = np.zeros((n_proteins, n_terms), dtype=np.float32)

    # Filter terms for this aspect
    aspect_data = train_terms[train_terms['aspect'] == aspect]

    # Vectorised lookup instead of a Python-level iterrows() loop: Series.map
    # yields NaN for IDs/terms missing from the dictionaries.
    row_idx = aspect_data['EntryID'].map(protein_to_idx)
    col_idx = aspect_data['term'].map(term_to_idx[aspect])
    known = row_idx.notna().to_numpy() & col_idx.notna().to_numpy()

    rows = row_idx.to_numpy()[known].astype(np.intp)
    cols = col_idx.to_numpy()[known].astype(np.intp)
    labels[rows, cols] = 1.0

    return labels, protein_ids

# Create label matrices
# One binary matrix per aspect; rows follow the key order of train_embeddings.
label_matrices = {}
for aspect in ['P', 'F', 'C']:
    labels, protein_ids = create_label_matrix(train_terms, train_embeddings, term_to_idx, aspect)
    label_matrices[aspect] = labels
    # labels.mean() is the fraction of (protein, term) cells that are positive.
    print(f'Aspect {aspect}: labels shape {labels.shape}, positive rate: {labels.mean():.4f}')

In [None]:
# Create embedding matrix
# Rows are stacked in the key order of train_embeddings, the same order
# create_label_matrix uses, so X_train[i] and label_matrices[aspect][i]
# refer to the same protein.
protein_ids = list(train_embeddings.keys())
X_train = np.array([train_embeddings[pid] for pid in protein_ids])
print(f'X_train shape: {X_train.shape}')

In [None]:
# Define MLP model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

class MLPClassifier(nn.Module):
    """Feed-forward multi-label classifier producing raw logits.

    Every hidden layer is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` maps to ``output_dim`` logits (no activation is applied here).
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.2):
        super().__init__()
        # widths[k] -> widths[k + 1] describes the k-th hidden block.
        widths = [input_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stack.append(nn.Linear(widths[-1], output_dim))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.network(x)
        return logits

print('MLP model defined')

In [None]:
# Training function with 5-fold CV
def train_model_cv(X, y, aspect, n_folds=5, epochs=10, batch_size=128, lr=0.001):
    """Train and evaluate an MLP with K-fold cross-validation.

    A fresh MLPClassifier is trained per fold on the GPU and scored on the
    held-out fold with a simplified micro-averaged F1 at a fixed threshold.

    Args:
        X: 2-D numpy array of input features, one row per protein.
        y: 2-D numpy array of binary (0/1) labels, row-aligned with ``X``.
        aspect: Aspect code; only used in the printed summary.
        n_folds: Number of cross-validation folds.
        epochs: Training epochs per fold (also the cosine-schedule length).
        batch_size: Mini-batch size for training.
        lr: Initial AdamW learning rate.

    Returns:
        ``(mean_f1, std_f1, all_val_preds)``: mean and standard deviation of
        the per-fold F1 scores, and an array shaped like ``y`` holding each
        protein's out-of-fold sigmoid prediction.
    """
    input_dim = X.shape[1]
    output_dim = y.shape[1]
    hidden_dims = [864, 712]

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = []
    all_val_preds = np.zeros_like(y)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f'\nFold {fold+1}/{n_folds}')

        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Create dataloaders
        train_dataset = TensorDataset(
            torch.FloatTensor(X_tr),
            torch.FloatTensor(y_tr)
        )
        # BatchNorm1d raises in training mode when a batch holds one sample.
        # Drop the trailing batch only in the case where it would be size 1.
        drop_last = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_last)

        # Initialize model
        model = MLPClassifier(input_dim, hidden_dims, output_dim, dropout=0.2).cuda()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
        criterion = nn.BCEWithLogitsLoss()

        # Training loop
        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.cuda(), batch_y.cuda()

                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                total_loss += loss.item()

            # The learning rate is annealed once per epoch.
            scheduler.step()

            if (epoch + 1) % 5 == 0:
                print(f'  Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

        # Validation
        model.eval()
        with torch.no_grad():
            val_preds = torch.sigmoid(model(torch.FloatTensor(X_val).cuda())).cpu().numpy()

        all_val_preds[val_idx] = val_preds

        # Calculate F1 score (simplified)
        threshold = 0.1
        # Boolean masks and counts give the same TP/FP/FN as multiplying 0/1
        # integer matrices, without allocating several int64 copies of the
        # fold. Labels are expected to be binary.
        pred_positive = val_preds > threshold
        true_positive = y_val.astype(bool)

        # Micro F1
        tp = np.count_nonzero(pred_positive & true_positive)
        fp = np.count_nonzero(pred_positive) - tp
        fn = np.count_nonzero(true_positive) - tp

        # The small epsilon keeps the ratios defined when a count is zero.
        precision = tp / (tp + fp + 1e-10)
        recall = tp / (tp + fn + 1e-10)
        f1 = 2 * precision * recall / (precision + recall + 1e-10)

        fold_scores.append(f1)
        print(f'  Fold {fold+1} F1: {f1:.4f}')

    mean_f1 = np.mean(fold_scores)
    std_f1 = np.std(fold_scores)
    print(f'\nAspect {aspect} - Mean F1: {mean_f1:.4f} ± {std_f1:.4f}')

    return mean_f1, std_f1, all_val_preds

In [None]:
# Train models for each aspect
# Runs 5-fold cross-validation once per aspect and keeps the mean/std F1.
results = {}
for aspect in ['P', 'F', 'C']:
    print(f'\n{"="*50}')
    print(f'Training model for aspect {aspect}')
    print(f'{"="*50}')
    
    y = label_matrices[aspect]
    # val_preds (the out-of-fold predictions) is overwritten on every
    # iteration; only the scores are stored.
    mean_f1, std_f1, val_preds = train_model_cv(X_train, y, aspect, n_folds=5, epochs=10)
    results[aspect] = {'mean_f1': mean_f1, 'std_f1': std_f1}

# Overall CV score
# Unweighted average of the three per-aspect mean F1 values.
overall_f1 = np.mean([results[a]['mean_f1'] for a in ['P', 'F', 'C']])
print(f'\n{"="*50}')
print(f'Overall CV F1: {overall_f1:.4f}')
print(f'{"="*50}')

In [None]:
# Train final models on full data for submission
def train_final_model(X, y, epochs=10, batch_size=128, lr=0.001):
    """Train one MLPClassifier on the full data set (no validation split).

    Args:
        X: 2-D numpy array of input features, one row per protein.
        y: 2-D numpy array of binary labels, row-aligned with ``X``.
        epochs: Number of training epochs (also the cosine-schedule length).
        batch_size: Mini-batch size.
        lr: Initial AdamW learning rate.

    Returns:
        The trained model, still on the GPU and still in training mode;
        callers must switch it to ``eval()`` before predicting.
    """
    input_dim = X.shape[1]
    output_dim = y.shape[1]
    hidden_dims = [864, 712]

    train_dataset = TensorDataset(
        torch.FloatTensor(X),
        torch.FloatTensor(y)
    )
    # BatchNorm1d raises in training mode when a batch holds one sample.
    # Drop the trailing batch only in the case where it would be size 1.
    drop_last = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_last)

    model = MLPClassifier(input_dim, hidden_dims, output_dim, dropout=0.2).cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.cuda(), batch_y.cuda()

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        # The learning rate is annealed once per epoch.
        scheduler.step()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

    return model

# One final model per aspect, each trained on all training proteins.
final_models = {}
for aspect in ['P', 'F', 'C']:
    print(f'\nTraining final model for aspect {aspect}')
    y = label_matrices[aspect]
    final_models[aspect] = train_final_model(X_train, y, epochs=10)

In [None]:
# Generate predictions for test set
print('Generating test predictions...')

# Create test embedding matrix
# Row i of X_test belongs to test_protein_ids[i]; the submission cell indexes
# the prediction matrices by that same position.
test_protein_ids = list(test_embeddings.keys())
X_test = np.array([test_embeddings[pid] for pid in test_protein_ids])
print(f'X_test shape: {X_test.shape}')

# Generate predictions per aspect
# test_predictions[aspect] is a (n_test_proteins, n_terms_in_aspect) array of
# sigmoid scores.
test_predictions = {}
for aspect in ['P', 'F', 'C']:
    print(f'Predicting for aspect {aspect}...')
    model = final_models[aspect]
    # eval() makes BatchNorm use running statistics and disables dropout.
    model.eval()
    
    # Batch prediction
    batch_size = 256
    all_preds = []
    
    with torch.no_grad():
        for i in range(0, len(X_test), batch_size):
            batch_X = torch.FloatTensor(X_test[i:i+batch_size]).cuda()
            preds = torch.sigmoid(model(batch_X)).cpu().numpy()
            all_preds.append(preds)
    
    # Stack the per-batch results back into one matrix in test-protein order.
    test_predictions[aspect] = np.vstack(all_preds)
    print(f'  Predictions shape: {test_predictions[aspect].shape}')

In [None]:
# Apply GO term propagation
def propagate_predictions(protein_id, predictions, aspect, idx_to_term, go_parents, threshold=0.01):
    """Propagate one protein's term scores up the GO hierarchy.

    Every term scoring strictly above ``threshold`` is kept, and each of its
    ancestors receives at least that score (the maximum over all contributing
    descendants). The aspect's root term is always added with score 1.0.

    Args:
        protein_id: Unused; kept so existing call sites stay valid.
        predictions: 1-D sequence of scores, indexed like ``idx_to_term[aspect]``.
        aspect: Aspect code, one of 'P', 'F', 'C'.
        idx_to_term: Dict ``aspect -> {column index: GO term}``.
        go_parents: Dict mapping a term to its direct parents.
        threshold: Scores at or below this value are discarded.

    Returns:
        Dict mapping GO term -> propagated score.
    """
    term_scores = {}
    scores = np.asarray(predictions)
    index_lookup = idx_to_term[aspect]

    # Visit only the above-threshold positions (in ascending index order)
    # instead of looping over every term score in Python.
    for idx in np.flatnonzero(scores > threshold).tolist():
        score = scores[idx]
        term = index_lookup[idx]
        term_scores[term] = max(term_scores.get(term, 0), score)

        # Propagate to ancestors
        for ancestor in get_all_ancestors(term, go_parents):
            term_scores[ancestor] = max(term_scores.get(ancestor, 0), score)

    # Add root term with score 1.0
    root_term = {'P': 'GO:0008150', 'F': 'GO:0003674', 'C': 'GO:0005575'}[aspect]
    term_scores[root_term] = 1.0

    return term_scores

print('Propagating predictions...')

In [None]:
# Generate submission
print('Generating submission...')

# Flat list of (protein_id, GO term, score) tuples across all three aspects.
submission_rows = []
# Per-aspect minimum score a term needs before it is kept and propagated.
thresholds = {'P': 0.05, 'F': 0.1, 'C': 0.1}

for i, protein_id in enumerate(tqdm(test_protein_ids)):
    for aspect in ['P', 'F', 'C']:
        # Row i of each prediction matrix belongs to test_protein_ids[i].
        predictions = test_predictions[aspect][i]
        threshold = thresholds[aspect]
        
        # Propagate predictions
        term_scores = propagate_predictions(
            protein_id, predictions, aspect, 
            idx_to_term, go_parents, threshold=threshold
        )
        
        # Add to submission
        for term, score in term_scores.items():
            submission_rows.append((protein_id, term, score))

print(f'Total submission rows: {len(submission_rows)}')

In [None]:
# Create submission dataframe and save
submission_df = pd.DataFrame(submission_rows, columns=['protein_id', 'GO_term', 'confidence'])

# Sort by protein_id and confidence (highest confidence first within a protein)
submission_df = submission_df.sort_values(['protein_id', 'confidence'], ascending=[True, False])

# Save submission (TSV format, no header)
# The file keeps its .csv name even though the content is tab-separated.
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing directories, so make sure the target exists
# (same approach as for OUTPUT_DIR in the setup cell).
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, sep='\t', header=False, index=False)

print(f'Submission saved to {submission_path}')
print(f'Submission shape: {submission_df.shape}')
print(f'\nSample submission:')
print(submission_df.head(20))

In [None]:
# Print final results
# Recap of the cross-validation scores computed earlier; nothing is recomputed
# here, so this cell needs `results` and `overall_f1` from the CV cell.
print('\n' + '='*50)
print('FINAL RESULTS')
print('='*50)
for aspect in ['P', 'F', 'C']:
    print(f"Aspect {aspect}: F1 = {results[aspect]['mean_f1']:.4f} ± {results[aspect]['std_f1']:.4f}")
print(f'\nOverall CV F1: {overall_f1:.4f}')
print('='*50)