In [1]:
# Standard imports
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

print("‚úÖ Imports successful")

‚úÖ Imports successful


In [2]:
# ----------------------------------------
# Run configuration (everything a reader might tune)
# ----------------------------------------
# Where the notebook runs: 'local' or 'kaggle'.
ENVIRONMENT = 'local'

# Per-aspect probability cut-offs (taken from notebook 04).
THRESHOLDS = dict(MF=0.40, BP=0.20, CC=0.40)

# Upper bound on the number of GO terms kept for one protein.
MAX_TERMS_PER_PROTEIN = 1500

# Echo the active settings so they are recorded with the cell output.
print(f"üîß Environment: {ENVIRONMENT.upper()}")
print(f"üìä Thresholds: MF={THRESHOLDS['MF']}, BP={THRESHOLDS['BP']}, CC={THRESHOLDS['CC']}")

üîß Environment: LOCAL
üìä Thresholds: MF=0.4, BP=0.2, CC=0.4


In [3]:
# Resolve the project root and the folder that receives submission files.
if ENVIRONMENT == 'kaggle':
    base_dir, output_dir = Path("/kaggle/input/cafa-6-dataset"), Path("/kaggle/working")
else:
    # When started from a folder named 'notebooks', the root is one level up;
    # otherwise the current working directory is taken as the root.
    working_dir = Path.cwd()
    base_dir = working_dir.parent if working_dir.name == 'notebooks' else working_dir
    output_dir = base_dir / "submissions"

# Make sure the output folder is present (no error if it already exists).
output_dir.mkdir(exist_ok=True)

# Input files and model artifacts, all located relative to base_dir.
TEST_FASTA = base_dir / 'Test' / 'testsuperset.fasta'
TRAIN_TERMS = base_dir / 'Train' / 'train_terms.tsv'
KNN_OUTPUT = base_dir / 'outputs' / 'knn_baseline'

print(f"üìÅ Base directory: {base_dir}")
print(f"üìÅ Output directory: {output_dir}")
print(f"üìÑ Test FASTA: {TEST_FASTA.name}")

üìÅ Base directory: c:\Users\Olale\Documents\Codebase\Science\cafa-6-protein-function-prediction
üìÅ Output directory: c:\Users\Olale\Documents\Codebase\Science\cafa-6-protein-function-prediction\submissions
üìÑ Test FASTA: testsuperset.fasta


## 1. Load Test Protein IDs

In [4]:
from Bio import SeqIO

print("Loading test proteins...")

# test_proteins keeps the IDs in file order; test_sequences maps ID -> sequence.
test_proteins = []
test_sequences = {}

for record in SeqIO.parse(TEST_FASTA, "fasta"):
    header = record.id
    if "|" not in header:
        # Plain header: the ID is the first whitespace-separated token.
        protein_id = header.split()[0]
    else:
        # Pipe-delimited header: the ID is taken from the second field
        # (presumably UniProt-style db|ACCESSION|NAME -- verify against the file).
        parts = header.split("|")
        protein_id = parts[1] if len(parts) >= 2 else header

    test_sequences[protein_id] = str(record.seq)
    test_proteins.append(protein_id)

print(f"‚úÖ Loaded {len(test_proteins):,} test proteins")
print(f"   First 5: {test_proteins[:5]}")

Loading test proteins...
‚úÖ Loaded 224,309 test proteins
   First 5: ['A0A0C5B5G6', 'A0A1B0GTW7', 'A0JNW5', 'A0JP26', 'A0PK11']
‚úÖ Loaded 224,309 test proteins
   First 5: ['A0A0C5B5G6', 'A0A1B0GTW7', 'A0JNW5', 'A0JP26', 'A0PK11']


## 2. Load KNN Model Artifacts

In [5]:
import json
import pickle

print("Loading KNN model artifacts...")

# Fail early, with an actionable message, when the artifact folder is absent.
if not KNN_OUTPUT.exists():
    raise FileNotFoundError(
        f"KNN outputs not found at {KNN_OUTPUT}\n"
        "Run notebook 02_baseline_knn.ipynb first to generate predictions."
    )

# Load metadata. The encoding is given explicitly so that reading the JSON
# does not depend on the platform's default text encoding.
with open(KNN_OUTPUT / "metadata.json", 'r', encoding='utf-8') as f:
    metadata = json.load(f)

print(f"‚úÖ KNN Model: k={metadata['k_neighbors']}")
print(f"   Embedding: {metadata['embedding_model']}")

# Handle both old and new metadata formats
if 'aspect_specific_f1' in metadata:
    print(f"   Validation F1: {metadata['aspect_specific_f1']:.4f} (aspect-specific)")
    # Report the thresholds stored with the model. They are only displayed
    # here; the filtering step uses the THRESHOLDS constant from the
    # configuration cell, so any difference is called out explicitly.
    if 'optimal_thresholds' in metadata:
        print(f"   Optimal thresholds: MF={metadata['optimal_thresholds']['MF']}, "
              f"BP={metadata['optimal_thresholds']['BP']}, CC={metadata['optimal_thresholds']['CC']}")
        threshold_mismatch = {
            aspect: (THRESHOLDS[aspect], metadata['optimal_thresholds'].get(aspect))
            for aspect in THRESHOLDS
            if metadata['optimal_thresholds'].get(aspect) != THRESHOLDS[aspect]
        }
        if threshold_mismatch:
            mismatch_details = ", ".join(
                f"{aspect}: {configured} (configured) vs {stored} (metadata)"
                for aspect, (configured, stored) in threshold_mismatch.items()
            )
            print(f"   NOTE: THRESHOLDS differs from metadata -> {mismatch_details}")
elif 'best_f1' in metadata:
    print(f"   Validation F1: {metadata['best_f1']:.4f} (single threshold)")
else:
    print("   Validation F1: (not available)")

# Load vocabulary (optional artifact; vocab stays None when the file is absent)
vocab_path = KNN_OUTPUT / "vocab.json"
if vocab_path.exists():
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    print(f"   Vocabulary size: {len(vocab)} terms")
else:
    print("   ‚ö†Ô∏è vocab.json not found, will extract from predictions")
    vocab = None

Loading KNN model artifacts...
‚úÖ KNN Model: k=10
   Embedding: facebook/esm2_t6_8M_UR50D
   Validation F1: 0.2604 (aspect-specific)
   Optimal thresholds: MF=0.5, BP=0.25, CC=0.35
   ‚ö†Ô∏è vocab.json not found, will extract from predictions


In [6]:
# Build the GO term -> aspect letter lookup from the training annotations.
print("\nLoading term-to-aspect mapping...")

train_terms_df = pd.read_csv(TRAIN_TERMS, sep='\t')
term_to_aspect = {
    go_term: aspect_letter
    for go_term, aspect_letter in zip(train_terms_df['term'], train_terms_df['aspect'])
}

# Single-letter aspect codes -> the aspect names used as THRESHOLDS keys.
aspect_letter_to_name = {'F': 'MF', 'P': 'BP', 'C': 'CC'}

print(f"‚úÖ Loaded aspect mapping for {len(term_to_aspect):,} terms")


Loading term-to-aspect mapping...
‚úÖ Loaded aspect mapping for 26,125 terms
‚úÖ Loaded aspect mapping for 26,125 terms


## 3. Generate Predictions for Test Set

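**Note:** This step requires the test predictions exported by the KNN notebook (`test_predictions.parquet`). If they are not available, the notebook stops with an error instead of generating placeholder predictions.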

In [7]:
# Look for test-set predictions exported by the KNN notebook.
test_predictions_path = KNN_OUTPUT / "test_predictions.parquet"

if test_predictions_path.exists():
    print("Loading pre-computed test predictions...")
    test_pred_df = pd.read_parquet(test_predictions_path)
    print(f"‚úÖ Loaded {len(test_pred_df):,} predictions")
else:
    # Leave a None marker. No placeholder data is produced here; the cell
    # that follows raises FileNotFoundError when test_pred_df is None.
    print("‚ö†Ô∏è Pre-computed test predictions not found.")
    print("   Need to run KNN inference on test set.")
    print("   This requires:")
    print("   1. Test sequence embeddings")
    print("   2. Training embeddings + labels")
    print("   3. KNN model")
    print("\n   No placeholder will be created; the next cell stops the run.")
    test_pred_df = None

Loading pre-computed test predictions...
‚úÖ Loaded 9,802,962 predictions
‚úÖ Loaded 9,802,962 predictions


In [8]:
# Abort the run when no pre-computed predictions were loaded, so that no
# meaningless submission can be produced further down.
if test_pred_df is None:
    banner = "="*60
    failure_report = [
        banner,
        "‚ùå CANNOT GENERATE SUBMISSION",
        banner,
        "\n‚ö†Ô∏è  test_predictions.parquet not found!",
        f"    Expected at: {test_predictions_path}",
        "\nüìã Required steps:",
        "   1. Run notebook 02_baseline_knn.ipynb",
        "   2. Set GENERATE_TEST_PREDICTIONS = True",
        "   3. Re-run this notebook",
        "\nüí° The notebook will NOT generate placeholder predictions.",
        "   This prevents submitting meaningless results.",
    ]
    for report_line in failure_report:
        print(report_line)

    raise FileNotFoundError(
        f"Test predictions not found at {test_predictions_path}. "
        "Run notebook 02 with GENERATE_TEST_PREDICTIONS=True first."
    )

## 4. Apply Aspect-Specific Thresholds

In [9]:
def apply_aspect_thresholds(pred_df, term_to_aspect, thresholds, aspect_letter_to_name):
    """
    Filter predictions using aspect-specific thresholds.

    A row is kept when its probability is >= the threshold of the aspect its
    term belongs to. Rows for which no threshold can be resolved (term not in
    ``term_to_aspect``, letter not in ``aspect_letter_to_name``, or aspect name
    not in ``thresholds``) fall back to the highest configured threshold.

    Args:
        pred_df: DataFrame with [EntryID, term, probability]. Not modified.
        term_to_aspect: Dict mapping term -> aspect letter (F/P/C)
        thresholds: Dict mapping aspect name -> threshold (must be non-empty)
        aspect_letter_to_name: Dict mapping F->MF, P->BP, C->CC

    Returns:
        Filtered DataFrame with columns [EntryID, term, probability]; the
        original index labels of the kept rows are preserved.

    Raises:
        ValueError: if ``thresholds`` is empty.
    """
    if not thresholds:
        raise ValueError("thresholds must contain at least one aspect")

    print("Applying aspect-specific thresholds...")

    # Resolve aspect name and threshold per row as stand-alone Series, so the
    # (potentially very large) input frame is neither copied nor mutated.
    aspect = pred_df['term'].map(term_to_aspect).map(aspect_letter_to_name)
    resolved_threshold = aspect.map(thresholds)

    # Handle missing aspects (use highest threshold as default)
    default_threshold = max(thresholds.values())
    row_threshold = resolved_threshold.fillna(default_threshold)

    # Filter by threshold
    keep = pred_df['probability'] >= row_threshold
    filtered = pred_df.loc[keep, ['EntryID', 'term', 'probability']].copy()

    print(f"   Before filtering: {len(pred_df):,} predictions")
    print(f"   After filtering:  {len(filtered):,} predictions")

    # Stats per aspect: iterate over the aspects that were actually passed in,
    # so a partial thresholds dict does not raise KeyError.
    before_counts = aspect.value_counts()
    after_counts = aspect[keep].value_counts()
    for aspect_name, cutoff in thresholds.items():
        before = int(before_counts.get(aspect_name, 0))
        after = int(after_counts.get(aspect_name, 0))
        print(f"   {aspect_name}: {before:,} ‚Üí {after:,} (threshold={cutoff})")

    # Make the silent fallback visible when it was used.
    n_defaulted = int(resolved_threshold.isna().sum())
    if n_defaulted:
        print(f"   Rows without an aspect threshold: {n_defaulted:,} "
              f"(default threshold={default_threshold} applied)")

    return filtered

In [10]:
# Filter the raw predictions; carry a None marker forward when there are none.
if test_pred_df is None:
    print("‚ùå No predictions to filter")
    filtered_df = None
else:
    filtered_df = apply_aspect_thresholds(
        pred_df=test_pred_df,
        term_to_aspect=term_to_aspect,
        thresholds=THRESHOLDS,
        aspect_letter_to_name=aspect_letter_to_name,
    )

Applying aspect-specific thresholds...
   Before filtering: 9,802,962 predictions
   After filtering:  3,153,866 predictions
   Before filtering: 9,802,962 predictions
   After filtering:  3,153,866 predictions
   MF: 1,925,633 ‚Üí 425,899 (threshold=0.4)
   MF: 1,925,633 ‚Üí 425,899 (threshold=0.4)
   BP: 5,350,528 ‚Üí 2,198,926 (threshold=0.2)
   BP: 5,350,528 ‚Üí 2,198,926 (threshold=0.2)
   CC: 2,526,801 ‚Üí 529,041 (threshold=0.4)
   CC: 2,526,801 ‚Üí 529,041 (threshold=0.4)


## 5. Limit Terms Per Protein

In [11]:
def limit_terms_per_protein(pred_df, max_terms=1500):
    """
    Keep at most ``max_terms`` rows per protein, preferring high probability.

    Args:
        pred_df: DataFrame with [EntryID, term, probability]
        max_terms: Maximum terms per protein

    Returns:
        DataFrame ordered by EntryID (ascending) and probability (descending),
        holding only the leading ``max_terms`` rows of every protein.
    """
    print(f"Limiting to max {max_terms} terms per protein...")

    # How many proteins exceed the cap in the incoming data (reporting only).
    per_protein_counts = pred_df.groupby('EntryID').size()
    n_over_cap = (per_protein_counts > max_terms).sum()

    # Rank rows so each protein's most confident terms come first ...
    ranked = pred_df.sort_values(
        by=['EntryID', 'probability'],
        ascending=[True, False],
    )
    # ... then take the leading rows of every protein group.
    limited = ranked.groupby('EntryID').head(max_terms)

    print(f"   Proteins with >{max_terms} terms: {n_over_cap:,}")
    print(f"   Total predictions: {len(pred_df):,} ‚Üí {len(limited):,}")

    return limited

In [12]:
# Cap the number of terms per protein, or carry the None marker forward
# when the filtering step produced nothing.
limited_df = (
    limit_terms_per_protein(filtered_df, MAX_TERMS_PER_PROTEIN)
    if filtered_df is not None
    else None
)

Limiting to max 1500 terms per protein...
   Proteins with >1500 terms: 0
   Total predictions: 3,153,866 ‚Üí 3,153,866
   Proteins with >1500 terms: 0
   Total predictions: 3,153,866 ‚Üí 3,153,866


## 6. Format and Save Submission

In [13]:
def create_submission(pred_df, output_path, version="v1"):
    """
    Create competition submission file.

    Format: EntryID<tab>GO_term<tab>confidence
    No header, confidence written with exactly 3 decimal places.

    Args:
        pred_df: DataFrame with [EntryID, term, probability]. Not modified.
        output_path: Destination of the tab-separated file.
        version: Accepted for backward compatibility; not used by the
            current implementation.

    Returns:
        The submission DataFrame (columns renamed to GO_term / confidence,
        confidence rounded and floored at 0.001, sorted by EntryID ascending
        and confidence descending).
    """
    print(f"Creating submission file...")

    # rename() returns a new frame, so the caller's data is left untouched.
    submission = pred_df.rename(columns={'term': 'GO_term', 'probability': 'confidence'})

    # Round to 3 decimals, then floor at 0.001 so that values which round to
    # zero still satisfy "confidence > 0" (competition requirement).
    submission['confidence'] = submission['confidence'].round(3).clip(lower=0.001)

    # Sort by EntryID then confidence (descending)
    submission = submission.sort_values(
        ['EntryID', 'confidence'],
        ascending=[True, False]
    )

    # Save. float_format pins the textual representation to 3 decimals;
    # without it values such as 0.25 or 1.0 are written with fewer digits.
    submission[['EntryID', 'GO_term', 'confidence']].to_csv(
        output_path,
        sep='\t',
        header=False,
        index=False,
        float_format='%.3f'
    )

    # Guard the average against an empty submission (no proteins at all).
    n_proteins = submission['EntryID'].nunique()
    avg_terms = len(submission) / n_proteins if n_proteins else 0.0

    print(f"\n‚úÖ Saved to: {output_path}")
    print(f"   Total rows: {len(submission):,}")
    print(f"   Unique proteins: {n_proteins:,}")
    print(f"   Unique GO terms: {submission['GO_term'].nunique():,}")
    print(f"   Avg terms/protein: {avg_terms:.1f}")
    print(f"   Confidence range: [{submission['confidence'].min():.3f}, {submission['confidence'].max():.3f}]")

    return submission

In [14]:
# Write the submission file, unless the earlier steps produced nothing.
if limited_df is None:
    print("‚ùå No predictions available for submission")
else:
    # The file name carries a timestamp with minute resolution.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    submission_filename = f"submission_knn_aspect_{timestamp}.tsv"
    submission_path = output_dir / submission_filename

    submission = create_submission(limited_df, submission_path)

Creating submission file...

‚úÖ Saved to: c:\Users\Olale\Documents\Codebase\Science\cafa-6-protein-function-prediction\submissions\submission_knn_aspect_20251125_1255.tsv
   Total rows: 3,153,866
   Unique proteins: 224,309
   Unique GO terms: 22,530
   Avg terms/protein: 14.1
   Confidence range: [0.200, 1.000]


## 7. Validate Submission

In [15]:
def validate_submission(filepath, max_terms=1500):
    """
    Validate submission file format for CAFA-6 competition.

    Checks performed on the tab-separated, header-less file:
      1. exactly three columns (EntryID, GO_term, confidence)
      2. confidence is numeric and within (0, 1]
      3. at most ``max_terms`` terms per protein
      4. every GO term matches ``GO:`` followed by 7 digits
      5. no duplicate (EntryID, GO_term) pairs

    Args:
        filepath: Path of the submission file to check.
        max_terms: Maximum number of terms allowed per protein.

    Returns:
        True when every check passes, False otherwise. Problems are printed;
        an empty or unparseable file is reported as a failure, not raised.
    """
    print(f"Validating: {filepath}")
    print("=" * 50)

    def _report_failure(problems):
        """Print the collected problems and return False."""
        print("‚ùå VALIDATION FAILED:")
        for problem in problems:
            print(f"   - {problem}")
        return False

    # Load submission WITHOUT forcing column names: passing names= would make
    # the frame three columns wide no matter what the file contains, and the
    # column-count check could never fail. Everything is read as text so the
    # IDs are not reinterpreted; confidence is converted explicitly below.
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, dtype=str)
    except pd.errors.EmptyDataError:
        return _report_failure(["File is empty"])
    except pd.errors.ParserError as exc:
        return _report_failure([f"Could not parse file: {exc}"])

    # Check 1: Three columns (the remaining checks need the named columns)
    if len(df.columns) != 3:
        return _report_failure([f"Expected 3 columns, got {len(df.columns)}"])
    df.columns = ['EntryID', 'GO_term', 'confidence']

    errors = []

    missing_ids = int(df['EntryID'].isna().sum())
    if missing_ids > 0:
        errors.append(f"Found {missing_ids} rows with a missing EntryID")

    # Check 2: Confidence range (0, 1]
    # Unparseable or missing values become NaN and are reported, because
    # min()/max() skip NaN and would otherwise let them through.
    df['confidence'] = pd.to_numeric(df['confidence'], errors='coerce')
    bad_confidence = int(df['confidence'].isna().sum())
    if bad_confidence > 0:
        errors.append(f"Found {bad_confidence} missing or non-numeric confidence values")
    if df['confidence'].min() <= 0:
        errors.append(f"Confidence must be > 0 (min: {df['confidence'].min()})")
    if df['confidence'].max() > 1:
        errors.append(f"Confidence must be <= 1 (max: {df['confidence'].max()})")

    # Check 3: Max terms per protein
    terms_per_protein = df.groupby('EntryID').size()
    if terms_per_protein.max() > max_terms:
        errors.append(f"Max {max_terms} terms/protein (found {terms_per_protein.max()})")

    # Check 4: GO term format (na=False makes a missing term count as invalid
    # instead of breaking the boolean negation)
    invalid_terms = ~df['GO_term'].str.match(r'^GO:\d{7}$', na=False)
    if invalid_terms.any():
        bad_examples = df.loc[invalid_terms, 'GO_term'].head(3).tolist()
        errors.append(f"Invalid GO term format: {bad_examples}")

    # Check 5: No duplicates
    duplicates = df.duplicated(subset=['EntryID', 'GO_term']).sum()
    if duplicates > 0:
        errors.append(f"Found {duplicates} duplicate (EntryID, GO_term) pairs")

    if errors:
        return _report_failure(errors)

    print("‚úÖ VALIDATION PASSED!")
    print(f"\nüìä Summary:")
    print(f"   Total rows: {len(df):,}")
    print(f"   Unique proteins: {df['EntryID'].nunique():,}")
    print(f"   Unique GO terms: {df['GO_term'].nunique():,}")
    print(f"   Avg terms/protein: {terms_per_protein.mean():.1f}")
    print(f"   Max terms/protein: {terms_per_protein.max()}")
    print(f"   Median confidence: {df['confidence'].median():.3f}")
    return True

In [16]:
# Validate the file written above; announce it only when every check passed.
if limited_df is not None:
    is_valid = validate_submission(submission_path)
    upload_notice = f"\nüéâ Submission ready for upload: {submission_path}"

    if is_valid:
        print(upload_notice)

Validating: c:\Users\Olale\Documents\Codebase\Science\cafa-6-protein-function-prediction\submissions\submission_knn_aspect_20251125_1255.tsv
‚úÖ VALIDATION PASSED!

üìä Summary:
   Total rows: 3,153,866
   Unique proteins: 224,309
   Unique GO terms: 22,530
   Avg terms/protein: 14.1
   Max terms/protein: 304
   Median confidence: 0.333

üéâ Submission ready for upload: c:\Users\Olale\Documents\Codebase\Science\cafa-6-protein-function-prediction\submissions\submission_knn_aspect_20251125_1255.tsv


## 8. Preview Submission

In [17]:
# Show the head of the written file, read back from disk.
if limited_df is not None:
    print("First 20 rows of submission:")
    print("=" * 50)

    preview = pd.read_csv(
        submission_path,
        sep='\t',
        header=None,
        names=['EntryID', 'GO_term', 'confidence'],
        nrows=20,
    )
    print(preview.to_string(index=False))

First 20 rows of submission:
   EntryID    GO_term  confidence
A0A017SE81 GO:0016020       1.000
A0A017SE81 GO:0004745       0.999
A0A017SE81 GO:0047023       0.751
A0A017SE81 GO:0047044       0.751
A0A017SE81 GO:0047024       0.501
A0A017SE81 GO:0000140       0.501
A0A017SE81 GO:0008611       0.501
A0A017SE81 GO:0005737       0.501
A0A017SE81 GO:0004303       0.500
A0A017SE81 GO:0047035       0.500
A0A017SE81 GO:0005515       0.500
A0A017SE81 GO:0062175       0.250
A0A017SE81 GO:0006710       0.250
A0A017SE81 GO:0006355       0.250
A0A017SE81 GO:0050873       0.250
A0A017SE81 GO:0010468       0.250
A0A017SE81 GO:0030223       0.250
A0A017SE81 GO:0120161       0.250
A0A017SE81 GO:0006656       0.250
A0A017SE81 GO:0006954       0.250


## 9. Summary

In [18]:
# Final recap of the configuration and of the file that was produced.
summary_rule = "="*60

print(summary_rule)
print("üìã SUBMISSION GENERATION SUMMARY")
print(summary_rule)

print("\nüîß Configuration:")
print("   Model: KNN with ESM-2 embeddings")
print(f"   Thresholds: MF={THRESHOLDS['MF']}, BP={THRESHOLDS['BP']}, CC={THRESHOLDS['CC']}")
print(f"   Max terms/protein: {MAX_TERMS_PER_PROTEIN}")

# Submission statistics exist only when a file was written.
if limited_df is not None:
    print("\nüìä Submission Stats:")
    print(f"   File: {submission_path.name}")
    print(f"   Total predictions: {len(limited_df):,}")
    print(f"   Proteins covered: {limited_df['EntryID'].nunique():,}")

# NOTE(review): this F1 is a hard-coded literal, not read from metadata.json
# (whose recorded aspect-specific F1 in this run was 0.2604) -- confirm which
# value belongs to the thresholds configured above.
print("\nüéØ Expected Validation F1: ~0.2579")
print("   (Based on per-aspect CAFA metric with optimal thresholds)")

print("\n" + summary_rule)

üìã SUBMISSION GENERATION SUMMARY

üîß Configuration:
   Model: KNN with ESM-2 embeddings
   Thresholds: MF=0.4, BP=0.2, CC=0.4
   Max terms/protein: 1500

üìä Submission Stats:
   File: submission_knn_aspect_20251125_1255.tsv
   Total predictions: 3,153,866
   Proteins covered: 224,309

üéØ Expected Validation F1: ~0.2579
   (Based on per-aspect CAFA metric with optimal thresholds)



---

## Next Steps

1. **If predictions missing:** Run notebook 02 with `GENERATE_TEST_PREDICTIONS=True`
2. **Upload to Kaggle:** Submit the generated `.tsv` file
3. **Improve score:** 
   - Re-evaluate ESM-2 with per-aspect metric
   - Scale to ESM-2 150M/650M
   - Build ensemble (KNN + ESM-2)