In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Full EDA of CAFA 6

In [None]:
# ============================================================================
# CAFA 6 PROTEIN FUNCTION PREDICTION - ENHANCED VERSION (FIXED)
# ============================================================================

# CONFIGURATION - adjust this to trade speed against accuracy
SAMPLE_PERCENT = 100  # Percentage of the data to use; values below 100 subsample the IA weights and the training proteins
QUICK_MODE = True   # Only echoed in the startup banner in the code visible here. NOTE(review): the previous comment said "Enable full feature computation", which contradicts the flag's name - confirm the intended meaning

# Package Installation
import subprocess
import sys

def install(package):
    """Quietly pip-install ``package`` using the interpreter that runs this notebook.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

print("Installing required packages...")
# Import the third-party parsers, installing them on demand.
# Only ImportError triggers the install: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and any unrelated error raised during import.
try:
    import obonet
except ImportError:
    install('obonet')
    import obonet

try:
    from Bio import SeqIO
except ImportError:
    install('biopython')
    from Bio import SeqIO

# Core Imports
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

import networkx as nx
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("="*80)
print("CAFA 6 PROTEIN FUNCTION PREDICTION - ENHANCED STARTER")
print(f"üìä SAMPLE MODE: {SAMPLE_PERCENT}% of data")
print(f"‚ö° QUICK MODE: {'ON' if QUICK_MODE else 'OFF'}")
print("="*80)

# ============================================================================
# 1. DEFINE PATHS
# ============================================================================
BASE = Path('/kaggle/input/cafa-6-protein-function-prediction')
TRAIN_DIR = BASE / 'Train'
TEST_DIR = BASE / 'Test'

# ============================================================================
# 2. LOAD GO ONTOLOGY (WITH HIERARCHY ANALYSIS)
# ============================================================================
print("\n[1/9] Loading GO ontology...")
go_graph = obonet.read_obo(TRAIN_DIR / 'go-basic.obo')
print(f"   ‚úì Loaded {len(go_graph)} GO terms")

# Map terms to ontologies
# Translate each GO namespace into its three-letter ontology code and collect
# the human-readable name of every term that has one.
namespace_codes = {
    'biological_process': 'BPO',
    'cellular_component': 'CCO',
    'molecular_function': 'MFO',
}
term_to_ont = {}
term_names = {}
for term_id, attrs in go_graph.nodes(data=True):
    ont_code = namespace_codes.get(attrs.get('namespace'))
    if ont_code is not None:
        term_to_ont[term_id] = ont_code
    if 'name' in attrs:
        term_names[term_id] = attrs['name']

# Number of terms per ontology code, most frequent first.
ont_counts = pd.Series(term_to_ont).value_counts()
print(f"   ‚úì Ontology breakdown: MF={ont_counts.get('MFO',0)}, BP={ont_counts.get('BPO',0)}, CC={ont_counts.get('CCO',0)}")

# Analyze GO hierarchy depth (sample for speed)
def get_term_depth(graph, term_id):
    """Return how many edges separate a GO term from its ontology root.

    For each of the three hard-coded root terms, the shortest directed path
    from ``term_id`` to that root is measured; the largest of the lengths
    found is returned.

    Args:
        graph: Directed graph searched from ``term_id`` towards the roots.
        term_id: Identifier of the node to measure.

    Returns:
        int: The path length, or 0 when no root is reachable or ``term_id``
        is not a node of ``graph``.
    """
    depths = []
    for root in ('GO:0008150', 'GO:0005575', 'GO:0003674'):
        try:
            # A single search per root: shortest_path_length itself signals
            # "no path", so a separate has_path() call would double the work.
            depths.append(nx.shortest_path_length(graph, term_id, root))
        except nx.NetworkXException:
            # Covers NetworkXNoPath (root unreachable) and NodeNotFound
            # (term or root missing). Handled per root so that one failing
            # root cannot discard a depth already found for another.
            continue
    return max(depths) if depths else 0

print("   Computing GO hierarchy depths...")
sample_terms_for_depth = list(term_to_ont.keys())[:1000]
term_depths = {term: get_term_depth(go_graph, term) for term in sample_terms_for_depth}

# Visualize ontology with enhanced graphics
fig = plt.figure(figsize=(18, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Main ontology distribution
ax1 = fig.add_subplot(gs[0, :2])
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
# value_counts() sorts by frequency, so the bar order is data-dependent.
# Pin an explicit MF/BP/CC order so that every bar lines up with its tick
# label and with the colour-to-ontology legend drawn further down.
ont_order = ['MFO', 'BPO', 'CCO']
ont_counts_ordered = ont_counts.reindex(ont_order, fill_value=0)
bars = ax1.bar(range(len(ont_counts_ordered)), ont_counts_ordered.values, color=colors,
               edgecolor='black', linewidth=2, alpha=0.8)
ax1.set_xticks(range(len(ont_counts_ordered)))
ax1.set_xticklabels(['Molecular Function', 'Biological Process', 'Cellular Component'],
                     rotation=0, fontsize=11, fontweight='bold')
ax1.set_title('GO Term Distribution by Ontology', fontsize=14, fontweight='bold', pad=20)
ax1.set_ylabel('Number of Terms', fontsize=12, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)
total_terms = ont_counts_ordered.sum()
for v, bar in zip(ont_counts_ordered.values, bars):
    height = bar.get_height()
    # Guard the percentage against an empty ontology table.
    share = v / total_terms * 100 if total_terms else 0.0
    ax1.text(bar.get_x() + bar.get_width()/2., height,
             f'{v:,}\n({share:.1f}%)',
             ha='center', va='bottom', fontweight='bold', fontsize=11)

# Hierarchy depth distribution
ax2 = fig.add_subplot(gs[0, 2])
depth_values = list(term_depths.values())
ax2.hist(depth_values, bins=20, color='#A8E6CF', edgecolor='black', alpha=0.7)
ax2.set_title('GO Term Depth\nDistribution', fontsize=11, fontweight='bold')
ax2.set_xlabel('Hierarchy Depth', fontsize=10)
ax2.set_ylabel('Count', fontsize=10)
ax2.axvline(np.mean(depth_values), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {np.mean(depth_values):.1f}')
ax2.legend(fontsize=9)

# Network visualization (sample of GO graph)
ax3 = fig.add_subplot(gs[1:, :])
sample_terms = list(term_to_ont.keys())[:50]
subgraph = go_graph.subgraph(sample_terms)
pos = nx.spring_layout(subgraph, k=0.5, iterations=50, seed=42)
node_colors = [colors[['MFO', 'BPO', 'CCO'].index(term_to_ont.get(node, 'MFO'))] 
               for node in subgraph.nodes()]
nx.draw_networkx_nodes(subgraph, pos, node_color=node_colors, 
                       node_size=300, alpha=0.7, ax=ax3)
nx.draw_networkx_edges(subgraph, pos, alpha=0.2, arrows=True, 
                       arrowsize=10, ax=ax3, edge_color='gray')
ax3.set_title('GO Ontology Network Structure (Sample of 50 terms)', 
              fontsize=13, fontweight='bold', pad=15)
ax3.axis('off')

# Legend
legend_elements = [mpatches.Patch(facecolor=colors[0], label='Molecular Function'),
                   mpatches.Patch(facecolor=colors[1], label='Biological Process'),
                   mpatches.Patch(facecolor=colors[2], label='Cellular Component')]
ax3.legend(handles=legend_elements, loc='upper right', fontsize=10, framealpha=0.9)

plt.suptitle('Gene Ontology Analysis', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

# ============================================================================
# 3. LOAD IA WEIGHTS (WITH ANALYSIS)
# ============================================================================
print("\n[2/9] Loading IA weights...")
ia_df = pd.read_csv(BASE / 'IA.tsv', sep='\t', header=None, names=['term', 'ia'])

if SAMPLE_PERCENT < 100:
    ia_df = ia_df.sample(frac=SAMPLE_PERCENT/100, random_state=42)

ia_dict = dict(zip(ia_df['term'], ia_df['ia']))
print(f"   ‚úì Loaded {len(ia_dict)} IA weights")

# Enhanced IA visualization
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# IA distribution by ontology
ia_by_ont = ia_df.copy()
ia_by_ont['ontology'] = ia_by_ont['term'].map(term_to_ont)
ia_by_ont = ia_by_ont.dropna()

axes[0, 0].hist(ia_df['ia'], bins=50, color='#95E1D3', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Overall IA Distribution', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('IA Weight', fontsize=10)
axes[0, 0].set_ylabel('Frequency', fontsize=10)
axes[0, 0].axvline(ia_df['ia'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {ia_df["ia"].mean():.2f}')
axes[0, 0].legend()

# Box plot by ontology
ont_data = [ia_by_ont[ia_by_ont['ontology']==ont]['ia'].values 
            for ont in ['MFO', 'BPO', 'CCO']]
bp = axes[0, 1].boxplot(ont_data, labels=['MF', 'BP', 'CC'], patch_artist=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[0, 1].set_title('IA Weights by Ontology', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('IA Weight', fontsize=10)
axes[0, 1].grid(axis='y', alpha=0.3)

# Violin plot
parts = axes[0, 2].violinplot(ont_data, positions=[1, 2, 3], showmeans=True, showmedians=True)
for pc, color in zip(parts['bodies'], colors):
    pc.set_facecolor(color)
    pc.set_alpha(0.7)
axes[0, 2].set_xticks([1, 2, 3])
axes[0, 2].set_xticklabels(['MF', 'BP', 'CC'])
axes[0, 2].set_title('IA Distribution Density', fontsize=12, fontweight='bold')
axes[0, 2].set_ylabel('IA Weight', fontsize=10)

# Cumulative distribution
sorted_ia = np.sort(ia_df['ia'].values)
cumsum = np.cumsum(sorted_ia) / np.sum(sorted_ia)
axes[1, 0].plot(sorted_ia, cumsum, linewidth=2, color='#6C5CE7')
axes[1, 0].set_title('Cumulative IA Distribution', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('IA Weight', fontsize=10)
axes[1, 0].set_ylabel('Cumulative Proportion', fontsize=10)
axes[1, 0].grid(alpha=0.3)
axes[1, 0].axhline(0.5, color='red', linestyle='--', alpha=0.5, label='50%')
axes[1, 0].legend()

# Top terms by IA
top_ia = ia_df.nlargest(15, 'ia')
axes[1, 1].barh(range(len(top_ia)), top_ia['ia'].values, color='#FF7675', edgecolor='black')
axes[1, 1].set_yticks(range(len(top_ia)))
axes[1, 1].set_yticklabels([f"{t[:15]}..." if len(t) > 15 else t 
                            for t in top_ia['term'].values], fontsize=8)
axes[1, 1].set_title('Top 15 Terms by IA Weight', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('IA Weight', fontsize=10)
axes[1, 1].invert_yaxis()

# Statistics summary
axes[1, 2].axis('off')
ia_stats = f"""
IA WEIGHT STATISTICS

Total terms: {len(ia_df):,}

Overall:
  ‚Ä¢ Mean: {ia_df['ia'].mean():.3f}
  ‚Ä¢ Median: {ia_df['ia'].median():.3f}
  ‚Ä¢ Std Dev: {ia_df['ia'].std():.3f}
  ‚Ä¢ Range: [{ia_df['ia'].min():.3f}, {ia_df['ia'].max():.3f}]

By Ontology (Mean ¬± Std):
  ‚Ä¢ MF: {ia_by_ont[ia_by_ont['ontology']=='MFO']['ia'].mean():.3f} ¬± {ia_by_ont[ia_by_ont['ontology']=='MFO']['ia'].std():.3f}
  ‚Ä¢ BP: {ia_by_ont[ia_by_ont['ontology']=='BPO']['ia'].mean():.3f} ¬± {ia_by_ont[ia_by_ont['ontology']=='BPO']['ia'].std():.3f}
  ‚Ä¢ CC: {ia_by_ont[ia_by_ont['ontology']=='CCO']['ia'].mean():.3f} ¬± {ia_by_ont[ia_by_ont['ontology']=='CCO']['ia'].std():.3f}
"""
axes[1, 2].text(0.05, 0.5, ia_stats, fontsize=10, family='monospace',
                verticalalignment='center')

plt.suptitle('Information Accretion (IA) Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# ============================================================================
# 4. LOAD TRAINING DATA (WITH COMPREHENSIVE ANALYSIS) - FIXED
# ============================================================================
print("\n[3/9] Loading training data...")

train_terms = pd.read_csv(TRAIN_DIR / 'train_terms.tsv', sep='\t', 
                          names=['protein', 'term', 'ontology'])
train_taxonomy = pd.read_csv(TRAIN_DIR / 'train_taxonomy.tsv', sep='\t',
                             names=['protein', 'taxon'])

print(f"   ‚úì Full dataset: {len(train_terms)} annotations, {train_terms['protein'].nunique()} proteins")

# SAMPLE proteins for faster iteration
if SAMPLE_PERCENT < 100:
    sample_proteins = train_terms['protein'].drop_duplicates().sample(
        frac=SAMPLE_PERCENT/100, random_state=42
    ).tolist()
    train_terms = train_terms[train_terms['protein'].isin(sample_proteins)]
    train_taxonomy = train_taxonomy[train_taxonomy['protein'].isin(sample_proteins)]
    print(f"   ‚úì Sampled to {SAMPLE_PERCENT}%: {len(train_terms)} annotations, {len(sample_proteins)} proteins")

# Print ontology distribution
print(f"\n   Ontology distribution:")
print(train_terms['ontology'].value_counts())

# Comprehensive training data visualization - FIXED
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 4, hspace=0.35, wspace=0.35)

# 1. Ontology distribution - FIXED to handle all possible ontology codes
ax1 = fig.add_subplot(gs[0, 0])
ont_dist = train_terms['ontology'].value_counts()

# Map ontology codes (handle F, P, C or any other codes)
colors_ont_map = {'F': '#FF6B6B', 'P': '#4ECDC4', 'C': '#45B7D1'}
ont_names_map = {'F': 'MF', 'P': 'BP', 'C': 'CC'}

# Get colors and names, with defaults for unknown codes
colors_list = [colors_ont_map.get(k, '#CCCCCC') for k in ont_dist.index]
labels_list = [ont_names_map.get(k, k) for k in ont_dist.index]

bars = ax1.bar(range(len(ont_dist)), ont_dist.values, color=colors_list, 
               edgecolor='black', linewidth=1.5)
ax1.set_xticks(range(len(ont_dist)))
ax1.set_xticklabels(labels_list)
ax1.set_title('Annotations by Ontology', fontsize=11, fontweight='bold')
ax1.set_ylabel('Count', fontsize=9)
for i, (v, bar) in enumerate(zip(ont_dist.values, bars)):
    ax1.text(bar.get_x() + bar.get_width()/2., v, f'{v:,}', 
             ha='center', va='bottom', fontweight='bold', fontsize=9)

# 2. Top terms
ax2 = fig.add_subplot(gs[0, 1:3])
top_terms = train_terms['term'].value_counts().head(20)
ax2.barh(range(len(top_terms)), top_terms.values, color='#A8E6CF', edgecolor='black')
ax2.set_yticks(range(len(top_terms)))
ax2.set_yticklabels([f"{term_names.get(t, t)[:30]}..." if len(term_names.get(t, t)) > 30 
                     else term_names.get(t, t) for t in top_terms.index], fontsize=8)
ax2.set_title('Top 20 Most Frequent GO Terms', fontsize=11, fontweight='bold')
ax2.set_xlabel('Count', fontsize=9)
ax2.invert_yaxis()

# 3. Terms per protein
ax3 = fig.add_subplot(gs[0, 3])
terms_per_protein = train_terms.groupby('protein').size()
ax3.hist(terms_per_protein, bins=50, color='#FFD93D', edgecolor='black', alpha=0.7)
ax3.set_title('Terms per Protein', fontsize=11, fontweight='bold')
ax3.set_xlabel('# Terms', fontsize=9)
ax3.set_ylabel('Frequency', fontsize=9)
ax3.axvline(terms_per_protein.mean(), color='red', linestyle='--', linewidth=2)

# 4. Proteins per term
ax4 = fig.add_subplot(gs[1, 0])
proteins_per_term = train_terms.groupby('term').size()
ax4.hist(proteins_per_term, bins=50, color='#FFEAA7', edgecolor='black', alpha=0.7, log=True)
ax4.set_title('Proteins per Term (log)', fontsize=11, fontweight='bold')
ax4.set_xlabel('# Proteins', fontsize=9)
ax4.set_ylabel('# Terms (log)', fontsize=9)

# 5. Taxonomy distribution
ax5 = fig.add_subplot(gs[1, 1])
top_taxa = train_taxonomy['taxon'].value_counts().head(10)
ax5.bar(range(len(top_taxa)), top_taxa.values, color='#74B9FF', edgecolor='black')
ax5.set_xticks(range(len(top_taxa)))
ax5.set_xticklabels([str(t)[:8] for t in top_taxa.index], rotation=45, ha='right', fontsize=8)
ax5.set_title('Top 10 Species', fontsize=11, fontweight='bold')
ax5.set_ylabel('# Proteins', fontsize=9)

# 6. Term co-occurrence heatmap
ax6 = fig.add_subplot(gs[1, 2:])
top_10_terms = train_terms['term'].value_counts().head(10).index
# Size everything from the terms actually present (there may be fewer than 10).
n_top_terms = len(top_10_terms)
# Build each term's protein set once, instead of re-filtering train_terms
# twice for every cell of the matrix.
proteins_by_term = {
    term: set(train_terms.loc[train_terms['term'] == term, 'protein'])
    for term in top_10_terms
}
cooc_matrix = np.zeros((n_top_terms, n_top_terms))
for i, t1 in enumerate(top_10_terms):
    for j, t2 in enumerate(top_10_terms):
        if i != j:  # the diagonal is left at zero
            cooc_matrix[i, j] = len(proteins_by_term[t1] & proteins_by_term[t2])
im = ax6.imshow(cooc_matrix, cmap='YlOrRd', aspect='auto')
ax6.set_xticks(range(n_top_terms))
ax6.set_yticks(range(n_top_terms))
ax6.set_xticklabels([term_names.get(t, t)[:10] for t in top_10_terms],
                     rotation=45, ha='right', fontsize=7)
ax6.set_yticklabels([term_names.get(t, t)[:10] for t in top_10_terms], fontsize=7)
ax6.set_title('Term Co-occurrence Matrix', fontsize=11, fontweight='bold')
plt.colorbar(im, ax=ax6, label='# Shared Proteins')

# 7. Annotation density
ax7 = fig.add_subplot(gs[2, :2])
term_freq_bins = pd.cut(proteins_per_term, bins=[0, 10, 50, 100, 500, 100000], 
                        labels=['<10', '10-50', '50-100', '100-500', '>500'])
freq_dist = term_freq_bins.value_counts().sort_index()
ax7.bar(range(len(freq_dist)), freq_dist.values, color='#E17055', edgecolor='black', alpha=0.7)
ax7.set_xticks(range(len(freq_dist)))
ax7.set_xticklabels(freq_dist.index, rotation=0)
ax7.set_title('GO Term Frequency Distribution', fontsize=11, fontweight='bold')
ax7.set_xlabel('# Proteins with Term', fontsize=9)
ax7.set_ylabel('# Terms', fontsize=9)
for i, v in enumerate(freq_dist.values):
    ax7.text(i, v, str(v), ha='center', va='bottom', fontweight='bold')

# 8. Summary statistics
ax8 = fig.add_subplot(gs[2, 2:])
ax8.axis('off')
summary_text = f"""
TRAINING DATA COMPREHENSIVE SUMMARY

Dataset Size:
  ‚Ä¢ Total Annotations: {len(train_terms):,}
  ‚Ä¢ Unique Proteins: {train_terms['protein'].nunique():,}
  ‚Ä¢ Unique GO Terms: {train_terms['term'].nunique():,}
  ‚Ä¢ Species: {train_taxonomy['taxon'].nunique()}

Ontology Distribution:
  ‚Ä¢ Molecular Function: {ont_dist.get('F', 0):,} ({ont_dist.get('F', 0)/len(train_terms)*100:.1f}%)
  ‚Ä¢ Biological Process: {ont_dist.get('P', 0):,} ({ont_dist.get('P', 0)/len(train_terms)*100:.1f}%)
  ‚Ä¢ Cellular Component: {ont_dist.get('C', 0):,} ({ont_dist.get('C', 0)/len(train_terms)*100:.1f}%)

Annotation Statistics:
  ‚Ä¢ Mean terms/protein: {terms_per_protein.mean():.1f}
  ‚Ä¢ Median terms/protein: {terms_per_protein.median():.0f}
  ‚Ä¢ Max terms/protein: {terms_per_protein.max()}
  ‚Ä¢ Mean proteins/term: {proteins_per_term.mean():.1f}
  ‚Ä¢ Median proteins/term: {proteins_per_term.median():.0f}
"""
ax8.text(0.05, 0.5, summary_text, fontsize=10, family='monospace',
         verticalalignment='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

plt.suptitle('Training Data Comprehensive Analysis', fontsize=15, fontweight='bold')
plt.tight_layout()
plt.show()

# Continue with the rest of the code (sequences, features, training, etc.)
print("\n   Loading sequences (this may take a while for 100% of data)...")
print(f"   Expected proteins: {train_terms['protein'].nunique():,}")

train_seqs = {}
loaded_count = 0
target_proteins = set(train_terms['protein'].unique())

for rec in SeqIO.parse(TRAIN_DIR / 'train_sequences.fasta', 'fasta'):
    pid = rec.id.split('|')[1] if '|' in rec.id else rec.id
    if pid in target_proteins:
        train_seqs[pid] = str(rec.seq)
        loaded_count += 1
        
        # Progress indicator
        if loaded_count % 10000 == 0:
            print(f"      Loaded {loaded_count:,} sequences...")
        
    if loaded_count >= len(target_proteins):
        break

print(f"   ‚úì Loaded {len(train_seqs):,} training sequences")

# Enhanced sequence analysis
seq_lengths = [len(s) for s in train_seqs.values()]
print(f"   ‚úì Sequence length: mean={np.mean(seq_lengths):.0f}, "
      f"median={np.median(seq_lengths):.0f}, range=[{min(seq_lengths)}-{max(seq_lengths)}]")

print("\n‚úÖ Data loading complete! Ready for feature extraction and training.")
print(f"   Total proteins: {len(train_seqs):,}")
print(f"   Total annotations: {len(train_terms):,}")
print(f"   Total GO terms: {train_terms['term'].nunique():,}")

# 2. Best prediction

In [None]:
# ============================================================================
# CAFA 6 PROTEIN FUNCTION PREDICTION - OPTIMIZED PIPELINE (FIXED VERSION)
# ============================================================================

import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# BioPython for sequence parsing
from Bio import SeqIO

# Modeling libraries
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

# Memory optimization
import gc
import time

print("="*80)
print("CAFA 6 PROTEIN FUNCTION PREDICTION - OPTIMIZED PIPELINE (FIXED)")
print("="*80)

# ============================================================================
# 1. CONFIGURATION & PATHS - OPTIMIZED FOR SPEED
# ============================================================================
BASE = Path('/kaggle/input/cafa-6-protein-function-prediction')
TRAIN_DIR = BASE / 'Train'
TEST_DIR = BASE / 'Test'

# ‚úÖ OPTIMIZED PARAMETERS FOR FASTER EXECUTION
SAMPLE_FRACTION = 0.05  # Reduced from 0.7 to 0.05 (5% data only)
BATCH_SIZE = 500  # Reduced batch size
PREDICTION_THRESHOLD = 0.02  # Slightly higher for better precision
MAX_PREDS_PER_PROTEIN = 800
MAX_PREDS_PER_ONT = 200

# ‚úÖ NEW: Limit terms per ontology for faster training
MAX_TERMS_PER_ONTOLOGY = 200  # Maximum terms per ontology

print(f"üîß OPTIMIZED CONFIGURATION:")
print(f"   - Sample Fraction: {SAMPLE_FRACTION*100}%")
print(f"   - Max Terms per Ontology: {MAX_TERMS_PER_ONTOLOGY}")
print(f"   - Batch Size: {BATCH_SIZE}")

# ============================================================================
# 2. LOAD GO ONTOLOGY
# ============================================================================
print("\n[1/9] Loading GO ontology...")
go_graph = obonet.read_obo(TRAIN_DIR / 'go-basic.obo')
print(f"   ‚úì Loaded {len(go_graph)} GO terms")

# ============================================================================
# 3. OPTIMIZED DATA LOADING
# ============================================================================
print("\n[2/9] Loading training data with optimization...")

# Load only necessary columns
train_terms = pd.read_csv(TRAIN_DIR / 'train_terms.tsv', sep='\t', 
                          names=['protein', 'term', 'ontology'],
                          usecols=[0, 1, 2])  # Specify columns to save memory

# ‚úÖ IMPROVED: Better sampling method
# Draw a random subset of proteins without replacement. The generator is
# seeded so that a fresh "Restart & Run All" selects the same proteins and
# therefore reproduces the same model and predictions.
unique_proteins = train_terms['protein'].unique()
sampling_rng = np.random.default_rng(42)
sampled_proteins = sampling_rng.choice(unique_proteins,
                                       size=int(len(unique_proteins) * SAMPLE_FRACTION),
                                       replace=False)

# Keep only the annotations that belong to the sampled proteins.
train_terms = train_terms[train_terms['protein'].isin(sampled_proteins)]

print(f"   ‚úì Loaded {len(train_terms)} annotations for {train_terms['protein'].nunique()} proteins")

# ============================================================================
# 4. ENHANCED FEATURE EXTRACTION
# ============================================================================
print("\n[3/9] Loading training sequences with optimized features...")

def optimized_feature_extraction(seq):
    """Turn a protein sequence into a fixed-length vector of 30 composition features.

    Layout of the returned vector:
        [0:20]   frequency of each residue, in 'ACDEFGHIKLMNPQRSTVWY' order
        [20]     log1p(sequence length)
        [21:29]  fraction of residues in each group, in this order:
                 'AILMFWYV', 'CGT', 'KRH', 'DE', 'STNQ', 'FWY', 'AG', 'P'
        [29]     number of distinct symbols divided by the sequence length

    Every fraction is relative to the full sequence length, so symbols outside
    the 20-letter alphabet lower the fractions rather than being ignored.

    Args:
        seq: Protein sequence as a string; may be empty or None.

    Returns:
        numpy.ndarray of shape (30,) and float dtype; all zeros when ``seq``
        is empty or None.
    """
    FEATURE_SIZE = 30

    if not seq:
        return np.zeros(FEATURE_SIZE)

    length = len(seq)
    aa_counts = Counter(seq)

    def group_fraction(residues):
        """Share of the sequence made up of the given residue letters."""
        return sum(aa_counts.get(aa, 0) for aa in residues) / length

    # Per-residue composition (20 values).
    composition = [aa_counts.get(aa, 0) / length for aa in 'ACDEFGHIKLMNPQRSTVWY']

    # Length, residue-group fractions and symbol diversity (10 values).
    # An approximate molecular-weight term (length * 110) used to be appended
    # here as a 31st value, but it was always cut off again by a final
    # np.resize to FEATURE_SIZE. It is left out explicitly so that the vector
    # length is honest; the values returned are unchanged.
    summary = [
        np.log1p(length),
        group_fraction('AILMFWYV'),   # strongly hydrophobic
        group_fraction('CGT'),        # weakly hydrophobic
        group_fraction('KRH'),        # positively charged
        group_fraction('DE'),         # negatively charged
        group_fraction('STNQ'),       # polar
        group_fraction('FWY'),        # aromatic
        group_fraction('AG'),         # small
        group_fraction('P'),          # proline
        len(aa_counts) / length,      # distinct symbols per position
    ]

    features = np.array(composition + summary, dtype=float)
    if features.shape[0] != FEATURE_SIZE:
        # Fail loudly instead of silently padding or truncating the vector.
        raise ValueError(
            f"expected {FEATURE_SIZE} features, built {features.shape[0]}"
        )

    return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

# ============================================================================
# 5. MEMORY-EFFICIENT FEATURE PROCESSING
# ============================================================================
print("\n[4/9] Processing training features in batches...")

def process_proteins_batch(protein_seqs, batch_size=5000):
    """Extract features for every protein, working through the input in chunks.

    Args:
        protein_seqs: Mapping of protein id to sequence string.
        batch_size: Number of proteins handled between progress messages.

    Returns:
        tuple: ``(features, protein_ids)`` where ``features`` is a NumPy array
        with one row per protein and ``protein_ids`` is the list of ids in the
        same order as the rows.
    """
    entries = list(protein_seqs.items())
    total = len(entries)
    feature_rows = []
    protein_ids = []

    for start in range(0, total, batch_size):
        stop = start + batch_size
        chunk = entries[start:stop]

        feature_rows += [optimized_feature_extraction(seq) for _, seq in chunk]
        protein_ids += [pid for pid, _ in chunk]

        # Drop the chunk and run a collection pass before the next one.
        del chunk
        gc.collect()

        print(f"      Processed {min(stop, total):,}/{total:,} proteins...")

    return np.array(feature_rows), protein_ids

# Load and process training sequences
train_seqs = {}
train_proteins_processed = 0
# `pid in <numpy array>` compares against every element for each FASTA record;
# a set gives constant-time membership tests instead.
sampled_protein_ids = set(sampled_proteins)

for rec in SeqIO.parse(TRAIN_DIR / 'train_sequences.fasta', 'fasta'):
    # Use the second '|'-separated field of the record id when there is one,
    # otherwise the id as-is.
    pid = rec.id.split('|')[1] if '|' in rec.id else rec.id
    # Only keep sampled proteins
    if pid in sampled_protein_ids:
        train_seqs[pid] = str(rec.seq)
        train_proteins_processed += 1

    # Stop reading the file once every sampled protein has been seen.
    if train_proteins_processed >= len(sampled_protein_ids):
        break

print(f"   ‚úì Loaded {len(train_seqs)} training sequences")

# Process in batches
X_train, y_train_proteins = process_proteins_batch(train_seqs, batch_size=5000)
print(f"   ‚úì Feature matrix shape: {X_train.shape}")

# Clean memory
del train_seqs
gc.collect()

# Standardize features
print("   Standardizing features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# ============================================================================
# 6. FEATURE SELECTION FOR EFFICIENCY
# ============================================================================
print("\n[5/9] Applying feature selection...")
# Keep 10 of the scaled feature columns using SelectKBest with the f_classif scorer.
# NOTE(review): the target passed to fit_transform is an all-ones vector, i.e. a
# single class, so the scorer has no class contrast to measure and the GO-term
# labels play no part in which columns are kept. Consider fitting against real
# labels or dropping this step - confirm the intended behaviour.
feature_selector = SelectKBest(f_classif, k=10)
X_train_selected = feature_selector.fit_transform(X_train_scaled, np.ones(len(X_train_scaled)))

print(f"   ‚úì Reduced features from {X_train_scaled.shape[1]} to {X_train_selected.shape[1]}")

# Release the unselected matrices; only X_train_selected is used from here on
del X_train, X_train_scaled
gc.collect()

# ============================================================================
# 7. PREPARE LABELS WITH MEMORY OPTIMIZATION
# ============================================================================
print("\n[6/9] Preparing labels by ontology...")

# ‚úÖ OPTIMIZED: Start with only MFO for faster testing
ontologies = {'F': 'MFO'}  # Start with only Molecular Function
# ontologies = {'F': 'MFO', 'P': 'BPO', 'C': 'CCO'}  # Uncomment later for all

mlb_dict = {}
y_train_dict = {}

for ont_code, ont_name in ontologies.items():
    print(f"   Processing {ont_name}...")
    
    ont_terms = train_terms[train_terms['ontology'] == ont_code]
    protein_terms = ont_terms.groupby('protein')['term'].apply(list).to_dict()
    
    # Only include proteins we have features for
    labels_list = [protein_terms.get(pid, []) for pid in y_train_proteins]
    
    mlb = MultiLabelBinarizer(sparse_output=True)
    y_ont = mlb.fit_transform(labels_list)
    
    # ‚úÖ NEW: Limit number of terms for faster training
    if y_ont.shape[1] > MAX_TERMS_PER_ONTOLOGY:
        print(f"      Too many terms ({y_ont.shape[1]}), limiting to {MAX_TERMS_PER_ONTOLOGY}")
        # Select most frequent terms
        term_frequencies = np.array(y_ont.sum(axis=0)).flatten()
        top_term_indices = np.argsort(term_frequencies)[-MAX_TERMS_PER_ONTOLOGY:]
        y_ont = y_ont[:, top_term_indices]
        # Update MLB classes
        mlb.classes_ = mlb.classes_[top_term_indices]
    
    mlb_dict[ont_code] = mlb
    y_train_dict[ont_code] = y_ont
    
    print(f"      {ont_name}: {y_ont.shape[1]} unique terms")

# Clean memory
del train_terms
gc.collect()

# ============================================================================
# 8. EFFICIENT MODEL TRAINING WITH PROGRESS MONITORING
# ============================================================================
print("\n[7/9] Training optimized models...")

models = {}

for ont_code, ont_name in ontologies.items():
    print(f"   Training {ont_name} model...")
    
    y_ont = y_train_dict[ont_code]
    
    if y_ont.shape[1] == 0:
        print(f"      Skipping {ont_name} (no terms)")
        continue
    
    # ‚úÖ IMPROVED: Add timing and progress monitoring
    start_time = time.time()
    
    try:
        # Convert sparse to dense for liblinear
        y_ont_dense = y_ont.toarray() if hasattr(y_ont, 'toarray') else y_ont
        
        # Memory-efficient configuration
        model = OneVsRestClassifier(
            LogisticRegression(
                max_iter=100,  # Reduced iterations for speed
                solver='liblinear',  # More memory efficient
                C=0.8,  # Slightly more regularization
                random_state=42,
                class_weight='balanced',
                verbose=0  # Set to 1 to see training progress
            ),
            n_jobs=1  # Single job to avoid memory issues
        )
        
        print(f"      Starting training for {y_ont_dense.shape[1]} terms...")
        model.fit(X_train_selected, y_ont_dense)
        models[ont_code] = model
        
        end_time = time.time()
        training_time = end_time - start_time
        
        print(f"      ‚úì {ont_name} model trained in {training_time:.2f} seconds")
        
    except Exception as e:
        print(f"      ‚úó Error training {ont_name}: {e}")
        continue
    
    # Clean memory after each model
    del y_ont
    gc.collect()

print(f"   ‚úì {len(models)} models trained successfully")

# ============================================================================
# 9. OPTIMIZED PREDICTION PIPELINE
# ============================================================================
print("\n[8/9] Loading and processing test sequences...")

test_seqs = {}
test_proteins = []

# ‚úÖ OPTIMIZED: Load only limited test sequences for initial testing
MAX_TEST_SAMPLES = 1000  # Start with 1000 test samples

# Process test sequences in streaming fashion
for i, rec in enumerate(SeqIO.parse(TEST_DIR / 'testsuperset.fasta', 'fasta')):
    if i >= MAX_TEST_SAMPLES:
        break
    pid = rec.id.split('|')[1] if '|' in rec.id else rec.id
    test_seqs[pid] = str(rec.seq)
    test_proteins.append(pid)

print(f"   ‚úì Loaded {len(test_seqs):,} test sequences (limited for testing)")

print("\n[9/9] Making optimized predictions...")

submission_list = []

def process_prediction_batch(protein_batch, seq_dict):
    """Score one batch of proteins against every trained ontology model.

    Relies on module-level state: ``optimized_feature_extraction``, ``scaler``,
    ``feature_selector``, ``models``, ``mlb_dict``, ``PREDICTION_THRESHOLD``
    and ``MAX_PREDS_PER_ONT``.

    Args:
        protein_batch: iterable of protein ids to score.
        seq_dict: mapping from protein id to its sequence string; every id in
            ``protein_batch`` must be present (a missing id raises KeyError).

    Returns:
        A list of ``(protein_id, term, score)`` tuples. For each protein and
        ontology, only terms whose probability exceeds
        ``PREDICTION_THRESHOLD`` are kept, ordered by descending probability
        and truncated to ``MAX_PREDS_PER_ONT``. Returns ``[]`` when no protein
        yields a usable feature vector or when scaling/selection fails.
    """
    # Raw feature vectors are fed to the scaler *before* feature selection, so
    # their width has to match what the scaler was fitted on. Comparing against
    # the post-selection matrix width rejects every protein as soon as the
    # selector drops a column.
    expected_width = getattr(scaler, 'n_features_in_', None)

    batch_features = []
    valid_proteins = []

    for pid in protein_batch:
        features = optimized_feature_extraction(seq_dict[pid])
        if expected_width is None:
            # Scaler does not expose its fitted width (older scikit-learn):
            # use the first vector as the reference so the batch stays rectangular.
            expected_width = len(features)
        if len(features) == expected_width:
            batch_features.append(features)
            valid_proteins.append(pid)

    if not batch_features:
        return []

    X_batch = np.nan_to_num(np.array(batch_features), nan=0.0, posinf=0.0, neginf=0.0)

    try:
        X_batch_scaled = scaler.transform(X_batch)
        X_batch_selected = feature_selector.transform(X_batch_scaled)
    except Exception as e:
        print(f"   Error processing batch: {e}")
        return []

    batch_predictions = []

    for ont_code, model in models.items():
        mlb = mlb_dict[ont_code]

        # Best effort per ontology: a failing model is reported and skipped so
        # the remaining ontologies still contribute predictions.
        try:
            y_pred_proba = model.predict_proba(X_batch_selected)

            for pid, probs in zip(valid_proteins, y_pred_proba):
                top_indices = np.where(probs > PREDICTION_THRESHOLD)[0]

                if len(top_indices) > 0:
                    # Highest probability first, then cap the count per ontology.
                    sorted_indices = top_indices[np.argsort(probs[top_indices])[::-1]]
                    sorted_indices = sorted_indices[:MAX_PREDS_PER_ONT]

                    for idx in sorted_indices:
                        batch_predictions.append((pid, mlb.classes_[idx], probs[idx]))

        except Exception as e:
            print(f"   Error in {ont_code} prediction: {e}")
            continue

    return batch_predictions

# Walk the test proteins in BATCH_SIZE-sized slices and collect every
# (protein, term, score) tuple the batch scorer returns.
for batch_number, batch_start in enumerate(range(0, len(test_proteins), BATCH_SIZE)):
    batch_end = batch_start + BATCH_SIZE
    submission_list.extend(
        process_prediction_batch(test_proteins[batch_start:batch_end], test_seqs)
    )

    # Report progress on multiples of 500 and once more after the last slice.
    reached_end = batch_end >= len(test_proteins)
    if batch_end % 500 == 0 or reached_end:
        processed = min(batch_end, len(test_proteins))
        print(f"      Processed {processed:,}/{len(test_proteins):,} proteins...")

    # Trigger garbage collection on every fifth batch (including the first).
    if batch_number % 5 == 0:
        gc.collect()

print(f"   ‚úì Generated {len(submission_list):,} predictions")

# Create final submission: write submission.tsv (tab-separated, no header,
# columns protein / term / score) either from the collected predictions or,
# when there are none, as a single placeholder row.
if submission_list:
    submission_df = pd.DataFrame(submission_list, columns=['protein', 'term', 'score'])
    # Best score first within each protein, then cap the rows kept per protein.
    submission_df = submission_df.sort_values(['protein', 'score'], ascending=[True, False])
    submission_df = submission_df.groupby('protein').head(MAX_PREDS_PER_PROTEIN).reset_index(drop=True)

    # Format scores
    def safe_format_score(x):
        """Return ``x`` clamped to [0.001, 1.0] as a 3-decimal string.

        Values that cannot be converted to float, and NaN, map to "0.001".
        NaN needs its own check: ``min(1.0, nan)`` evaluates to 1.0, which
        would otherwise turn a missing score into maximum confidence.
        """
        try:
            value = float(x)
        except (TypeError, ValueError, OverflowError):
            return "0.001"
        if np.isnan(value):
            return "0.001"
        return f"{max(0.001, min(1.0, value)):.3f}"

    submission_df['score'] = submission_df['score'].apply(safe_format_score)

    # Save submission
    submission_df.to_csv('submission.tsv', sep='\t', index=False, header=False)

    print(f"\n   ‚úì Final submission saved: {len(submission_df):,} predictions")
    print(f"   ‚úì Unique proteins: {submission_df['protein'].nunique():,}")
    print(f"   ‚úì Mean predictions per protein: {len(submission_df)/submission_df['protein'].nunique():.1f}")
else:
    print("\n   ‚ö†Ô∏è No predictions generated - creating minimal submission")
    # Create minimal submission file to avoid errors. Prefer a protein id that
    # actually occurs in the loaded test set; fall back to a dummy id only
    # when no test protein was loaded at all.
    placeholder_protein = test_proteins[0] if test_proteins else 'A0A000'
    minimal_submission = pd.DataFrame({
        'protein': [placeholder_protein],
        'term': ['GO:0003674'],
        'score': ['0.001']
    })
    minimal_submission.to_csv('submission.tsv', sep='\t', index=False, header=False)
    print("   ‚úì Created minimal submission file")

print("\n" + "="*80)
print("‚úÖ OPTIMIZED PIPELINE COMPLETED SUCCESSFULLY!")
print("="*80)

In [None]:
# # ============================================================================
# # CAFA 6 PROTEIN FUNCTION PREDICTION - OPTIMIZED PIPELINE
# # NOTE: every line of this cell is commented out, so the cell does not execute.
# # ============================================================================

# import pandas as pd
# import numpy as np
# from pathlib import Path
# from collections import Counter
# import warnings
# warnings.filterwarnings('ignore')

# # BioPython for sequence parsing
# from Bio import SeqIO

# # Modeling libraries
# from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_classif

# # Memory optimization
# import gc

# print("="*80)
# print("CAFA 6 PROTEIN FUNCTION PREDICTION - OPTIMIZED PIPELINE")
# print("="*80)

# # ============================================================================
# # 1. CONFIGURATION & PATHS
# # ============================================================================
# BASE = Path('/kaggle/input/cafa-6-protein-function-prediction')
# TRAIN_DIR = BASE / 'Train'
# TEST_DIR = BASE / 'Test'

# # Optimized parameters
# BATCH_SIZE = 1000  # Reduced for memory efficiency
# PREDICTION_THRESHOLD = 0.015  # Slightly higher for better precision
# MAX_PREDS_PER_PROTEIN = 1200
# MAX_PREDS_PER_ONT = 400

# # ============================================================================
# # 2. LOAD GO ONTOLOGY
# # ============================================================================
# print("\n[1/9] Loading GO ontology...")
# go_graph = obonet.read_obo(TRAIN_DIR / 'go-basic.obo')
# print(f"   ‚úì Loaded {len(go_graph)} GO terms")

# # ============================================================================
# # 3. OPTIMIZED DATA LOADING
# # ============================================================================
# print("\n[2/9] Loading training data with optimization...")

# # Load only necessary columns
# train_terms = pd.read_csv(TRAIN_DIR / 'train_terms.tsv', sep='\t', 
#                           names=['protein', 'term', 'ontology'],
#                           usecols=[0, 1, 2])  # Specify columns to save memory

# # Sample proteins for faster training (adjustable)
# SAMPLE_FRACTION = 0.7  # Use 70% of data for faster processing
# unique_proteins = train_terms['protein'].unique()
# sampled_proteins = np.random.choice(unique_proteins, 
#                                    size=int(len(unique_proteins) * SAMPLE_FRACTION), 
#                                    replace=False)

# train_terms = train_terms[train_terms['protein'].isin(sampled_proteins)]

# print(f"   ‚úì Loaded {len(train_terms)} annotations for {train_terms['protein'].nunique()} proteins")

# # ============================================================================
# # 4. ENHANCED FEATURE EXTRACTION
# # ============================================================================
# print("\n[3/9] Loading training sequences with optimized features...")

# def optimized_feature_extraction(seq):
#     """Enhanced feature extraction with better biological insights"""
#     FEATURE_SIZE = 30  # Increased for better representation
    
#     if not seq or len(seq) == 0:
#         return np.zeros(FEATURE_SIZE)
    
#     try:
#         length = len(seq)
#         aa_counts = Counter(seq)
#         total_aa = sum(aa_counts.values())
        
#         # 1. Amino Acid Composition (20 features)
#         aa_list = 'ACDEFGHIKLMNPQRSTVWY'
#         aa_freq = np.array([aa_counts.get(aa, 0) / total_aa for aa in aa_list])
        
#         # 2. Physicochemical Properties (8 features)
#         # Hydrophobicity groups
#         very_hydrophobic = sum(aa_counts.get(aa, 0) for aa in 'AILMFWYV') / total_aa
#         hydrophobic = sum(aa_counts.get(aa, 0) for aa in 'CGT') / total_aa
        
#         # Charge properties
#         positive_charged = sum(aa_counts.get(aa, 0) for aa in 'KRH') / total_aa
#         negative_charged = sum(aa_counts.get(aa, 0) for aa in 'DE') / total_aa
        
#         # Structural properties
#         polar = sum(aa_counts.get(aa, 0) for aa in 'STNQ') / total_aa
#         aromatic = sum(aa_counts.get(aa, 0) for aa in 'FWY') / total_aa
#         small = sum(aa_counts.get(aa, 0) for aa in 'AG') / total_aa
#         proline_content = aa_counts.get('P', 0) / total_aa
        
#         # 3. Sequence Properties (2 features)
#         seq_complexity = len(set(seq)) / length if length > 0 else 0
#         molecular_weight_approx = length * 110  # Average AA molecular weight
        
#         # Combine all features
#         features = np.concatenate([
#             aa_freq,  # 20 features
#             [
#                 np.log1p(length), very_hydrophobic, hydrophobic,
#                 positive_charged, negative_charged, polar, aromatic,
#                 small, proline_content, seq_complexity,
#                 np.log1p(molecular_weight_approx)
#             ]  # 11 features
#         ])
        
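#         # Handle NaN/inf, then force the vector length to FEATURE_SIZE.
#         # NOTE: the concatenation above yields 20 + 11 = 31 values while FEATURE_SIZE
#         # is 30, so the np.resize call below truncates and drops the last value.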
#         features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
        
#         if len(features) != FEATURE_SIZE:
#             features = np.resize(features, FEATURE_SIZE)
            
#         return features
        
#     except Exception as e:
#         print(f"Error in feature extraction: {e}")
#         return np.zeros(FEATURE_SIZE)

# # ============================================================================
# # 5. MEMORY-EFFICIENT FEATURE PROCESSING
# # ============================================================================
# print("\n[4/9] Processing training features in batches...")

# def process_proteins_batch(protein_seqs, batch_size=5000):
#     """Process proteins in batches to save memory"""
#     all_features = []
#     all_proteins = []
    
#     proteins = list(protein_seqs.items())
    
#     for i in range(0, len(proteins), batch_size):
#         batch = proteins[i:i + batch_size]
#         batch_features = []
#         batch_proteins = []
        
#         for pid, seq in batch:
#             features = optimized_feature_extraction(seq)
#             batch_features.append(features)
#             batch_proteins.append(pid)
        
#         all_features.extend(batch_features)
#         all_proteins.extend(batch_proteins)
        
#         # Clear memory
#         del batch, batch_features
#         gc.collect()
        
#         print(f"      Processed {min(i + batch_size, len(proteins)):,}/{len(proteins):,} proteins...")
    
#     return np.array(all_features), all_proteins

# # Load and process training sequences
# train_seqs = {}
# train_proteins_processed = 0

# for rec in SeqIO.parse(TRAIN_DIR / 'train_sequences.fasta', 'fasta'):
#     pid = rec.id.split('|')[1] if '|' in rec.id else rec.id
#     # Only process sampled proteins
#     if pid in sampled_proteins:
#         train_seqs[pid] = str(rec.seq)
#         train_proteins_processed += 1
        
#     if train_proteins_processed >= len(sampled_proteins):
#         break

# print(f"   ‚úì Loaded {len(train_seqs)} training sequences")

# # Process in batches
# X_train, y_train_proteins = process_proteins_batch(train_seqs, batch_size=5000)
# print(f"   ‚úì Feature matrix shape: {X_train.shape}")

# # Clean memory
# del train_seqs
# gc.collect()

# # Standardize features
# print("   Standardizing features...")
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)

# # ============================================================================
# # 6. FEATURE SELECTION FOR EFFICIENCY
# # ============================================================================
# print("\n[5/9] Applying feature selection...")
# # Select top 20 features to reduce dimensionality
# feature_selector = SelectKBest(f_classif, k=20)
# X_train_selected = feature_selector.fit_transform(X_train_scaled, np.ones(len(X_train_scaled)))

# print(f"   ‚úì Reduced features from {X_train_scaled.shape[1]} to {X_train_selected.shape[1]}")

# # Clean memory
# del X_train, X_train_scaled
# gc.collect()

# # ============================================================================
# # 7. PREPARE LABELS WITH MEMORY OPTIMIZATION
# # ============================================================================
# print("\n[6/9] Preparing labels by ontology...")

# ontologies = {'F': 'MFO', 'P': 'BPO', 'C': 'CCO'}
# mlb_dict = {}
# y_train_dict = {}

# for ont_code, ont_name in ontologies.items():
#     print(f"   Processing {ont_name}...")
    
#     ont_terms = train_terms[train_terms['ontology'] == ont_code]
#     protein_terms = ont_terms.groupby('protein')['term'].apply(list).to_dict()
    
#     # Only include proteins we have features for
#     labels_list = [protein_terms.get(pid, []) for pid in y_train_proteins]
    
#     mlb = MultiLabelBinarizer(sparse_output=True)
#     y_ont = mlb.fit_transform(labels_list)
    
#     mlb_dict[ont_code] = mlb
#     y_train_dict[ont_code] = y_ont
    
#     print(f"      {ont_name}: {y_ont.shape[1]} unique terms")

# # Clean memory
# del train_terms
# gc.collect()

# # ============================================================================
# # 8. EFFICIENT MODEL TRAINING
# # ============================================================================
# print("\n[7/9] Training optimized models...")

# models = {}

# for ont_code, ont_name in ontologies.items():
#     print(f"   Training {ont_name} model...")
    
#     y_ont = y_train_dict[ont_code]
    
#     if y_ont.shape[1] == 0:
#         print(f"      Skipping {ont_name} (no terms)")
#         continue
    
#     # Memory-efficient configuration
#     model = OneVsRestClassifier(
#         LogisticRegression(
#             max_iter=500,  # Reduced iterations
#             solver='liblinear',  # More memory efficient
#             C=0.8,  # Slightly more regularization
#             random_state=42,
#             class_weight='balanced'
#         ),
#         n_jobs=1  # Single job to avoid memory issues
#     )
    
#     model.fit(X_train_selected, y_ont)
#     models[ont_code] = model
    
#     print(f"      ‚úì {ont_name} model trained")
    
#     # Clean memory after each model
#     del y_ont
#     gc.collect()

# print(f"   ‚úì All {len(models)} models trained successfully")

# # ============================================================================
# # 9. OPTIMIZED PREDICTION PIPELINE
# # ============================================================================
# print("\n[8/9] Loading and processing test sequences...")

# test_seqs = {}
# test_proteins = []

# # Process test sequences in streaming fashion
# for rec in SeqIO.parse(TEST_DIR / 'testsuperset.fasta', 'fasta'):
#     pid = rec.id.split('|')[1] if '|' in rec.id else rec.id
#     test_seqs[pid] = str(rec.seq)
#     test_proteins.append(pid)

# print(f"   ‚úì Loaded {len(test_seqs):,} test sequences")

# print("\n[9/9] Making optimized predictions...")

# submission_list = []

# def process_prediction_batch(protein_batch, seq_dict):
#     """Process predictions for a batch of proteins"""
#     batch_features = []
#     valid_proteins = []
    
#     for pid in protein_batch:
#         features = optimized_feature_extraction(seq_dict[pid])
#         if len(features) == X_train_selected.shape[1]:
#             batch_features.append(features)
#             valid_proteins.append(pid)
    
#     if not batch_features:
#         return []
    
#     X_batch = np.array(batch_features)
#     X_batch = np.nan_to_num(X_batch, nan=0.0, posinf=0.0, neginf=0.0)
    
#     try:
#         X_batch_scaled = scaler.transform(X_batch)
#         X_batch_selected = feature_selector.transform(X_batch_scaled)
#     except Exception as e:
#         print(f"   Error processing batch: {e}")
#         return []
    
#     batch_predictions = []
    
#     for ont_code in models:
#         model = models[ont_code]
#         mlb = mlb_dict[ont_code]
        
#         try:
#             y_pred_proba = model.predict_proba(X_batch_selected)
            
#             for i, pid in enumerate(valid_proteins):
#                 probs = y_pred_proba[i]
#                 top_indices = np.where(probs > PREDICTION_THRESHOLD)[0]
                
#                 if len(top_indices) > 0:
#                     sorted_indices = top_indices[np.argsort(probs[top_indices])[::-1]]
#                     sorted_indices = sorted_indices[:MAX_PREDS_PER_ONT]
                    
#                     for idx in sorted_indices:
#                         term = mlb.classes_[idx]
#                         score = probs[idx]
#                         batch_predictions.append((pid, term, score))
                        
#         except Exception as e:
#             print(f"   Error in {ont_code} prediction: {e}")
#             continue
    
#     return batch_predictions

# # Process test proteins in small batches
# for i in range(0, len(test_proteins), BATCH_SIZE):
#     batch = test_proteins[i:i + BATCH_SIZE]
#     batch_predictions = process_prediction_batch(batch, test_seqs)
#     submission_list.extend(batch_predictions)
    
#     if (i + BATCH_SIZE) % 5000 == 0 or (i + BATCH_SIZE) >= len(test_proteins):
#         print(f"      Processed {min(i + BATCH_SIZE, len(test_proteins)):,}/{len(test_proteins):,} proteins...")
    
#     # Clear memory regularly
#     if (i // BATCH_SIZE) % 10 == 0:
#         gc.collect()

# print(f"   ‚úì Generated {len(submission_list):,} predictions")

# # Create final submission
# submission_df = pd.DataFrame(submission_list, columns=['protein', 'term', 'score'])
# submission_df = submission_df.sort_values(['protein', 'score'], ascending=[True, False])
# submission_df = submission_df.groupby('protein').head(MAX_PREDS_PER_PROTEIN).reset_index(drop=True)

# # Format scores
# def safe_format_score(x):
#     try:
#         score = max(0.001, min(1.0, float(x)))
#         return f"{score:.3f}"
#     except:
#         return "0.001"

# submission_df['score'] = submission_df['score'].apply(safe_format_score)

# # Save submission
# submission_df.to_csv('submission.tsv', sep='\t', index=False, header=False)

# print(f"\n   ‚úì Final submission saved: {len(submission_df):,} predictions")
# print(f"   ‚úì Unique proteins: {submission_df['protein'].nunique():,}")
# print(f"   ‚úì Mean predictions per protein: {len(submission_df)/submission_df['protein'].nunique():.1f}")

# print("\n" + "="*80)
# print("‚úÖ OPTIMIZED PIPELINE COMPLETED SUCCESSFULLY!")
# print("="*80)