In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')


# CONFIGURATION
# Module-level settings read by the loading and feature-extraction code below.

# Root directory of the competition data; the loaders join relative file
# names (FASTA and TSV files) onto this path.
DATA_PATH = Path('/kaggle/input/cafa-6-protein-function-prediction')
MAX_SEQ_LENGTH = 1000  # Sequences are cut to this many residues before feature extraction
KMER_SIZE = 3  # For k-mer features. NOTE(review): not referenced in the visible code - confirm it is still needed
TOP_N_TERMS = 1000  # Limit to most frequent GO terms for faster training


# 1. DATA LOADING

print("Loading data...")

def parse_fasta(fasta_file):
    """Parse a FASTA file into a dictionary mapping protein ID to sequence.

    The ID is taken from the first whitespace-delimited token of each header
    line.  If that token contains ``|`` separators (for example
    ``sp|P9WHI7|RECN_MYCT``), its second field is used; otherwise the whole
    token is the ID.  Restricting the split to the first token keeps a ``|``
    that appears in the free-text description from being mistaken for part
    of the identifier.

    Sequence lines belonging to one record are concatenated.  Blank lines and
    sequence lines that appear before the first header are ignored.  If the
    same ID occurs more than once, the last record wins.

    Args:
        fasta_file: Path (``str`` or ``os.PathLike``) of the FASTA file.

    Returns:
        dict mapping protein ID (str) to its full sequence (str).

    Raises:
        ValueError: If a header line does not contain an identifier.
    """
    sequences = {}
    current_id = None
    current_seq = []

    with open(fasta_file, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith('>'):
                # Flush the record that just ended before starting a new one.
                if current_id is not None:
                    sequences[current_id] = ''.join(current_seq)

                # Only the first token can hold the ID; the rest of the
                # header is a free-text description.
                tokens = line[1:].split(maxsplit=1)
                token = tokens[0] if tokens else ''
                fields = token.split('|')
                current_id = fields[1] if len(fields) >= 2 else token
                if not current_id:
                    raise ValueError(
                        f"{fasta_file}: header on line {line_number} "
                        f"has no protein identifier: {line!r}"
                    )
                current_seq = []
            elif current_id is not None:
                current_seq.append(line)

    # The last record has no following header to trigger the flush.
    if current_id is not None:
        sequences[current_id] = ''.join(current_seq)

    return sequences

# Load sequences
# Each FASTA file is parsed into a {protein ID: sequence} dictionary.
train_fasta_path = DATA_PATH / 'Train/train_sequences.fasta'
test_fasta_path = DATA_PATH / 'Test/testsuperset.fasta'
train_sequences = parse_fasta(train_fasta_path)
test_sequences = parse_fasta(test_fasta_path)

print(f"Loaded {len(train_sequences)} training sequences")
print(f"Loaded {len(test_sequences)} test sequences")

# Load annotations
# One row per (protein, GO term) pair.
# NOTE(review): passing `names=` without `header=` makes pandas treat the
# first line of the file as data - confirm the TSV files have no header row.
annotation_columns = ['protein_id', 'go_term', 'ontology']
train_terms_df = pd.read_csv(
    DATA_PATH / 'Train/train_terms.tsv',
    sep='\t',
    names=annotation_columns,
)

# Load IA weights
# ia_weights maps each GO term to the value in the second column of IA.tsv.
ia_columns = ['go_term', 'ia_weight']
ia_df = pd.read_csv(DATA_PATH / 'IA.tsv', sep='\t', names=ia_columns)
ia_term_weight_pairs = zip(ia_df['go_term'], ia_df['ia_weight'])
ia_weights = dict(ia_term_weight_pairs)

print(f"Loaded {len(train_terms_df)} annotations")
print(f"Unique GO terms: {train_terms_df['go_term'].nunique()}")


# 2. FEATURE ENGINEERING

print("\nGenerating features...")

def get_amino_acid_composition(seq):
    """Return the fraction of ``seq`` made up by each of the 20 standard residues.

    The result is a length-20 array ordered as ``ACDEFGHIKLMNPQRSTVWY``.
    Each entry is that residue's count divided by ``len(seq)``, so characters
    outside this alphabet make the entries sum to less than 1.  An empty
    sequence yields all zeros.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    length = len(seq)
    if not length:
        return np.zeros(20)

    counts = np.fromiter(
        (seq.count(residue) for residue in residues),
        dtype=float,
        count=len(residues),
    )
    return counts / length

def get_dipeptide_composition(seq):
    """Return the frequency of each of the 400 standard dipeptides in ``seq``.

    Every pair of adjacent residues (overlapping windows of length 2) is
    counted once, and each count is divided by the number of windows,
    ``len(seq) - 1``.  Counting windows rather than using ``str.count``
    matters for repeats: ``str.count`` skips overlapping matches, so "AAA"
    would report one "AA" although it contains two.

    The result is a length-400 array ordered as ``aa1 + aa2`` with both
    residues running over ``ACDEFGHIKLMNPQRSTVWY`` (``aa1`` varies slowest).
    Windows containing a character outside that alphabet are not counted but
    still contribute to the denominator.  Sequences shorter than two residues
    yield all zeros.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    n_residues = len(amino_acids)
    total = len(seq) - 1
    if total <= 0:
        return np.zeros(n_residues * n_residues)

    position = {aa: idx for idx, aa in enumerate(amino_acids)}
    counts = np.zeros(n_residues * n_residues)
    for first, second in zip(seq, seq[1:]):
        row = position.get(first)
        col = position.get(second)
        if row is not None and col is not None:
            counts[row * n_residues + col] += 1

    return counts / total

def get_kmer_features(seq, k=3, max_features=100):
    """Return relative frequencies of the most common k-mers in ``seq``.

    Every overlapping window of length ``k`` is counted, and the counts of
    the ``max_features`` most frequent k-mers are divided by the total number
    of windows.  Values are ordered from most to least frequent *within this
    sequence*, so a given position does not refer to the same k-mer across
    different sequences.  The result is zero-padded up to ``max_features``;
    a sequence shorter than ``k`` gives all zeros.
    """
    window_count = len(seq) - k + 1
    if window_count <= 0:
        return np.zeros(max_features)

    tally = Counter(seq[start:start + k] for start in range(window_count))

    # most_common already yields (k-mer, count) pairs in descending order.
    ranked = tally.most_common(max_features)
    features = np.array([count / window_count for _, count in ranked])

    # Fill the tail with zeros when the sequence has fewer distinct k-mers.
    missing = max_features - len(features)
    if missing > 0:
        features = np.concatenate([features, np.zeros(missing)])

    return features

def get_sequence_features(seq):
    """Build the fixed-length feature vector for one protein sequence.

    The sequence is first truncated to ``MAX_SEQ_LENGTH`` residues.  The
    returned 1-D array holds, in this order:

    * the truncated length;
    * the fraction of cysteine (C) and of methionine (M);
    * the fraction of positively charged (R, H, K) and of negatively
      charged (D, E) residues;
    * the fraction of hydrophobic residues (A, I, L, M, F, W, Y, V);
    * the 20 values from ``get_amino_acid_composition``;
    * the first 100 values from ``get_dipeptide_composition``.

    All fractions are 0 for an empty sequence.
    """
    trimmed = seq[:MAX_SEQ_LENGTH]
    n_residues = len(trimmed)

    def fraction_of(residue_set):
        # Share of the truncated sequence made up by any residue in the set.
        if n_residues == 0:
            return 0
        return sum(trimmed.count(residue) for residue in residue_set) / n_residues

    scalar_features = [
        n_residues,
        fraction_of('C'),
        fraction_of('M'),
        fraction_of('RHK'),
        fraction_of('DE'),
        fraction_of('AILMFWYV'),
    ]

    return np.concatenate([
        scalar_features,
        get_amino_acid_composition(trimmed),
        # Only the first 100 of the 400 dipeptide values are kept.
        get_dipeptide_composition(trimmed)[:100],
    ])

# Generate features for training set
print("Extracting training features...")

# Build the set of annotated protein IDs once. Testing membership against
# the raw column array inside the loop would rescan every annotation row
# for each sequence.
annotated_ids = set(train_terms_df['protein_id'])

X_train_list = []
y_train_proteins = []

for protein_id, sequence in train_sequences.items():
    # Only proteins with at least one annotation row are used for training.
    if protein_id in annotated_ids:
        X_train_list.append(get_sequence_features(sequence))
        y_train_proteins.append(protein_id)

# Row i of X_train holds the features of y_train_proteins[i].
X_train = np.array(X_train_list)
print(f"Training feature matrix shape: {X_train.shape}")


# 3. LABEL PREPARATION

print("\nPreparing labels...")

# Get most frequent GO terms to limit complexity
term_counts = train_terms_df['go_term'].value_counts()
top_terms = term_counts.head(TOP_N_TERMS).index.tolist()

print(f"Using top {len(top_terms)} GO terms")

# Create label matrix
# Filter the annotation table in one vectorised step instead of testing
# every row against the term list inside iterrows(); the resulting mapping
# (protein ID -> list of retained GO terms, in file order) is the same.
retained = train_terms_df[train_terms_df['go_term'].isin(set(top_terms))]
protein_terms = defaultdict(list)
for annotated_protein, annotated_term in zip(retained['protein_id'], retained['go_term']):
    protein_terms[annotated_protein].append(annotated_term)

# Proteins whose annotations all fall outside top_terms get an empty list
# (defaultdict) and therefore an all-zero label row.
y_train_labels = [protein_terms[pid] for pid in y_train_proteins]

# Binarize labels
# Passing classes=top_terms fixes the column order of the label matrix to
# the frequency ranking computed above.
mlb = MultiLabelBinarizer(classes=top_terms)
y_train = mlb.fit_transform(y_train_labels)

print(f"Label matrix shape: {y_train.shape}")
print(f"Average labels per protein: {y_train.sum(axis=1).mean():.2f}")


# 4. MODEL TRAINING

print("\nTraining model...")

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# One seed for both the estimator and the row subsample, so that repeated
# runs train on the same data and produce the same model.
RANDOM_SEED = 42

# Use Logistic Regression for efficiency (faster than RF on large datasets)
# For better performance, consider using XGBoost or neural networks
# Parallelism is requested once, on the wrapper that fits one binary
# classifier per GO term; the base estimator is left single-threaded so the
# two levels do not compete for the same cores.
base_model = LogisticRegression(max_iter=100, random_state=RANDOM_SEED)
model = MultiOutputClassifier(base_model, n_jobs=-1)

# Train on subset for faster execution (remove this for full training)
# NOTE(review): if a GO term has no positive example in the subsample, its
# label column is constant - confirm the estimator copes with that.
sample_size = min(10000, len(X_train))
rng = np.random.default_rng(RANDOM_SEED)
indices = rng.choice(len(X_train), size=sample_size, replace=False)
X_train_sample = X_train[indices]
y_train_sample = y_train[indices]

print(f"Training on {sample_size} samples...")
model.fit(X_train_sample, y_train_sample)

print("Training complete!")


# 5. GENERATE PREDICTIONS

print("\nGenerating predictions for test set...")

# Predictions at or below this probability are dropped.
PROBABILITY_THRESHOLD = 0.01
# Reported probabilities are capped at this value.
MAX_PROBABILITY = 0.999


def _positive_class_probabilities(per_term_proba, estimators):
    """Collect P(label == 1) for every GO term into one 2-D array.

    Args:
        per_term_proba: List with one ``(n_samples, n_classes)`` array per
            GO term, as returned by ``MultiOutputClassifier.predict_proba``.
        estimators: The fitted per-term estimators, in the same order; their
            ``classes_`` attribute tells which column belongs to class 1.

    Returns:
        Array of shape ``(n_samples, n_terms)``.  A term whose estimator never
        saw class 1 gets probability 0 for every sample.
    """
    columns = []
    for proba, estimator in zip(per_term_proba, estimators):
        classes = list(estimator.classes_)
        if 1 in classes:
            columns.append(proba[:, classes.index(1)])
        else:
            columns.append(np.zeros(proba.shape[0]))
    return np.column_stack(columns)


# Process test sequences in batches
batch_size = 1000
predictions = []

test_protein_ids = list(test_sequences.keys())
n_batches = (len(test_protein_ids) + batch_size - 1) // batch_size

for batch_idx in range(n_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(test_protein_ids))
    batch_ids = test_protein_ids[start_idx:end_idx]

    # Extract features
    X_test_batch = np.array([
        get_sequence_features(test_sequences[pid])
        for pid in batch_ids
    ])

    # Predict probabilities: rows are proteins, columns follow mlb.classes_
    batch_proba = _positive_class_probabilities(
        model.predict_proba(X_test_batch), model.estimators_
    )

    # Only include predictions above threshold. np.nonzero walks the matrix
    # row by row, so entries stay grouped by protein in term order.
    kept_rows, kept_cols = np.nonzero(batch_proba > PROBABILITY_THRESHOLD)
    kept_probs = np.minimum(batch_proba[kept_rows, kept_cols], MAX_PROBABILITY)
    predictions.extend(
        {
            'protein_id': batch_ids[row],
            'go_term': mlb.classes_[col],
            'probability': prob,
        }
        for row, col, prob in zip(
            kept_rows.tolist(), kept_cols.tolist(), kept_probs.tolist()
        )
    )

    if (batch_idx + 1) % 10 == 0:
        print(f"Processed {end_idx}/{len(test_protein_ids)} proteins...")

# 6. CREATE SUBMISSION FILE

print("\nCreating submission file...")

# Naming the columns explicitly keeps the frame well-formed (and the steps
# below working) even when no prediction passed the threshold.
submission_df = pd.DataFrame(
    predictions, columns=['protein_id', 'go_term', 'probability']
)

# Sort by protein_id and probability
submission_df = submission_df.sort_values(['protein_id', 'probability'],
                                          ascending=[True, False])

# Limit to 1500 terms per protein
# Rows are already sorted by descending probability within each protein, so
# head() keeps the most confident ones.
submission_df = submission_df.groupby('protein_id').head(1500)

# Format probabilities to 3 significant figures
submission_df['probability'] = submission_df['probability'].apply(
    lambda x: f"{x:.3g}"
)

# Save submission
# Tab-separated, no header row and no index column.
submission_df.to_csv('submission.tsv', sep='\t', header=False, index=False)

n_predictions = len(submission_df)
n_proteins = submission_df['protein_id'].nunique()
# Guard the average against an empty submission.
avg_per_protein = n_predictions / n_proteins if n_proteins else 0.0

print(f"\nSubmission file created with {n_predictions} predictions")
print(f"Unique proteins: {n_proteins}")
print(f"Average predictions per protein: {avg_per_protein:.1f}")

# Show sample
print("\nSample predictions:")
print(submission_df.head(10).to_string(index=False))

print("\n✓ Complete! Submission file saved as 'submission.tsv'")

In [None]:
# Read the submission back for a visual check. The file is written
# tab-separated and without a header row, so both must be stated here;
# with the defaults the first prediction would be consumed as column names
# and every row would land in a single column.
# NOTE(review): assumes the notebook's working directory is /kaggle/working,
# where the relative path 'submission.tsv' was written - confirm.
read_sub = pd.read_csv(
    '/kaggle/working/submission.tsv',
    sep='\t',
    header=None,
    names=['protein_id', 'go_term', 'probability'],
)
read_sub.head(10)