In [None]:
# Runtime environment selector: set to "kaggle" when running on Kaggle,
# keep "local" for any other machine.
ENVIRONMENT = "local"

In [None]:
%pip install pandas numpy scikit-learn -q

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

# Confirmation that every import above resolved (emoji restored from mis-encoded bytes).
print("✅ Imports successful")

In [None]:
# Resolve the dataset root: the Kaggle input mount when ENVIRONMENT is 'kaggle',
# otherwise the parent of the current working directory.
if ENVIRONMENT == 'kaggle':
    base_dir = Path("/kaggle/input/cafa-6-dataset")
else:
    base_dir = Path.cwd().parent

# Emoji restored from mis-encoded bytes.
print(f"📁 Base directory: {base_dir}")

## 1. Load Training Data

In [None]:
# Read the tab-separated annotation table from <base_dir>/Train/train_terms.tsv.
print("Loading training annotations...")
annotations_path = base_dir / "Train" / "train_terms.tsv"
train_terms = pd.read_csv(annotations_path, sep="\t")

# Size summary: row count plus distinct values of the EntryID and term columns.
print(f"Total annotations: {len(train_terms)}")
print(f"Unique proteins: {train_terms['EntryID'].nunique()}")
print(f"Unique GO terms: {train_terms['term'].nunique()}")

# Preview of the first rows.
print("\nFirst few rows:")
print(train_terms.head())

## 2. Load IA Weights

In [None]:
# Read the Information Accretion (IA) table and index the IA column by term.
print("Loading IA weights...")
ia_path = base_dir / "IA.tsv"
ia_df = pd.read_csv(ia_path, sep="\t")
ia_weights = {go_term: weight for go_term, weight in zip(ia_df["term"], ia_df["IA"])}

print(f"IA weights available: {len(ia_weights)}")
print(f"\nExample weights:")
# Print the first five entries of the mapping as a sanity check.
for term in list(ia_weights)[:5]:
    print(f"  {term}: {ia_weights[term]:.4f}")

## 3. Build Frequency Model

In [None]:
# Tally how many annotation rows mention each term.
print("Counting GO term frequencies...")
term_counts = Counter(train_terms["term"])

# Normalise every count by the total number of annotation rows.
# NOTE(review): this is a share of annotation rows, not the fraction of
# proteins carrying the term — confirm that is the intended score.
total_annotations = len(train_terms)
term_probs = {}
for term, count in term_counts.items():
    term_probs[term] = count / total_annotations

print(f"Total unique terms: {len(term_probs)}")
print(f"\nTop 10 most frequent terms:")
top_terms = term_counts.most_common(10)
for term, count in top_terms:
    prob = term_probs[term]
    print(f"  {term}: {count} ({prob:.4f})")

## 4. Create Validation Set

In [None]:
# Split on distinct proteins (EntryID values) rather than annotation rows,
# holding out 20% of them with a fixed seed.
all_proteins = train_terms["EntryID"].unique()
train_proteins, val_proteins = train_test_split(
    all_proteins,
    test_size=0.2,
    random_state=42,
)

print(f"Train proteins: {len(train_proteins)}")
print(f"Val proteins: {len(val_proteins)}")

# Keep only the annotation rows whose protein landed in the held-out set.
is_val_row = train_terms["EntryID"].isin(val_proteins)
val_data = train_terms[is_val_row]
print(f"\nValidation annotations: {len(val_data)}")

## 5. Evaluation Function

In [None]:
def evaluate_predictions(predictions_df, ground_truth_df, ia_weights_dict, threshold=0.01):
    """
    Score thresholded predictions against ground truth with IA-weighted metrics.

    Precision, recall and F1 are computed per ground-truth protein from the
    summed weights of true-positive, false-positive and false-negative terms,
    then averaged over all ground-truth proteins. A protein with no prediction
    at or above the threshold contributes 0.0 to every average.

    Args:
        predictions_df: DataFrame with columns [EntryID, term, probability]
        ground_truth_df: DataFrame with columns [EntryID, term]
        ia_weights_dict: Dict mapping terms to IA weights; terms missing from
            the dict are weighted 1.0
        threshold: Minimum probability (inclusive) for a prediction to be kept

    Returns:
        Dict with keys 'f1', 'precision', 'recall' (means over ground-truth
        proteins) and 'coverage' (fraction of ground-truth proteins with at
        least one retained prediction, always within [0, 1]). Every value is
        0.0 when the ground truth is empty.
    """
    def _terms_by_protein(frame):
        # Iterating the groups directly also works for an empty frame.
        return {protein: set(terms) for protein, terms in frame.groupby('EntryID')['term']}

    def _ia_sum(terms):
        return sum(ia_weights_dict.get(t, 1.0) for t in terms)

    # Filter predictions by threshold, then collect term sets per protein
    pred_filtered = predictions_df[predictions_df['probability'] >= threshold]
    pred_grouped = _terms_by_protein(pred_filtered)
    true_grouped = _terms_by_protein(ground_truth_df)

    # Nothing to score: avoid dividing by zero / averaging an empty list
    if not true_grouped:
        return {'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'coverage': 0.0}

    f1_scores = []
    precisions = []
    recalls = []
    covered = 0  # ground-truth proteins that received at least one prediction

    for protein, true_terms in true_grouped.items():
        pred_terms = pred_grouped.get(protein)

        if not pred_terms:
            f1_scores.append(0.0)
            precisions.append(0.0)
            recalls.append(0.0)
            continue

        covered += 1

        # Compute weighted metrics
        tp_weight = _ia_sum(true_terms & pred_terms)
        fp_weight = _ia_sum(pred_terms - true_terms)
        fn_weight = _ia_sum(true_terms - pred_terms)

        precision = tp_weight / (tp_weight + fp_weight) if (tp_weight + fp_weight) > 0 else 0.0
        recall = tp_weight / (tp_weight + fn_weight) if (tp_weight + fn_weight) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        f1_scores.append(f1)
        precisions.append(precision)
        recalls.append(recall)

    return {
        'f1': np.mean(f1_scores),
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        # Only ground-truth proteins count, so predictions for unrelated
        # proteins cannot push coverage above 1.
        'coverage': covered / len(true_grouped)
    }

# Confirmation that the evaluation function is defined (emoji restored from mis-encoded bytes).
print("✅ Evaluation function defined")

## 6. Generate Predictions

In [None]:
# For each validation protein, assign every term with its training frequency.
print("Generating frequency-based predictions...")

# Fit the frequencies on the training proteins only. term_probs was counted
# over all of train_terms, which includes the validation proteins' own
# annotations, so using it here would leak them into the validation scores.
# The normalisation (count / number of annotation rows) is unchanged.
fit_terms = train_terms.loc[train_terms['EntryID'].isin(train_proteins), 'term']
val_term_probs = fit_terms.value_counts() / len(fit_terms)

# Build the protein x term table from repeated/tiled arrays instead of one
# Python dict per row; the per-row dicts dominate memory at this row count.
n_val_proteins = len(val_proteins)
n_fit_terms = len(val_term_probs)
predictions_df = pd.DataFrame({
    'EntryID': np.repeat(np.asarray(val_proteins, dtype=object), n_fit_terms),
    'term': np.tile(val_term_probs.index.to_numpy(dtype=object), n_val_proteins),
    'probability': np.tile(val_term_probs.to_numpy(), n_val_proteins),
})

print(f"\nTotal predictions: {len(predictions_df)}")
print(f"Predictions per protein: {len(predictions_df) / len(val_proteins):.0f}")

## 7. Evaluate with Different Thresholds

In [None]:
print("Testing different thresholds...\n")

# Probability cut-offs to sweep; each one is scored on the validation set.
thresholds = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.05]
results = []

for thr in thresholds:
    metrics = evaluate_predictions(predictions_df, val_data, ia_weights, threshold=thr)
    # One flat record per threshold: the cut-off plus every metric returned.
    results.append({
        'threshold': thr,
        **metrics
    })
    print(f"Threshold {thr:.4f}: F1={metrics['f1']:.4f}, P={metrics['precision']:.4f}, "
          f"R={metrics['recall']:.4f}, Coverage={metrics['coverage']:.2%}")

# Find best threshold by mean F1 (emoji restored from mis-encoded bytes)
best_result = max(results, key=lambda x: x['f1'])
print(f"\n🏆 Best F1: {best_result['f1']:.4f} at threshold {best_result['threshold']:.4f}")

## 8. Save Results

In [None]:
# Write the per-threshold metrics to a CSV in the current working directory.
results_df = pd.DataFrame(results)
output_path = Path("01_frequency_baseline_results.csv")
results_df.to_csv(output_path, index=False)

# Status messages (emoji restored from mis-encoded bytes).
print(f"✅ Results saved to {output_path}")
print("\n📊 Results:")
print(results_df.to_string(index=False))

## Summary

**Frequency Baseline Performance:**
- Simple approach: predict terms by training frequency
- No protein-specific information used
- Expected F1: ~0.14 (based on previous experiments)

**Next:** 02_baseline_knn.ipynb - Add sequence similarity