# CAFA-6: GOA + ProtT5 Ensemble (0.370)
**Approach:**
- GOA (Gene Ontology Annotation) database
- ProtT5 + InterPro predictions
- GOA+ propagation
**Score:** 0.370
## Required Dataset
Add dataset: `ymuroya47/cafa6-goa-predictions`
Contains:
- `goa_submission.tsv` - GOA database predictions
- `prott5_interpro_predictions.tsv` - ProtT5 + InterPro predictions

In [None]:
import os
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm

# Kaggle input directories: competition files and the pre-computed predictions.
COMPETITION_DATA = '/kaggle/input/cafa-6-protein-function-prediction'
PREDICTION_DATA = '/kaggle/input/cafa6-goa-predictions'

# Sanity check: list both directories; a missing dataset raises here.
for label, directory in (
    ("Competition data:", COMPETITION_DATA),
    ("Prediction data:", PREDICTION_DATA),
):
    print(label, os.listdir(directory))

## 1. Parse GO Ontology

In [None]:
OBO_PATH = f'{COMPETITION_DATA}/Train/go-basic.obo'

# Direct parents of each GO term: term id -> set of parent ids named on the
# term's `is_a:` and `relationship: part_of` lines.
term_parents = defaultdict(set)
with open(OBO_PATH, 'r', encoding='utf-8') as f:
    cur_id = None
    in_term = False
    for line in f:
        line = line.strip()
        if line.startswith('['):
            # Stanza header. Only [Term] stanzas describe GO terms; other
            # stanzas (e.g. [Typedef]) also carry `id:` / `is_a:` tags and
            # must not be recorded as terms.
            in_term = line == '[Term]'
            cur_id = None
        elif not in_term:
            # File header or a non-term stanza: nothing to collect.
            continue
        elif line.startswith('id: '):
            cur_id = line.split('id: ')[1].strip()
        elif line.startswith('is_a: ') and cur_id:
            # "is_a: <parent id> ! <name>" -> second whitespace-separated token
            term_parents[cur_id].add(line.split()[1].strip())
        elif line.startswith('relationship: part_of ') and cur_id:
            # "relationship: part_of <parent id> ! <name>" -> third token
            term_parents[cur_id].add(line.split()[2].strip())

# Root terms of the GO hierarchy. process_protein skips them during negative
# propagation and power scaling, then forces their score to 1.0.
# (Presumably the molecular_function, biological_process and
# cellular_component roots -- verify against go-basic.obo.)
ROOTS = {'GO:0003674', 'GO:0008150', 'GO:0005575'}

# Memo table: term id -> set of all (transitive) ancestor ids.
ancestors_map = {}


def get_ancestors(term):
    """Return the set of every ancestor of ``term``.

    Ancestors are found by following ``term_parents`` recursively. Results
    are memoised in ``ancestors_map``; the returned set is the cached object
    itself, so callers should treat it as read-only. A term with no recorded
    parents yields an empty set.
    """
    cached = ancestors_map.get(term)
    if cached is not None:
        return cached

    direct = term_parents.get(term, set())
    closure = set(direct)
    for parent in direct:
        closure.update(get_ancestors(parent))

    ancestors_map[term] = closure
    return closure

# Pre-compute the ancestor closure of every term that has at least one parent,
# so later lookups are served from ancestors_map.
for go_id in term_parents:
    get_ancestors(go_id)

print(f"GO terms: {len(ancestors_map)}")

## 2. Load Predictions

In [None]:
def load_predictions(filepath):
    """Read a tab-separated prediction file into nested dicts.

    Each line is stripped and split on tabs; lines with fewer than three
    fields are ignored. The first three fields are taken as protein id,
    GO term and score (parsed with ``float``). When a (protein, term) pair
    occurs more than once, the largest score is kept.

    Returns:
        defaultdict mapping protein id -> {go_term: score}.
    """
    data = defaultdict(dict)
    with open(filepath, 'r') as handle:
        progress = tqdm(handle, desc=f"Loading {os.path.basename(filepath)}")
        for raw in progress:
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                continue
            pid, go = fields[0], fields[1]
            score = float(fields[2])
            per_protein = data[pid]
            per_protein[go] = max(per_protein.get(go, 0), score)
    return data

# GOA-derived predictions: {protein: {go_term: score}}
goa_path = f'{PREDICTION_DATA}/goa_submission.tsv'
goa_preds = load_predictions(goa_path)
print(f"GOA proteins: {len(goa_preds)}")

# ProtT5 + InterPro model predictions, same layout
prott5_path = f'{PREDICTION_DATA}/prott5_interpro_predictions.tsv'
prott5_preds = load_predictions(prott5_path)
print(f"ProtT5+InterPro proteins: {len(prott5_preds)}")

## 3. Create Ensemble
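**Weights** (applied only when both sources score the same protein/term pair; a term predicted by just one source keeps that source's score unchanged):
- GOA: 55% (curated database annotations)
- ProtT5+InterPro: 45% (model predictions)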

In [None]:
# Blend weights, used only for terms that both sources score above zero.
WEIGHT_GOA = 0.55
WEIGHT_PROTT5 = 0.45

print(f"Ensemble weights: GOA={WEIGHT_GOA}, ProtT5={WEIGHT_PROTT5}")

# Every protein seen by either source.
all_proteins = set(goa_preds.keys()) | set(prott5_preds.keys())
ensemble = defaultdict(dict)

for pid in tqdm(all_proteins, desc="Creating ensemble"):
    goa = goa_preds.get(pid, {})
    prott5 = prott5_preds.get(pid, {})

    for term in set(goa.keys()) | set(prott5.keys()):
        goa_score = goa.get(term, 0)
        prott5_score = prott5.get(term, 0)

        if not (goa_score > 0):
            # GOA has no positive score -> take the ProtT5 score as is
            combined = prott5_score
        elif prott5_score > 0:
            # Both sources agree on the term -> weighted average
            combined = WEIGHT_GOA * goa_score + WEIGHT_PROTT5 * prott5_score
        else:
            # Only GOA scores the term
            combined = goa_score

        ensemble[pid][term] = combined

print(f"Ensemble proteins: {len(ensemble)}")

## 4. Apply GOA+ Propagation
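- Positive propagation: each ancestor's score is raised to at least the score of its highest-scoring descendant (parent >= child)
- Negative propagation: a non-root term scoring above its lowest-scoring ancestor is pulled toward that ancestor's score
- Power scaling: when the best non-root score is below the cap, non-root scores are rescaled so the best one reaches the cap, boosting the confidence of top predictions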

In [None]:
# Post-processing knobs read by process_protein and the top-K selection.
CONFIG = {
    'NEG_PROP_ALPHA': 0.7,   # weight of the lowest ancestor score in negative propagation
    'SCALING_POWER': 0.8,    # exponent applied to score / max during power scaling
    'MAX_SCORE': 0.95,       # value the best non-root score is rescaled to
    'TOP_K': 200,            # number of terms kept per protein
}


def process_protein(scores_dict):
    """Post-process one protein's {go_term: score} mapping.

    Works on a copy of ``scores_dict`` and applies, in order:

    1. Positive propagation: every ancestor of a scored term receives at
       least that term's score.
    2. Negative propagation: a non-root term scoring above its
       lowest-scoring ancestor is blended toward that ancestor's score.
    3. Power scaling: if the best non-root score lies strictly between 0 and
       ``CONFIG['MAX_SCORE']``, all non-root scores are rescaled relative to
       it (capped at 1.0).
    4. All terms in ``ROOTS`` are set to 1.0.

    Returns:
        The new mapping; the input is not modified.
    """
    updated = scores_dict.copy()

    # 1. Positive propagation: lift ancestors up to their descendants.
    for term, score in scores_dict.items():
        for anc in get_ancestors(term):
            updated[anc] = max(updated.get(anc, 0), score)

    # 2. Negative propagation: pull a term toward its weakest ancestor.
    # NOTE(review): after step 1 every ancestor already scores at least as
    # high as its descendants, so this branch looks unreachable -- confirm
    # whether the two passes were meant to run in the other order.
    for term in list(updated):
        if term in ROOTS:
            continue
        present = [updated[a] for a in get_ancestors(term) if a in updated]
        if not present:
            continue
        weakest = min(present)
        if weakest < updated[term]:
            alpha = CONFIG['NEG_PROP_ALPHA']
            updated[term] = alpha * weakest + (1 - alpha) * updated[term]

    # 3. Power scaling of non-root scores, relative to the best one.
    non_root_scores = [s for t, s in updated.items() if t not in ROOTS]
    if non_root_scores:
        peak = max(non_root_scores)
        if 0 < peak < CONFIG['MAX_SCORE']:
            for t in updated:
                if t in ROOTS:
                    continue
                rescaled = np.power(updated[t] / peak, CONFIG['SCALING_POWER']) * CONFIG['MAX_SCORE']
                updated[t] = min(1.0, rescaled)

    # 4. Roots are always reported with full confidence.
    updated.update(dict.fromkeys(ROOTS, 1.0))

    return updated

In [None]:
# Submission lines, one "protein<TAB>term<TAB>score" string per prediction.
final_rows = []

for pid, scores in tqdm(ensemble.items(), desc="Applying propagation"):
    # Rank the propagated scores from highest to lowest (stable on ties).
    ranked = list(process_protein(scores).items())
    ranked.sort(key=lambda pair: -pair[1])

    # Keep the top-K terms, dropping any that score below 0.001.
    final_rows.extend(
        f"{pid}\t{go}\t{s:.6f}"
        for go, s in ranked[:CONFIG['TOP_K']]
        if s >= 0.001
    )

print(f"Total predictions: {len(final_rows):,}")

## 5. Write Submission

In [None]:
OUTPUT_FILE = 'submission.tsv'

# One prediction per line, written in a single batched call.
with open(OUTPUT_FILE, 'w') as f:
    f.writelines(row + "\n" for row in final_rows)

# Report the resulting file size in MiB together with the row count.
size_mb = os.path.getsize(OUTPUT_FILE) / 1024 ** 2
print(f"Submission file: {OUTPUT_FILE}")
print(f"Size: {size_mb:.1f} MB")
print(f"Predictions: {len(final_rows):,}")

## Summary
This notebook achieves **0.370** by combining:
1. **GOA Database** - Curated GO annotations from UniProt-GOA
2. **ProtT5 + InterPro** - Protein language model predictions
3. **GOA+ Propagation** - Ontology-aware post-processing