# CAFA-6 Blend GOA Negative Propagation

This notebook implements ensemble blending of multiple model submissions with GOA Uniprot data and negative annotation filtering.

## Import Libraries

In [None]:
import os, gc
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm
import numpy as np

## Utility Functions

In [None]:
def read_train_terms(path):
    """Read a tab-separated annotation file into a protein -> GO terms mapping.

    The file is parsed without a header row; its three columns are labelled
    ``protein``, ``go`` and ``ont`` and every value is read as a string.

    Args:
        path: Location of the TSV file.

    Returns:
        A ``defaultdict(list)`` keyed by protein identifier. Each value lists
        the GO identifiers found for that protein, in file order, with
        duplicates kept.
    """
    # NOTE(review): header=None treats the first line as data. If the file
    # starts with a header row, that row becomes a bogus protein entry --
    # confirm against the actual file.
    df = pd.read_csv(path, sep="\t", header=None, names=["protein", "go", "ont"], dtype=str)

    mapping = defaultdict(list)
    # Walk the two needed columns in lockstep instead of using iterrows(),
    # which builds a Series object for every row.
    for protein, go in tqdm(zip(df["protein"], df["go"]), total=len(df)):
        mapping[protein].append(go)

    print(f"[io] Read training annotations for {len(mapping)} proteins from {path}")
    return mapping

def parse_obo(go_obo_path):
    """Extract ``is_a`` and ``part_of`` edges from an OBO ontology file.

    Only ``[Term]`` stanzas contribute edges. Other stanza types (for example
    ``[Typedef]``, whose ``id:`` / ``is_a:`` lines describe relations rather
    than terms) are skipped so relation names never appear in the maps.

    Args:
        go_obo_path: Path to the OBO file.

    Returns:
        A ``(parents, children)`` tuple of ``defaultdict(set)`` objects.
        ``parents[t]`` holds the direct ``is_a`` / ``part_of`` targets of
        term ``t``; ``children`` is the inverse mapping. Both are empty when
        the file does not exist.
    """
    parents = defaultdict(set)
    children = defaultdict(set)

    if not os.path.exists(go_obo_path):
        # Best-effort behaviour is kept (empty maps), but say so: with empty
        # maps nothing downstream can be propagated through the ontology.
        print(f"[io] OBO file not found: {go_obo_path}; returning empty ontology maps")
        return parents, children

    with open(go_obo_path, "r", encoding="utf-8") as f:
        cur_id = None
        in_term = False
        for line in f:
            line = line.strip()

            # A bracketed line opens a new stanza; forget the previous id.
            if line.startswith("[") and line.endswith("]"):
                in_term = line == "[Term]"
                cur_id = None
                continue
            if not in_term:
                continue

            if line.startswith("id: "):
                cur_id = line.split("id: ")[1].strip()
                continue

            pid = None
            if line.startswith("is_a: "):
                # "is_a: GO:0000001 ! name" -> second token is the parent id.
                pid = line.split()[1].strip()
            elif line.startswith("relationship: part_of "):
                # "relationship: part_of GO:0000001 ! name" -> third token.
                parts = line.split()
                if len(parts) >= 3:
                    pid = parts[2].strip()

            if pid is not None and cur_id:
                parents[cur_id].add(pid)
                children[pid].add(cur_id)

    print(f"[io] Parsed OBO: {len(parents)} nodes with parents")
    return parents, children

def get_ancestors(go_id, parents):
    """Collect every term reachable from ``go_id`` by following parent links.

    Args:
        go_id: Identifier of the starting term.
        parents: Mapping from a term to an iterable of its direct parents.

    Returns:
        The set of all ancestors. ``go_id`` itself only appears in the result
        if the mapping contains a cycle leading back to it.
    """
    seen = set()
    pending = [go_id]
    while pending:
        node = pending.pop()
        # Only parents not encountered before need to be expanded further.
        fresh = set(parents.get(node, [])) - seen
        seen |= fresh
        pending.extend(fresh)
    return seen

def get_descendants(go_id, children=None):
    """Collect every term reachable from ``go_id`` by following child links.

    Args:
        go_id: Identifier of the starting term.
        children: Mapping from a term to an iterable of its direct children.
            When omitted, the module-level ``children_map`` is used, so
            existing one-argument calls keep working.

    Returns:
        The set of all descendants. ``go_id`` itself only appears in the
        result if the mapping contains a cycle leading back to it.
    """
    if children is None:
        # Resolved at call time, so the global may be defined after this function.
        children = children_map

    desc = set()
    stack = [go_id]
    while stack:
        cur = stack.pop()
        for child in children.get(cur, []):
            if child not in desc:
                desc.add(child)
                stack.append(child)
    return desc

## Load Submissions and GOA Data

### Load Training Terms and GO Ontology

In [None]:
# The training annotations and the GO graph are read from the same folder.
train_dir = "/kaggle/input/cafa-6-protein-function-prediction/Train"

train_terms = read_train_terms(f"{train_dir}/train_terms.tsv")
parents_map, children_map = parse_obo(f"{train_dir}/go-basic.obo")

### Load GOA Uniprot Data

#### Read and De-duplicate the GOA Annotation Table

In [None]:
# Read the GOA table and collapse exact duplicate rows in one step.
go_annotations = pd.read_csv(
    '/kaggle/input/protein-go-annotations/goa_uniprot_all.csv'
).drop_duplicates()
print(f'[+] Dataset shape: {go_annotations.shape}')
go_annotations.head()

In [None]:
# Frequency of each distinct qualifier value in the annotation table.
go_annotations['qualifier'].value_counts()

## Negative Annotation Processing

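Negative annotations (rows whose qualifier contains `NOT`) state that a protein does NOT have a specific function.
Because GO is hierarchical, ruling out a term also rules out every more specific descendant of that term, so each negative annotation is propagated down the ontology.
The resulting protein-GO pairs are then used to filter out incorrect predictions.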

In [None]:
print(f"[1/3] Filtering Negative Annotations ..")
# Keep only rows whose qualifier contains "NOT"; rows with a missing
# qualifier are not treated as negative (na=False).
negative_annots = go_annotations[go_annotations['qualifier'].str.contains('NOT', na=False)]
negative_annots = negative_annots.drop(columns=['qualifier']).drop_duplicates()

print(f"[2/3] Propagate Negative Terms ..")
negative_annots = negative_annots.groupby('protein_id')['go_term'].apply(list).to_dict()

# The same negated term can occur for many proteins, so each term's
# descendant set is computed once and reused.
descendant_cache = {}
propagated = {}
for protein_id, direct_terms in tqdm(negative_annots.items()):
    expanded = set(direct_terms)
    for term in direct_terms:
        if term not in descendant_cache:
            descendant_cache[term] = get_descendants(term)
        expanded |= descendant_cache[term]
    propagated[protein_id] = sorted(expanded)

negative_annots = propagated

print(f"[3/3] Extract Unique Keys ..")
# Build the "<protein>_<term>" keys directly; no intermediate DataFrame is
# needed just to concatenate two strings per pair.
negative_keys = {
    f"{protein_id}_{go_term}"
    for protein_id, terms in negative_annots.items()
    for go_term in terms
}

print(f"Total unique negative protein-GO pairs: {len(negative_keys)}")

In [None]:
print(f"[1/4] Loading GOA Annotations ..")
# A fresh copy is read from disk, so the 'qualifier' column is present here.
go_annotations = pd.read_csv('/kaggle/input/protein-go-annotations/goa_uniprot_all.csv')

print(f"[2/4] Removing unwanted annotations ..")
# Drop NOT-qualified rows, then the qualifier column, then exact duplicates.
# The steps are chained so no in-place edit is made on a filtered frame.
is_negated = go_annotations['qualifier'].str.contains('NOT', na=False)
go_annotations = (
    go_annotations.loc[~is_negated]
    .drop(columns=['qualifier'])
    .drop_duplicates()
)

print(f"[3/4] Set Ground-Truth Score ..")
go_annotations['score'] = 1.0

print(f"[4/4] Setting Key ..")
go_annotations['pred_key'] = go_annotations['protein_id'].astype(str) + '_' + go_annotations['go_term'].astype(str)
# A pair that is also covered by a (propagated) negative annotation is not
# kept as ground truth.
go_annotations = go_annotations[~go_annotations['pred_key'].isin(negative_keys)]
goa_pred_keys = set(go_annotations['pred_key'])
print(f"[+] Total unique ground truth protein-GO pairs: {len(goa_pred_keys)}")
print(f"[✅] Done.")

## Ensemble Blending

### Load Submissions

In [None]:
def get_num_rows(file_path):
    """Return the number of lines in the text file at ``file_path``."""
    line_count = 0
    with open(file_path, 'r') as handle:
        # enumerate leaves line_count at the index of the last line seen;
        # an empty file never enters the loop and yields 0.
        for line_count, _ in enumerate(handle, start=1):
            pass
    return line_count
    

def load_submission(path, chunksize=50000, num_rows=None):
    """Load a headerless, tab-separated submission file in chunks.

    The file is expected to have exactly three columns, which are relabelled
    ``protein_id``, ``go_term`` and ``score``. A fourth column ``pred_key``
    (``"<protein_id>_<go_term>"``) is added.

    Args:
        path: Location of the submission file.
        chunksize: Number of rows read per chunk.
        num_rows: Total number of rows, used only to size the progress bar.
            Counted from the file when omitted.

    Returns:
        A DataFrame with columns ``protein_id``, ``go_term``, ``score`` and
        ``pred_key`` and a fresh 0..n-1 index.
    """
    if num_rows is None:
        num_rows = get_num_rows(path)

    # Ceiling division, so the bar's total equals the number of chunks that
    # will be read even when num_rows is an exact multiple of chunksize.
    total = (num_rows + chunksize - 1) // chunksize
    chunks = []

    # The context manager closes the underlying file even if a chunk fails.
    with pd.read_csv(path, sep='\t', header=None, chunksize=chunksize) as reader:
        for chunk in tqdm(reader, total=total):
            chunk['pred_key'] = chunk[0].astype(str) + '_' + chunk[1].astype(str)
            chunks.append(chunk)

    df = pd.concat(chunks, ignore_index=True)
    df.columns = ['protein_id', 'go_term', 'score', 'pred_key']
    return df

In [None]:
print(f"[1/2] Loading 1st submission ..")
A = load_submission('/kaggle/input/nnn-kmer-tfidf-sgd/submission.tsv')
# Discard rows scored below 0.04 (rows with a missing score are not matched
# by the comparison and therefore stay), then cap scores at 1.0.
A = A.loc[~(A['score'] < 0.04)].copy()
A['score'] = A['score'].clip(upper=1.0)

print(f"[2/2] Loading 2nd submission ..")
B = load_submission('/kaggle/input/nnn-protbert-and-kmer-td-idf-fusion/submission.tsv')
# Remove any row that has a missing value in any column.
B = B.dropna()

A.shape, B.shape

## Prediction Filtering

### Remove GOA-Known and Negative Pairs from Predictions

In [None]:
def _without_keys(frame, keys):
    """Return the rows of ``frame`` whose ``pred_key`` is not in ``keys``."""
    keep = ~frame['pred_key'].isin(keys)
    return frame[keep]


# First strip pairs already present in the GOA ground truth, then pairs
# ruled out by negative annotations, from each model's predictions.
print(f"[1/4] Removing Ground-Truth from A ..")
A = _without_keys(A, goa_pred_keys)
print(f"[2/4] Removing Ground-Truth from B ..")
B = _without_keys(B, goa_pred_keys)

print(f"[3/4] Removing Negatives from A ..")
A = _without_keys(A, negative_keys)
print(f"[4/4] Removing Negatives from B ..")
B = _without_keys(B, negative_keys)

## Weighted Blending

### Average Predictions Shared by Both Models

In [None]:
print(f"[1/3] Intersection keys ..")
A_keys, B_keys = set(A['pred_key']), set(B['pred_key'])
intersect_keys = A_keys.intersection(B_keys)

print(f"[2/3] Intersection ..")

# Blend weights: each model's leaderboard score
wa, wb = 0.255, 0.213

# Rows of each model whose protein-GO pair was also predicted by the other.
A_inter = A.loc[A['pred_key'].isin(intersect_keys)].copy()
B_inter = B.loc[B['pred_key'].isin(intersect_keys)].copy()

# Line up the two scores for every shared pair as score_a / score_b.
inter = pd.merge(
    A_inter,
    B_inter[['pred_key', 'score']],
    on='pred_key',
    suffixes=('_a', '_b'),
)

print(f"[3/3] Weighted average sum ..")
weighted_sum = inter['score_a'] * wa + inter['score_b'] * wb
inter['score'] = weighted_sum / (wa + wb)
inter = inter.drop(columns=['score_a', 'score_b'])
print(f"[*] Done.")
inter.shape

In [None]:
print(f"[1/2] Add missing rows ..")
# Pairs predicted by only one of the two models keep that model's score.
AnotB = A[~A.pred_key.isin(B_keys)]
BnotA = B[~B.pred_key.isin(A_keys)]

print(f"[2/2] Merging ..")
# Pick the output columns by name so the submission layout does not depend on
# the column order (or any extra columns) of the individual frames.
submission_columns = ['protein_id', 'go_term', 'score']
submission = pd.concat(
    [frame[submission_columns] for frame in (go_annotations, inter, AnotB, BnotA)],
    axis=0,
    ignore_index=True,
)
submission.shape

## Write Final Submission

### Save Submission File

In [None]:
print(f'[*] Saving submission...')
# header=False is the documented way to omit the header row; the file is
# written tab-separated, without the index.
submission.to_csv('submission.tsv', sep='\t', index=False, header=False)
print(f"[*] Done.")

In [None]:
# Shell escape: show the first lines of the written file as a quick sanity check.
!head submission.tsv