The most common reasons a **CAFA (or CAFA-Kaggle)** submission fails are:

| Problem | Description | Symptom |
|----------|--------------|----------|
| **Incorrect columns** | Must be exactly **3 columns**, tab-separated: `UniProt_ID`, `GO_term`, `Score` | Any space, extra column, or comma breaks it |
| **Header line included** | The competition expects **no header** | “Header line detected” error |
| **Invalid GO IDs** | Must match `GO:\d{7}` pattern | Error about “invalid GO term” |
| **Invalid protein IDs** | Must match list in the provided targets | “Unknown protein” error |
| **Score format** | Must be a valid float between `0` and `1`, using `.` as decimal | “Invalid score” or “parsing error” |
| **Duplicate (Protein, GO) pairs** | Each combination should appear **once** | Error about duplicates |
| **File encoding / line endings** | Should be **UTF-8**, UNIX (`\n`) line endings | “Could not parse submission” after long wait |

This script attempts to find these errors and, where it can (duplicate pairs, unknown protein IDs), write a corrected copy of the submission.

In [None]:
import pandas as pd
import sys
import os
import re


# Directory this script runs from. `__file__` is not defined when the code is
# executed as a notebook cell or in an interactive session, so fall back to
# the current working directory instead of failing with NameError.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()

# The data paths below are relative, so make CURRENT_DIR the working directory
# and make sure it is on the import path.
os.chdir(CURRENT_DIR)
if CURRENT_DIR not in sys.path:
    sys.path.insert(0, CURRENT_DIR)

SUBMISSION_FILE = '../data/submission.tsv'
SAMPLE_SUBMISSION = '../data/sample_submission.tsv'
FASTA_FILE = '../data/testsuperset.fasta'


def validate_submission(path):
    """Check a tab-separated submission file and print what is wrong with it.

    The file is read as a headerless table with three columns
    (``UniProt_ID``, ``GO_term``, ``Score``). Every finding is printed to
    stdout. Two kinds of problems are also repaired by writing a corrected
    copy into the current working directory:

    * duplicate ``(UniProt_ID, GO_term)`` pairs -> ``submission_nodup.tsv``
      (first occurrence kept); validation stops after writing this file.
    * rows whose protein ID does not appear in the headers of ``FASTA_FILE``
      -> ``submission_filtered.tsv`` (those rows are dropped).

    Args:
        path: Path of the submission file to validate.

    Returns:
        None. Validation ends early when the file is empty, does not have
        exactly three columns, has scores that cannot be parsed as floats,
        or contains duplicate pairs.
    """
    try:
        df = pd.read_csv(path, sep='\t', header=None, dtype=str)
    except pd.errors.EmptyDataError:
        print("❌ Submission file is empty.")
        return
    print(f"Loaded {len(df):,} lines")

    # --- Structure ---
    if df.shape[1] != 3:
        print(f"❌ Invalid number of columns ({df.shape[1]}). Expected 3.")
        return

    df.columns = ["UniProt_ID", "GO_term", "Score"]

    # --- GO term format ---
    # na=False: an empty GO field is read as NaN; treat it as "no match" so
    # the result is a clean boolean mask that can be negated.
    invalid_go = ~df["GO_term"].str.match(r"^GO:\d{7}$", na=False)
    if invalid_go.any():
        print(f"❌ Invalid GO IDs found (examples: {df[invalid_go].head(5).to_dict('records')})")
    else:
        print("✅ All GO terms valid.")

    # --- Score validity ---
    try:
        df["Score"] = df["Score"].astype(float)
    except (ValueError, TypeError) as e:
        print(f"❌ Could not parse some scores as floats: {e}")
        return

    # Negating between() flags NaN (missing) scores as well: a NaN fails both
    # "< 0" and "> 1", so explicit comparisons would let it through.
    bad_scores = ~df["Score"].between(0, 1)
    if bad_scores.any():
        print(f"❌ Found {bad_scores.sum()} scores missing or outside [0,1].")
    else:
        print("✅ All scores in [0,1].")

    # --- Duplicates ---
    pair_columns = ["UniProt_ID", "GO_term"]
    dups = df.duplicated(subset=pair_columns)
    if dups.any():
        print(f"❌ Found {dups.sum()} duplicate (UniProt, GO) pairs.")
        df_new = df.drop_duplicates(subset=pair_columns, keep="first")
        df_new.to_csv("submission_nodup.tsv", sep="\t", header=False, index=False)
        print(f"Removed duplicates. New file has {len(df_new):,} lines.")
        # Stop here, but by returning rather than calling exit(): terminating
        # the interpreter from inside a function also kills any caller
        # (for example a notebook kernel).
        return
    print("✅ No duplicate pairs.")

    # --- Reference protein IDs ---
    # Reference IDs are the first whitespace-delimited token after '>' on
    # each FASTA header line.
    ref_ids = set()
    with open(FASTA_FILE) as f:
        for line in f:
            if line.startswith(">"):
                header = re.match(r">(\S+)", line)
                # A header such as "> description" has no ID token; skip it.
                if header:
                    ref_ids.add(header.group(1))

    # Rows whose UniProt_ID is not in the reference set.
    known = df["UniProt_ID"].isin(ref_ids)
    missing = df.loc[~known]
    if not missing.empty:
        print(f"⚠️ {len(missing)} proteins not in reference list (example: {missing['UniProt_ID'].iloc[0]})")
        df = df[known]
        df.to_csv("submission_filtered.tsv", sep="\t", header=False, index=False)
        print(f"Filtered submission now has {len(df):,} lines (valid proteins only).")
    else:
        print("✅ All protein IDs match reference list.")

    print("✅ Validation complete — format looks correct if no ❌ shown.")

if __name__ == "__main__":
    # Run as a script: validate the default submission file.
    validate_submission(path=SUBMISSION_FILE)