In [None]:
import pandas as pd

In [None]:
class config:
    """Tunable settings for the post-processing cells of this notebook."""

    # Per-protein cap on GO terms; only referenced by the commented-out top-k cell.
    max_num_go_per_protein = 1_800
    # Rows whose score is below this value are dropped during post-processing.
    score_threshold = 0.04

### ⏳ Load Data

In [None]:
def load_submission(path):
    """Read a headerless, tab-separated submission file and tag each row with a key.

    Args:
        path: Location of the TSV file; handed straight to ``pd.read_csv``.

    Returns:
        DataFrame whose columns are the integer positions produced by
        ``header=None`` plus a ``pred_key`` column holding
        ``"<column 0>_<column 1>"`` built from the string form of both values.
    """
    frame = pd.read_csv(path, header=None, sep='\t')
    left_part = frame[0].astype(str)
    right_part = frame[1].astype(str)
    frame['pred_key'] = left_part + '_' + right_part
    return frame

In [None]:
# Build the base submission: every row of A, plus the rows of B whose
# (column 0, column 1) key does not already occur in A.
print("[1/5] Loading Data ...")
A = load_submission('/kaggle/input/gaf-submission/submission.tsv')  # LB Score: 0.269
B = load_submission('/kaggle/input/merge-of-2submission-lb-0-25/submission.tsv')  # LB Score: 0.250

print("[2/5] Filtering B ...")
# A takes precedence: a key present in both files keeps A's row only.
# (The former A_idx / B_idx indexed copies were never read, so they are no
# longer built.)
B_filtered = B[~B['pred_key'].isin(A['pred_key'])]

print("[3/5] Merging ..")
submission = pd.concat([A, B_filtered], ignore_index=True)

print("[4/5] Finalizing format ..")
# ignore_index=True above already produced a fresh 0..n-1 index, so only the
# helper pred_key column has to be dropped here.
submission = submission[[0, 1, 2]]

print(f"[5/5] Submission shape: {submission.shape}")

### 🧹 Post-Processing

In [None]:
# Handle very low scores and scores > 1
# Steps: name the three positional columns, drop rows scoring below
# config.score_threshold, and cap the remaining scores at 1.0.
# Every step returns a new frame instead of assigning into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
submission = submission.set_axis(['ProteinID', 'GO_Term', 'Score'], axis=1)
submission = (
    submission.loc[submission['Score'] >= config.score_threshold]
    .assign(Score=lambda kept: kept['Score'].clip(upper=1.0))
)
submission.shape

In [None]:
# # (Disabled) Keep only the top `config.max_num_go_per_protein` (1800) GO terms per protein, ranked by score
# submission = (
#     submission.sort_values(['ProteinID', 'Score'], ascending=[True, False])
#     .groupby('ProteinID', group_keys=False)
#     .head(config.max_num_go_per_protein)
# )
# submission.shape

### 🔗 Merge

In [None]:
# GT data collected using QuickGO API
# Only the two key columns are read from the CSV; every ground-truth
# annotation is then given a score of 1.0.
protein_go_annotations = pd.read_csv(
    '/kaggle/input/protein-go-annotations-taxonomy/protein_go_annotations.csv',
    usecols=['ProteinID', 'GO_Term'],
)
# usecols keeps the file's own column order, so the order is fixed explicitly.
# assign() builds a new frame rather than setting a column on a column subset,
# which would raise SettingWithCopyWarning. The previous round(1.0, 3) was a
# no-op and is written as the plain constant.
protein_go_annotations = protein_go_annotations[['ProteinID', 'GO_Term']].assign(Score=1.0)

In [None]:
print(f'[‚è≥] Combining Submissions ...')

# Stack the predictions on top of the ground-truth rows and order everything
# from highest to lowest score.
combined = pd.concat(
    [submission, protein_go_annotations],
    ignore_index=True,
).sort_values(by='Score', ascending=False)

# With the highest score first, keeping the first occurrence of each
# (ProteinID, GO_Term) pair retains the ground-truth 1.0 row on overlap.
final_submission = (
    combined
    .drop_duplicates(subset=['ProteinID', 'GO_Term'], keep='first')
    .reset_index(drop=True)
)
final_submission.shape

### 📤 Submit

In [None]:
# Write the final frame to ./submission.tsv as tab-separated text with
# neither the index nor a header row.
print('[‚è≥] Saving Submission ...')
# header=False is the documented way to suppress the header row; the previous
# header=None only worked because None is falsy.
final_submission.to_csv('submission.tsv', sep='\t', index=False, header=False)
print("[+] Done.")