# In [None]:
import pandas as pd

class Config:
    """Tunable constants for the submission-merging script."""

    # Rows whose Score is below this value are dropped in merge_submissions.
    score_threshold: float = 0.04

    # Not referenced by the code visible in this file; presumably a cap on
    # the number of GO terms kept per protein -- TODO confirm before relying
    # on it.
    max_num_go_per_protein: int = 1800

def load_submission(path):
    """Read a header-less, tab-separated prediction file.

    Args:
        path: Location of the TSV file to read.

    Returns:
        A DataFrame with the file's positional columns (0, 1, 2, ...) plus a
        ``pred_key`` column holding ``"<column 0>_<column 1>"`` as a string,
        which identifies each row by its first two fields.
    """
    frame = pd.read_csv(path, sep='\t', header=None)
    first_field = frame[0].astype(str)
    second_field = frame[1].astype(str)
    frame['pred_key'] = first_field + '_' + second_field
    return frame

def merge_submissions(path_a, path_b):
    """Merge two prediction files, letting the first one take precedence.

    Every row of ``path_a`` is kept. A row of ``path_b`` is kept only when
    its ``pred_key`` (first two fields joined by ``_``) does not occur in
    ``path_a``. The merged rows are then filtered and capped:

    * rows with ``Score`` below ``Config.score_threshold`` are removed;
    * remaining scores greater than 1.0 are clipped to 1.0.

    Args:
        path_a: Path of the higher-priority TSV prediction file.
        path_b: Path of the TSV prediction file used to fill in keys that
            ``path_a`` lacks.

    Returns:
        A DataFrame with columns ``ProteinID``, ``GO_Term`` and ``Score``.
        The index is whatever survives the threshold filter (not reset).
    """
    print("[1/5] Loading & merging submissions ...")
    primary = load_submission(path_a)
    secondary = load_submission(path_b)

    # Rows of the secondary file whose key the primary file already covers.
    already_covered = secondary['pred_key'].isin(primary['pred_key'])
    additions = secondary[~already_covered]

    stacked = pd.concat([primary, additions], ignore_index=True)
    # Keep the three positional data columns and give them readable names.
    named = stacked[[0, 1, 2]].rename(
        columns={0: 'ProteinID', 1: 'GO_Term', 2: 'Score'}
    )

    confident = named[named['Score'] >= Config.score_threshold]
    return confident.assign(Score=confident['Score'].clip(upper=1.0))

def load_annotations(path):
    """Load the annotation table and give every row a score of 1.0.

    Args:
        path: Location of a CSV file that has at least the columns
            ``ProteinID`` and ``GO_Term``.

    Returns:
        A DataFrame with columns ``ProteinID``, ``GO_Term`` and ``Score``
        (in that order), where ``Score`` is 1.0 on every row. Any other
        columns present in the file are discarded.
    """
    print("[2/5] Loading annotations ...")
    table = pd.read_csv(path)
    pairs = table[['ProteinID', 'GO_Term']]
    return pairs.assign(Score=1.0)

def combine_data(submission_df, annotation_df):
    """Stack predictions and annotations, keeping the best score per pair.

    The two frames are concatenated, ordered by ``Score`` from highest to
    lowest, and de-duplicated on ``(ProteinID, GO_Term)`` so that only the
    first (highest-scoring) row of each pair survives.

    Args:
        submission_df: DataFrame with ``ProteinID``, ``GO_Term``, ``Score``.
        annotation_df: DataFrame with the same three columns.

    Returns:
        A new DataFrame sorted by descending ``Score`` with a fresh
        0..n-1 index. The input frames are not modified.
    """
    print("[3/5] Combining submission with annotations ...")
    stacked = pd.concat([submission_df, annotation_df], ignore_index=True)
    ranked = stacked.sort_values(by='Score', ascending=False)
    # After the descending sort, the first row of each pair is its maximum.
    unique_pairs = ranked.drop_duplicates(
        subset=['ProteinID', 'GO_Term'], keep='first'
    )
    return unique_pairs.reset_index(drop=True)

def save_output(df, output_path='submission.tsv'):
    """Write *df* as a tab-separated file with no header row and no index.

    Args:
        df: DataFrame to write. Columns are emitted in their current order.
        output_path: Destination file path. Defaults to ``'submission.tsv'``
            relative to the current working directory.
    """
    print("[4/5] Saving final submission ...")
    # ``to_csv`` documents ``header`` as a bool or a list of column aliases.
    # ``header=None`` is the ``read_csv`` convention and only suppressed the
    # header here because None is falsy; pass the documented value instead.
    df.to_csv(output_path, sep='\t', index=False, header=False)
    print("[5/5] Done.")

# Step 1: merge the two prediction files; rows from path_a win on conflicts.
submission = merge_submissions(
    path_a='/kaggle/input/gaf-submission/submission.tsv',
    path_b='/kaggle/input/merge-of-2submission-lb-0-25/submission.tsv',
)

# Step 2: load the annotation pairs, each assigned a score of 1.0.
annotations = load_annotations(
    path='/kaggle/input/protein-go-annotations-taxonomy/protein_go_annotations.csv',
)

# Step 3: keep the highest score for every (ProteinID, GO_Term) pair.
final_submission = combine_data(
    submission_df=submission,
    annotation_df=annotations,
)

# Step 4: write the result to the default 'submission.tsv'.
save_output(df=final_submission)