In [None]:
import numpy as np
import pandas as pd
import gzip
import csv
import io
import argparse

import subprocess
import multiprocessing
import time

import sys
sys.path.append("/home/rashika/CAFA4/InformationAccretion/")
sys.path.append("/home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/")
from parser import *
from ia import *
from make_benchmarks import *


import os
import shutil
import concurrent.futures

In [2]:
"""
    Read the processed annotation file, map each primary ID using the mapping file, and keep only the rows that can be mapped.

    Parameters:
    - processed_file_path: Path to the file containing processed GOA annotations
    - mapping_file: Path to the mapping file 
    - out_path: Path to the output file.
    """
def goa_to_CAFA4ID(processed_file_path, mapping_file, out_path):
    """
    Map processed GOA annotations to CAFA4 IDs and write the rows that can be mapped.

    Both input files are read as tab-separated tables with a header row and
    inner-joined on the "DB Object ID" column, so annotations whose ID has no
    entry in the mapping file are dropped.

    Parameters:
    - processed_file_path: Path to the file containing processed GOA annotations.
    - mapping_file: Path to the mapping file.
    - out_path: Path to the output file. It is written tab-separated, without
      header or index, with the columns CAFA4_ID, GO ID, Aspect (in that order).
      Missing parent directories are created first.

    Raises:
    - KeyError: if "DB Object ID" is missing from either input, or if the joined
      table lacks one of the CAFA4_ID / GO ID / Aspect columns.
    """
    annotations = pd.read_csv(processed_file_path, sep='\t', header=0)
    id_mapping = pd.read_csv(mapping_file, sep='\t', header=0)

    # Inner join the processed annotations and the mapping based on DB Object ID
    joined = pd.merge(annotations, id_mapping, on='DB Object ID', how='inner')

    # Keep the required columns
    selected = joined[["CAFA4_ID", "GO ID", "Aspect"]]

    # Each snapshot writes into its own dated directory; create it when absent
    # instead of failing inside to_csv. Non-path targets (file-like objects)
    # are passed through untouched.
    if isinstance(out_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(out_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Write the mapped file to the out_path (no header row, no index column)
    selected.to_csv(out_path, sep="\t", index=False, header=False)

In [5]:
def get_preprocess_cmd(gaf_path, out_path):
    """
    Build the argument list that launches preprocess_gaf.py on one GAF file.

    Parameters:
    - gaf_path: Path to the input GAF file (positional argument of the script).
    - out_path: Value passed to the script's --out_path option.

    Returns:
    - A new list of arguments:
      python3 preprocess_gaf.py <gaf_path> --highTP --out_path <out_path>
    """
    # Interpreter and script to run
    launcher = ["python3", "preprocess_gaf.py"]
    # Options currently enabled; "--evidence_codes" and "--extract_col_list"
    # are intentionally not passed.
    options = ["--highTP", "--out_path", out_path]
    return launcher + [gaf_path] + options

def run_process(command, log_file, dry_run=True):
    """
    Print a command and, unless dry_run is set, execute it with output logged to a file.

    Parameters:
    - command: List of string arguments (program first), e.g. the list returned
      by get_preprocess_cmd.
    - log_file: Path of the file that receives the command's stdout and stderr.
      Only opened (and truncated) when dry_run is False.
    - dry_run: When True (the default) the command is only printed, which is
      what this function has always done. Pass False to actually run it.

    Returns:
    - None for a dry run, otherwise the exit code of the command.
    """
    printable = " ".join(command)
    print(printable)
    if dry_run:
        return None

    # Run the argument list directly (no shell), so paths containing spaces or
    # shell metacharacters are passed to the program unchanged.
    with open(log_file, "w") as log_handle:
        completed = subprocess.run(command, stdout=log_handle, stderr=subprocess.STDOUT)

    if completed.returncode != 0:
        print(f"Error running command: {printable}. Check {log_file}")
    return completed.returncode


if __name__ == "__main__":
    # Driver for the benchmark build: set up the paths for the t0 (2019) and
    # t1 (2021-2025) GOA snapshots, map preprocessed annotations to CAFA4 IDs,
    # and create the ground-truth lists. In the current state only t0 and the
    # 2021 / 2022 t1 snapshots are mapped, and only 2021 / 2022 ground-truth
    # lists are created; the 2023-2025 calls are commented out.
    # Define commands and log file names
    work_dir = "/data/rashika/CAFA4/"
    
    # Raw GOA GAF dumps, one per t1 snapshot.
    t1_2025 = work_dir + 'uniprot/goa_2025-03-07/goa_uniprot_all.gaf.gz'
    t1_2024 = work_dir + 'uniprot/goa_2024-02-09/goa_uniprot_all.gaf.219.gz'
    t1_2023 = work_dir + 'uniprot/goa_2023-02-02/goa_uniprot_all.gaf.213.gz'
    t1_2022 = work_dir + 'uniprot/goa_2022-09-16/goa_uniprot_all.gaf.211.gz'
    t1_2021 = work_dir + 'uniprot/goa_2021-02-17/goa_uniprot_all.gaf.202.gz'
    
    # Preprocessed annotation tables for each t1 snapshot; these are the inputs
    # of goa_to_CAFA4ID below.
    t1_2025_processed = work_dir + 'extracted_goa/goa_2025-03-07/preprocessed.csv'
    t1_2024_processed = work_dir + 'extracted_goa/goa_2024-02-09/preprocessed.csv'
    t1_2023_processed = work_dir + 'extracted_goa/goa_2023-02-02/preprocessed.csv'
    t1_2022_processed = work_dir + 'extracted_goa/goa_2022-09-16/preprocessed.csv'
    t1_2021_processed = work_dir + 'extracted_goa/goa_2021-02-17/preprocessed.csv'
    
    # Log files for the preprocessing runs (only referenced by the commented-out
    # run_process calls further down).
    t1_2025_log = work_dir + 'log/goa_2025-03-07.txt'
    t1_2024_log = work_dir + 'log/goa_2024-02-09.txt'
    t1_2023_log = work_dir + 'log/goa_2023-02-02.txt'
    t1_2022_log = work_dir + 'log/goa_2022-09-16.txt'
    t1_2021_log = work_dir + 'log/goa_2021-02-17.txt'
    
    
    
    # NOTE(review): the t0 GAF and log use the date 2019-12-18 while the
    # preprocessed/mapped outputs use 2019-12-17 — looks deliberate, confirm.
    t0_gaf_file = work_dir + "uniprot/goa_2019-12-18/goa_uniprot_all.gaf.195.gz" # The latest Uniprot file before t1 ( 2019-12-17)
    t0_processed = work_dir + "extracted_goa/goa_2019-12-17/preprocessed.csv"
    log_t0 =  work_dir + "log/goa_2019-12-18.txt"
    
    
    # Build the preprocessing command for every snapshot. These lists are only
    # consumed by the commented-out run_process calls below, so nothing is
    # executed here.
    cmd_preprocess_t0 = get_preprocess_cmd(t0_gaf_file, t0_processed)
    cmd_preprocess_t1_2025 = get_preprocess_cmd(t1_2025, t1_2025_processed)
    cmd_preprocess_t1_2024 = get_preprocess_cmd(t1_2024, t1_2024_processed)
    cmd_preprocess_t1_2023 = get_preprocess_cmd(t1_2023, t1_2023_processed)
    cmd_preprocess_t1_2022 = get_preprocess_cmd(t1_2022, t1_2022_processed)
    cmd_preprocess_t1_2021 = get_preprocess_cmd(t1_2021, t1_2021_processed)
    
#     run_process(cmd_preprocess_t1_2025, t1_2025_log)
#     run_process(cmd_preprocess_t1_2024, t1_2024_log)
#     run_process(cmd_preprocess_t1_2023, t1_2023_log)
#     run_process(cmd_preprocess_t1_2022, t1_2022_log)
#     run_process(cmd_preprocess_t1_2021, t1_2021_log)
#     run_process(cmd_preprocess_t0, log_t0)
    
    # Mapping table and the per-snapshot output files of goa_to_CAFA4ID.
    mapping_file = "/data/rashika/CAFA4/CAFA4-export/AC2CAFA4ID.map"
    mapped_path_25 = work_dir + "mapped/2025-03-07/t1_2025.csv"
    mapped_path_24 = work_dir + "mapped/2024-02-09/t1_2024.csv"
    mapped_path_23 = work_dir + "mapped/2023-02-02/t1_2023.csv"
    mapped_path_22 = work_dir + "mapped/2022-09-16/t1_2022.csv"
    mapped_path_21 = work_dir + "mapped/2021-02-17/t1_2021.csv"
    mapped_path_19 = work_dir + "mapped/2019-12-17/t0_2019.csv"
    
    
    # Map to CAFA4 IDs (active for 2022, 2021 and t0; the other snapshots are disabled)
    #goa_to_CAFA4ID(t0_processed , mapping_file, mapped_path_19)
    #goa_to_CAFA4ID(t1_2025_processed , mapping_file, mapped_path_25)
    #goa_to_CAFA4ID(t1_2024_processed , mapping_file, mapped_path_24)
    #goa_to_CAFA4ID(t1_2023_processed , mapping_file, mapped_path_23)
    goa_to_CAFA4ID(t1_2022_processed , mapping_file, mapped_path_22)
    goa_to_CAFA4ID(t1_2021_processed , mapping_file, mapped_path_21)
    goa_to_CAFA4ID(t0_processed, mapping_file, mapped_path_19)
    
    
    # Create the benchmarks
    # GO term ID associated with each ontology key (BPO, CCO, MFO); passed to
    # create_bm_lists as `roots`.
    roots = {'BPO': 'GO:0008150', 'CCO': 'GO:0005575', 'MFO': 'GO:0003674'}

    #eval_path = work_dir + "eval/"

    # go-basic.obo release paired with each annotation snapshot.
    t0_ont_file = '/data/rashika/CAFA4/obo/go_2019_12_09/go-basic.obo' # data-version: releases/2020-01-01     
    t1_2025_ont_file = "/data/rashika/CAFA4/obo/go_2025-02-06/go-basic.obo"
    t1_2024_ont_file = "/data/rashika/CAFA4/obo/go_2024-01-17/go-basic.obo"
    t1_2023_ont_file = "/data/rashika/CAFA4/obo/go_2023-01-01/go-basic.obo"
    t1_2022_ont_file = "/data/rashika/CAFA4/obo/go_2022-07-01/go-basic.obo"
    t1_2021_ont_file = "/data/rashika/CAFA4/obo/go_2021-02-01/go-basic.obo"
    
    
    # Read every OBO file and pass it through clean_ontology_edges.
    # NOTE(review): neither obonet nor clean_ontology_edges is imported by name
    # in the visible import cell — presumably both arrive via the star imports
    # (parser / ia / make_benchmarks); confirm.
    # NOTE(review): the 2023-2025 graphs are loaded although the create_bm_lists
    # calls that would use them are commented out.
    t0_ont_graph = clean_ontology_edges(obonet.read_obo(t0_ont_file))
    t1_2025_ont_graph = clean_ontology_edges(obonet.read_obo(t1_2025_ont_file)) 
    t1_2024_ont_graph = clean_ontology_edges(obonet.read_obo(t1_2024_ont_file)) 
    t1_2023_ont_graph = clean_ontology_edges(obonet.read_obo(t1_2023_ont_file)) 
    t1_2022_ont_graph = clean_ontology_edges(obonet.read_obo(t1_2022_ont_file)) 
    t1_2021_ont_graph = clean_ontology_edges(obonet.read_obo(t1_2021_ont_file))
    
    # Older ontology release (go_2019_10_07), passed to create_bm_lists as the
    # "t minus 1" graph.
    t_minus_ont_file =  "/data/rashika/CAFA4/obo/go_2019_10_07/go-basic.obo"
    t_minus_1_ont_graph = clean_ontology_edges(obonet.read_obo(t_minus_ont_file))

    # Create BM lists
    # Output directories for the ground-truth lists of each t1 snapshot.
    GT_2025_path = work_dir + "ground_truth_lists/2025-03-07/C4/"
    GT_2024_path = work_dir + "ground_truth_lists/2024-02-09/C4/"
    GT_2023_path = work_dir + "ground_truth_lists/2023-02-02/C4/"
    GT_2022_path = work_dir + "ground_truth_lists/2022-09-16/C4/"
    GT_2021_path = work_dir + "ground_truth_lists/2021-02-17/C4/"
    
    # NOTE(review): the t0 annotations are read from the hard-coded
    # mapped/t0.csv, not from mapped_path_19 written by goa_to_CAFA4ID above —
    # confirm this is the intended file.
    # Each GT_*_path is passed twice; what the two positional arguments mean is
    # defined by create_bm_lists (presumably from make_benchmarks) — TODO confirm.
    # The recorded cell output lists type1.txt, type2.txt, type3.txt and
    # type12.txt under each of the two active GT directories.
    #create_bm_lists('/data/rashika/CAFA4/mapped/t0.csv', mapped_path_25, t0_ont_graph, t1_2025_ont_graph, t_minus_1_ont_graph, roots, GT_2025_path, GT_2025_path, remove_protein_binding = True)
    #create_bm_lists('/data/rashika/CAFA4/mapped/t0.csv', mapped_path_24, t0_ont_graph, t1_2024_ont_graph, t_minus_1_ont_graph, roots, GT_2024_path, GT_2024_path, remove_protein_binding = True)
    #create_bm_lists('/data/rashika/CAFA4/mapped/t0.csv', mapped_path_23, t0_ont_graph, t1_2023_ont_graph, t_minus_1_ont_graph, roots, GT_2023_path, GT_2023_path, remove_protein_binding = True)
    create_bm_lists('/data/rashika/CAFA4/mapped/t0.csv', mapped_path_22, t0_ont_graph, t1_2022_ont_graph, t_minus_1_ont_graph, roots, GT_2022_path, GT_2022_path, remove_protein_binding = True)
    create_bm_lists('/data/rashika/CAFA4/mapped/t0.csv', mapped_path_21, t0_ont_graph, t1_2021_ont_graph, t_minus_1_ont_graph, roots, GT_2021_path, GT_2021_path, remove_protein_binding = True)

    

/data/rashika/CAFA4/ground_truth_lists/2022-09-16/C4/type1.txt
/data/rashika/CAFA4/ground_truth_lists/2022-09-16/C4/type2.txt
/data/rashika/CAFA4/ground_truth_lists/2022-09-16/C4/type3.txt
/data/rashika/CAFA4/ground_truth_lists/2022-09-16/C4/type12.txt
/data/rashika/CAFA4/ground_truth_lists/2021-02-17/C4/type1.txt
/data/rashika/CAFA4/ground_truth_lists/2021-02-17/C4/type2.txt
/data/rashika/CAFA4/ground_truth_lists/2021-02-17/C4/type3.txt
/data/rashika/CAFA4/ground_truth_lists/2021-02-17/C4/type12.txt


In [None]:
#python3 /home/rashika/CAFA4/stealth_eval/code/preprocess_gaf.py /data/rashika/CAFA4/uniprot/goa_2024-02-09/goa_uniprot_all.gaf.219.gz --highTP --out_path /data/rashika/CAFA4/extracted_goa/goa_2024-02-09/preprocessed.csv > /data/rashika/CAFA4/log/goa_2024-02-09.txt 2>&1 &

#python3 /home/rashika/CAFA4/stealth_eval/code/preprocess_gaf.py /data/rashika/CAFA4/uniprot/goa_2023-02-02/goa_uniprot_all.gaf.213.gz --highTP --out_path /data/rashika/CAFA4/extracted_goa/goa_2023-02-02/preprocessed.csv > /data/rashika/CAFA4/log/goa_2023-02-02.txt 2>&1 
#python3 /home/rashika/CAFA4/stealth_eval/code/preprocess_gaf.py /data/rashika/CAFA4/uniprot/goa_2022-09-16/goa_uniprot_all.gaf.211.gz --highTP --out_path /data/rashika/CAFA4/extracted_goa/goa_2022-09-16/preprocessed.csv > /data/rashika/CAFA4/log/goa_2022-09-16.txt 2>&1 
#python3 /home/rashika/CAFA4/stealth_eval/code/preprocess_gaf.py /data/rashika/CAFA4/uniprot/goa_2021-02-17/goa_uniprot_all.gaf.202.gz --highTP --out_path /data/rashika/CAFA4/extracted_goa/goa_2021-02-17/preprocessed.csv > /data/rashika/CAFA4/log/goa_2021-02-17.txt 2>&1 
#python3 /home/rashika/CAFA4/stealth_eval/code/preprocess_gaf.py /data/rashika/CAFA4/uniprot/goa_2019-12-18/goa_uniprot_all.gaf.195.gz --highTP --out_path /data/rashika/CAFA4/extracted_goa/goa_2019-12-17/preprocessed.csv > /data/rashika/CAFA4/log/goa_2019-12-18.txt 2>&1 


In [None]:
# Inspection cell: show the output path configured for the 2025-03-07 mapped
# annotations. Requires the driver cell above to have been run first.
mapped_path_25

In [None]:
# Inspection cell: show the path of the preprocessed 2025-03-07 GOA table.
# Requires the driver cell above to have been run first.
t1_2025_processed