In [11]:
import numpy as np
import pandas as pd
import gzip
import csv
import io
import argparse

import subprocess
import multiprocessing
import time

import sys
sys.path.append("/home/rashika/CAFA4/InformationAccretion/")
sys.path.append("/home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/")
from parser import *
from ia import *
from make_benchmarks import *

In [12]:
ls /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/

[0m[01;34mdata[0m/          graph.py     __main__.py  [01;34m__pycache__[0m/
evaluation.py  __init__.py  parser.py


In [13]:
# Preprocess
#preprocess_cmd = "python3 run_preprocess_parallel.py"
#result = subprocess.run(preprocess_cmd, shell=True)

In [14]:
# To do:
# Modify preprocess_cmd to take all the input/output/log paths as inputs
# Incorporate in the code of main() block

# To do 

Do the system commands more elegantly, parallelise the evaluation, incorporate the bootstrapping

In [15]:
"""
    Read a the processed annotation file, map the primary ID to a mapping file, and keep the rows that can be mapped.

    Parameters:
    - processed_file_path: Path to the file containing processed GOA annotations
    - mapping_file: Path to the mapping file 
    - out_path: Path to the output file.
    """
def goa_to_CAFA4ID(processed_file_path, mapping_file, out_path):
    ann = pd.read_csv(processed_file_path, sep = '\t', header = 0)
    mapping = pd.read_csv(mapping_file, sep = '\t', header = 0)

    # Inner join the processed annotations and the mapping based on DB Object ID
    mapped =  pd.merge(ann, mapping, on='DB Object ID', how='inner')
    
    
    # Keep the required columns
    mapped = mapped[["CAFA4_ID", "GO ID", "Aspect"]]
    
    # Write the mapped file to the out_path
    mapped.to_csv(out_path, sep = "\t", index=False, header=None)

In [17]:
def get_preprocess_cmd(gaf_path, out_path):
    cmd = [
    "python3",                 # Command to execute Python 3
    "preprocess_gaf.py",       # Script to run
    gaf_path,  # Path to input file
    "--highTP",
    "--out_path", out_path,        # Output path parameter
    #"--evidence_codes", "EXP", "IDA",   # Evidence codes parameter
    #"--extract_col_list", "DB Object ID", "Qualifier"  # Extract column list parameter
]
    return cmd

def run_process(command, log_file):
    with open(log_file, "w") as f:
        print(command)
        result = subprocess.run(" ".join(command), shell=True, stdout=f, stderr=subprocess.STDOUT)


if __name__ == "__main__":
    # Define commands and log file names
    work_dir = "/data/rashika/CAFA4/"
    
    t0_gaf_file = work_dir + "uniprot/raw_goa/t0/goa_uniprot_all.gaf.195.gz" # The latest Uniprot file before t1 ( 2019-12-17)
    t0_processed = work_dir + "extracted_goa/t0_preprocessed.csv"
    log_t0 =  work_dir + "log/log_preprocess_t0.txt"
    
    #t1_gaf_file = work_dir + "uniprot/raw_goa/t1/goa_uniprot_all.gaf.gz" # The file from UniProt (2024-02-09)
    t1_gaf_file = work_dir + "uniprot/uniprot_2024_2024-04-16/goa_uniprot_all.gaf.gz" # The file from UniProt 2024-04-16
    t1_processed = work_dir + "extracted_goa/t1_preprocessed.csv"
    log_t1 = work_dir + "log/log_preprocess_t1.txt"
    
    
    cmd_preprocess_t0 = get_preprocess_cmd(t0_gaf_file, t0_processed)
    cmd_preprocess_t1 = get_preprocess_cmd(t1_gaf_file, t1_processed)
    
    # Create processes for each command
    process1 = multiprocessing.Process(target=run_process, args=(cmd_preprocess_t0, log_t0))
    process2 = multiprocessing.Process(target=run_process, args=(cmd_preprocess_t1, log_t1))
    
    # Start the processes
    #process1.start()
    #process2.start()

    # Wait for both processes to finish
    #process1.join()
    #process2.join()

    #print("Both processes have finished.")
    
    # Map the IDs of the processed 
    
    work_dir = "/data/rashika/CAFA4/"
    mapping_file = "/data/rashika/CAFA4/CAFA4-export/AC2CAFA4ID.map"
    t0_mapped_path = work_dir + "mapped/t0.csv"
    t1_mapped_path = work_dir + "/mapped/t1.csv"
    
    # Map to CAFA4 IDs 
    #goa_to_CAFA4ID(t0_processed , mapping_file, t0_mapped_path)
    #goa_to_CAFA4ID(t1_processed , mapping_file, t1_mapped_path)
    
    # Create the benchmarks
    roots = {'BPO': 'GO:0008150', 'CCO': 'GO:0005575', 'MFO': 'GO:0003674'}

    #eval_path = work_dir + "eval/"

    t0_ont_file = '/data/rashika/CAFA4/uniprot/go_2019_12_09/go-basic.obo' # data-version: releases/2020-01-01
    t0_ont_graph = clean_ontology_edges(obonet.read_obo(t0_ont_file))     
    
    t1_ont_file = "/data/rashika/CAFA4/uniprot/go_2024_03_28/go-basic.obo"
    t1_ont_graph = clean_ontology_edges(obonet.read_obo(t1_ont_file)) # data-version: releases/2024-01-17
    
    t_minus_ont_file =  "/data/rashika/CAFA4/uniprot/go_2019_10_07/go-basic.obo"
    t_minus_1_ont_graph = clean_ontology_edges(obonet.read_obo(t_minus_ont_file))

    # Create BM lists
    eval_path = '/data/rashika/CAFA4/eval_final/'
    BM_GO_path = eval_path + "BM_GO/"
    common_path = '/data/rashika/CAFA4/common/'
    #create_bm_lists(t0_mapped_path, t1_mapped_path, t0_ont_graph, t1_ont_graph, t_minus_1_ont_graph, roots, BM_GO_path, common_path)


    # Calculate IA
    IA_file =  eval_path + "IA.txt"
    #print(IA_file)
    #cmd = 'python3 /home/rashika/CAFA4/InformationAccretion/ia.py --annot ' + t0_processed + ' --graph '+ t_minus_ont_file + ' --outfile ' + IA_file + ' --prop' 
    #os.system(cmd)
    
    pred_dir = "/data/yisupeng/sharing/cafa4/all_models/"
    
    result_path = eval_path + "eval_results/"
    #run_eval(BM_GO_path, pred_dir, t_minus_ont_file, IA_file, result_path, log_path = '/home/rashika/CAFA4/eval/log/', thresh_step = 0.001)
    #cmd = 'python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py /data/rashika/CAFA4/uniprot/go_2019_10_07/go-basic.obo /data/yisupeng/sharing/cafa4/all_models/ /data/rashika/CAFA4/eval_final/BM_GO/bpo_all_type3.txt -out_dir /data/rashika/CAFA4/eval_final/eval_results/bpo_all_type3/ -ia /data/rashika/CAFA4/eval_final/IA.txt -prop max -th_step 0.01 -no_orphans > /home/rashika/CAFA4/eval/log/bpo_all_type3/run.log &'
    os.system(cmd)
    # Paths
    plots_path =  eval_path + 'plots_ALL/'
    if not os.path.exists(plots_path):
        os.mkdir(plots_path)
    plots_path_f_w = plots_path+'f_w/'
    plots_path_f = plots_path+'f/'
    plots_path_f_micro = plots_path+'f_micro/'
    plots_path_f_micro_w = plots_path+'f_micro_w/'
    plots_path_s_w = plots_path+'s_w/'
    register = '/data/rashika/CAFA4/file_map.tsv'


#     metric, cols = ('f_w', ['rc_w', 'pr_w'])
#     create_plots(result_path, metric, cols, out_path= plots_path_f_w, n_curves = 10, names_file = register)

#     metric, cols = ('f', ['rc', 'pr'])
#     create_plots(result_path, metric, cols, out_path = plots_path_f, n_curves = 10, names_file =register)

#     metric, cols =  ('f_micro_w', ['rc_micro_w', 'pr_micro_w'])
#     create_plots(result_path, metric, cols, out_path = plots_path_f_micro_w, n_curves = 10, names_file =register)

#     metric, cols =  ('f_micro', ['rc_micro', 'pr_micro'])
#     create_plots(result_path, metric, cols, out_path = plots_path_f_micro, n_curves = 10, names_file =register)

#     metric, cols = ('s_w', ['ru_w', 'mi_w'])
#     create_plots(result_path, metric, cols, out_path = plots_path_s_w, n_curves = 10, names_file =register)

2024-06-27 23:37:39,113 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, cellular_component, proteins 1076, annotations 97890, replaced alt. ids 2042
2024-06-27 23:37:54,785 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, cellular_component, proteins 2308, annotations 153266, replaced alt. ids 0
2024-06-27 23:37:57,313 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, evaluated
2024-06-27 23:37:59,212 [INFO ] Ontology: biological_process, total 29456, roots 1, leaves 13559, alternative_ids 1474
2024-06-27 23:38:00,083 [INFO ] Ontology: molecular_function, total 11092, roots 1, leaves 9061, alternative_ids 897
2024-06-27 23:38:00,203 [INFO ] Ontology: cellular_component, total 4182, roots 1, leaves 2775, alternative_ids 181
2024-06-27 23:38:00,510 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, evaluated
2024-06-27 23:38:16,747 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M118, evaluated
2024-06-27 23:38

2024-06-27 23:45:51,045 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, biological_process, proteins 1455, annotations 807123, replaced alt. ids 0
2024-06-27 23:45:51,447 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, molecular_function, proteins 2682, annotations 291202, replaced alt. ids 0
2024-06-27 23:46:33,044 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M118, evaluated
2024-06-27 23:46:35,823 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M065, cellular_component, proteins 1142, annotations 796673, replaced alt. ids 0
2024-06-27 23:46:40,863 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M065, evaluated
2024-06-27 23:46:43,888 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M077, cellular_component, proteins 758, annotations 8495, replaced alt. ids 0
2024-06-27 23:46:46,724 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M065, cellular_component, proteins 1166, annotations 810947

2024-06-27 23:52:57,277 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M005, evaluated
2024-06-27 23:53:06,061 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M086, cellular_component, proteins 1070, annotations 23106, replaced alt. ids 0
2024-06-27 23:53:11,142 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M132, evaluated
2024-06-27 23:53:12,033 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M005, evaluated
2024-06-27 23:53:20,403 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M086, cellular_component, proteins 974, annotations 20855, replaced alt. ids 0
2024-06-27 23:53:34,885 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M005, cellular_component, proteins 1159, annotations 159472, replaced alt. ids 0
2024-06-27 23:53:50,461 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M118, evaluated
2024-06-27 23:54:04,326 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, evaluated
2024-06-

2024-06-27 23:59:57,632 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-27 23:59:57,643 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M083, not evaluated
2024-06-28 00:00:23,104 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M045, evaluated
2024-06-28 00:00:24,353 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, evaluated
2024-06-28 00:00:31,838 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, evaluated
2024-06-28 00:00:35,203 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M120, cellular_component, proteins 1142, annotations 509731, replaced alt. ids 0
2024-06-28 00:00:48,090 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M120, cellular_component, proteins 1166, annotations 520511, replaced alt. ids 0
2024-06-28 00:01:00,902 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, molecular_function, proteins 4414, annotations 361899, replaced alt. ids 16687
2024-

2024-06-28 00:08:18,217 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M138, evaluated
2024-06-28 00:08:44,695 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M052, evaluated
2024-06-28 00:08:45,316 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M132, cellular_component, proteins 27, annotations 270, replaced alt. ids 0
2024-06-28 00:09:08,155 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M132, evaluated
2024-06-28 00:09:30,450 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M005, cellular_component, proteins 2275, annotations 312821, replaced alt. ids 0
2024-06-28 00:09:36,142 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M012, evaluated
2024-06-28 00:09:36,194 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M032, cellular_component, proteins 1142, annotations 47263, replaced alt. ids 0
2024-06-28 00:09:42,884 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, evaluated
2024-06-28 

2024-06-28 00:16:14,665 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M108, evaluated
2024-06-28 00:16:19,873 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M064, cellular_component, proteins 795, annotations 16813, replaced alt. ids 0
2024-06-28 00:16:33,038 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M012, molecular_function, proteins 2399, annotations 57075, replaced alt. ids 0
2024-06-28 00:16:43,372 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, evaluated
2024-06-28 00:17:11,212 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, evaluated
2024-06-28 00:17:11,977 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M108, evaluated
2024-06-28 00:17:13,221 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, evaluated
2024-06-28 00:17:15,772 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, molecular_function, proteins 4697, annotations 526231, replaced alt. ids 0
2024-06-

2024-06-28 00:23:40,080 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M022, evaluated
2024-06-28 00:23:53,532 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 00:23:53,562 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M083, not evaluated
2024-06-28 00:23:56,176 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M092, evaluated
2024-06-28 00:23:57,890 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M067, cellular_component, proteins 1149, annotations 127865, replaced alt. ids 0
2024-06-28 00:24:00,034 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M067, cellular_component, proteins 1087, annotations 130429, replaced alt. ids 0
2024-06-28 00:24:09,454 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M012, evaluated
2024-06-28 00:24:15,449 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, cellular_component, proteins 14739, annotations 1287576, replaced alt. ids 28512
202

2024-06-28 00:29:49,836 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M020, cellular_component, proteins 1166, annotations 57997, replaced alt. ids 0
2024-06-28 00:29:51,395 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M020, evaluated
2024-06-28 00:29:52,915 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 00:29:52,936 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M142, not evaluated
2024-06-28 00:29:53,346 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, biological_process, proteins 2771, annotations 1702642, replaced alt. ids 0
2024-06-28 00:30:00,518 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M056, biological_process, proteins 1455, annotations 3999249, replaced alt. ids 0
2024-06-28 00:30:02,385 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M122, cellular_component, proteins 367, annotations 23067, replaced alt. ids 0
2024-06-28 00:30:21,595 [INFO ] Prediction: /

2024-06-28 00:36:18,874 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M080, evaluated
2024-06-28 00:36:26,865 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M049, cellular_component, proteins 1166, annotations 72830, replaced alt. ids 0
2024-06-28 00:36:35,739 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M080, cellular_component, proteins 1166, annotations 105477, replaced alt. ids 3920
2024-06-28 00:36:45,397 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M080, evaluated
2024-06-28 00:36:47,432 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M031, evaluated
2024-06-28 00:36:49,662 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, cellular_component, proteins 15882, annotations 1064202, replaced alt. ids 0
2024-06-28 00:36:49,806 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M052, molecular_function, proteins 2016, annotations 60096, replaced alt. ids 0
2024-06-28 00:36:52,266 [INFO ] Predi

2024-06-28 00:46:26,274 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M065, evaluated
2024-06-28 00:46:32,242 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M077, molecular_function, proteins 1676, annotations 19163, replaced alt. ids 0
2024-06-28 00:46:49,276 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M044, cellular_component, proteins 1159, annotations 200808, replaced alt. ids 0
2024-06-28 00:46:53,806 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M033, evaluated
2024-06-28 00:46:54,608 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M137, cellular_component, proteins 1068, annotations 14560, replaced alt. ids 0
2024-06-28 00:46:54,674 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M137, evaluated
2024-06-28 00:46:56,098 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M102, cellular_component, proteins 9, annotations 619, replaced alt. ids 0
2024-06-28 00:47:11,901 [INFO ] Prediction: /dat

2024-06-28 00:52:59,541 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M088, cellular_component, proteins 1142, annotations 89102, replaced alt. ids 0
2024-06-28 00:53:15,923 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M088, cellular_component, proteins 1166, annotations 85321, replaced alt. ids 0
2024-06-28 00:53:16,390 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M110, evaluated
2024-06-28 00:53:16,537 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M031, biological_process, proteins 804, annotations 22913, replaced alt. ids 0
2024-06-28 00:53:22,288 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M012, evaluated
2024-06-28 00:53:25,650 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M088, cellular_component, proteins 1142, annotations 89102, replaced alt. ids 0
2024-06-28 00:53:28,856 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M044, evaluated
2024-06-28 00:53:30,153 [INFO ] Prediction: /

2024-06-28 01:00:57,239 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M061, evaluated
2024-06-28 01:00:57,552 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 01:00:57,596 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M124, not evaluated
2024-06-28 01:01:05,432 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M057, cellular_component, proteins 1166, annotations 84385, replaced alt. ids 0
2024-06-28 01:01:18,413 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M061, evaluated
2024-06-28 01:01:18,650 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 01:01:18,686 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M124, not evaluated
2024-06-28 01:01:20,649 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M091, evaluated
2024-06-28 01:01:22,453 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M061, cellular_component, proteins 1134, annotations 19219, 

2024-06-28 01:06:44,254 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M062, evaluated
2024-06-28 01:06:45,338 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M133, evaluated
2024-06-28 01:06:46,864 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M062, cellular_component, proteins 1134, annotations 19219, replaced alt. ids 0
2024-06-28 01:06:52,473 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M072, cellular_component, proteins 1142, annotations 21022, replaced alt. ids 0
2024-06-28 01:07:17,978 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M056, molecular_function, proteins 4698, annotations 2475415, replaced alt. ids 0
2024-06-28 01:07:19,955 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M072, evaluated
2024-06-28 01:07:32,012 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M072, evaluated
2024-06-28 01:07:35,864 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M019, cellular_compone

2024-06-28 01:13:20,095 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M052, biological_process, proteins 1316, annotations 57608, replaced alt. ids 0
2024-06-28 01:13:35,491 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M012, cellular_component, proteins 15884, annotations 725645, replaced alt. ids 0
2024-06-28 01:13:35,518 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M095, molecular_function, proteins 1961, annotations 181873, replaced alt. ids 0
2024-06-28 01:13:52,516 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, molecular_function, proteins 11843, annotations 784543, replaced alt. ids 37210
2024-06-28 01:14:06,738 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M038, cellular_component, proteins 1166, annotations 46097, replaced alt. ids 0
2024-06-28 01:14:12,759 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M045, molecular_function, proteins 2014, annotations 276334, replaced alt. ids 0
2024-0

2024-06-28 01:21:05,399 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M052, evaluated
2024-06-28 01:21:05,938 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M107, evaluated
2024-06-28 01:21:12,002 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M129, cellular_component, proteins 784, annotations 17960, replaced alt. ids 0
2024-06-28 01:21:13,410 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M082, evaluated
2024-06-28 01:21:14,513 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M132, biological_process, proteins 4, annotations 146, replaced alt. ids 0
2024-06-28 01:21:20,590 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M129, cellular_component, proteins 784, annotations 17960, replaced alt. ids 0
2024-06-28 01:21:22,191 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M020, cellular_component, proteins 2308, annotations 116980, replaced alt. ids 0
2024-06-28 01:21:27,969 [INFO ] Prediction: /data/

2024-06-28 01:28:48,764 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M086, biological_process, proteins 1230, annotations 91823, replaced alt. ids 0
2024-06-28 01:28:53,880 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M023, evaluated
2024-06-28 01:28:59,949 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M005, evaluated
2024-06-28 01:29:04,094 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M045, evaluated
2024-06-28 01:29:07,215 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M023, evaluated
2024-06-28 01:29:15,404 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M081, cellular_component, proteins 2308, annotations 219274, replaced alt. ids 7543
2024-06-28 01:29:22,852 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M120, evaluated
2024-06-28 01:29:28,938 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M086, biological_process, proteins 1230, annotations 91823, replaced alt. ids 0
2024

2024-06-28 01:35:04,497 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M015, cellular_component, proteins 1119, annotations 39230, replaced alt. ids 0
2024-06-28 01:35:19,069 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M015, evaluated
2024-06-28 01:35:20,205 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M058, cellular_component, proteins 917, annotations 12436, replaced alt. ids 0
2024-06-28 01:35:20,668 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, evaluated
2024-06-28 01:35:25,066 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M080, cellular_component, proteins 2308, annotations 209845, replaced alt. ids 7732
2024-06-28 01:35:33,821 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M080, evaluated
2024-06-28 01:35:40,901 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M049, cellular_component, proteins 2308, annotations 149576, replaced alt. ids 0
2024-06-28 01:35:44,819 [INFO ] Predicti

2024-06-28 01:43:10,102 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M035, evaluated
2024-06-28 01:43:20,303 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M037, evaluated
2024-06-28 01:43:28,012 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M046, evaluated
2024-06-28 01:43:55,157 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M037, cellular_component, proteins 1166, annotations 46097, replaced alt. ids 0
2024-06-28 01:43:58,519 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M018, molecular_function, proteins 12639, annotations 1373288, replaced alt. ids 0
2024-06-28 01:44:00,970 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 01:44:00,985 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M084, not evaluated
2024-06-28 01:44:08,673 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M040, evaluated
2024-06-28 01:44:11,357 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/

2024-06-28 01:50:48,940 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M004, cellular_component, proteins 1142, annotations 131881, replaced alt. ids 0
2024-06-28 01:50:54,631 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M004, cellular_component, proteins 1142, annotations 131881, replaced alt. ids 0
2024-06-28 01:50:54,637 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M051, evaluated
2024-06-28 01:50:55,349 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M137, evaluated
2024-06-28 01:50:56,880 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M102, cellular_component, proteins 33, annotations 1920, replaced alt. ids 0
2024-06-28 01:51:03,786 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M138, evaluated
2024-06-28 01:51:22,470 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M033, evaluated
2024-06-28 01:51:23,704 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M137, cellular_component,

2024-06-28 01:57:31,615 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 01:57:31,617 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M025, not evaluated
2024-06-28 01:57:32,826 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M042, cellular_component, proteins 985, annotations 14834, replaced alt. ids 0
2024-06-28 01:57:41,061 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M102, evaluated
2024-06-28 01:57:42,393 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M045, evaluated
2024-06-28 01:57:42,941 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M042, evaluated
2024-06-28 01:57:44,340 [WARNI] Empty prediction! Check format or overlap with ground truth
2024-06-28 01:57:44,441 [WARNI] Prediction: /data/yisupeng/sharing/cafa4/all_models/M083, not evaluated
2024-06-28 01:57:49,205 [INFO ] Prediction: /data/yisupeng/sharing/cafa4/all_models/M042, evaluated
2024-06-28 01:58:01,379 [INFO ] Prediction: /

## Test the figures

In [None]:
[dir_list = os.listdir(results_path)
    
    [cumulate = True
    add_extreme_points = True
    coverage_threshold = 0.3
    axis_title_dict = {'pr': 'Precision', 'rc': 'Recall', 'f': 'F-score', 'pr_w': 'Weighted Precision', 'rc_w': 'Weighted Recall', 'f_w': 'Weighted F-score', 'mi': 'Misinformation (Unweighted)', 'ru': 'Remaining Uncertainty (Unweighted)', 'mi_w': 'Misinformation', 'ru_w': 'Remaining Uncertainty', 's': 'S-score', 'pr_micro': 'Precision (Micro)', 'rc_micro': 'Recall (Micro)', 'f_micro': 'F-score (Micro)', 'pr_micro_w': 'Weighted Precision (Micro)', 'rc_micro_w': 'Weighted Recall (Micro)', 'f_micro_w': 'Weighted F-score (Micro)'}
    ontology_dict = {'biological_process': 'BPO', 'molecular_function': 'MFO', 'cellular_component': 'CCO'}
    
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    
    dir_list.remove('bpo_all_type3')
    for file in dir_list:
        df_file = results_path + file +"/evaluation_all.tsv"
        df = pd.read_csv(df_file, sep="\t")
        out_folder = out_path + file
        if not os.path.exists(out_folder):
            os.mkdir(out_folder)
            
        
        df = pd.read_csv(df_file, sep="\t")
        
        # Set method information (optional)
        if names_file is None:
            df['group'] = df['filename']
            df['label'] = df['filename']
            df['is_baseline'] = False
        else:
            methods = pd.read_csv(names_file, sep = "\t", header=0)
            df = pd.merge(df, methods, on='filename', how='left')
            df['group'].fillna(df['filename'], inplace=True)
            df['label'].fillna(df['filename'], inplace=True)
            if 'is_baseline' not in df:
                df['is_baseline'] = False
            else:
                df['is_baseline'].fillna(False, inplace=True)
            # print(methods)
        #df = df.drop(columns='filename').set_index(['group', 'label', 'ns', 'tau'])
        df = df.set_index(['group_unique', 'label', 'ns', 'filename','tau'])
        
        # Filter by coverage
        df = df[df['cov'] >= coverage_threshold]
        
        # Assign colors based on group
        cmap = plt.get_cmap('tab20')
        df['colors'] = df.index.get_level_values('group_unique')
        df['colors'] = pd.factorize(df['colors'])[0]
        df['colors'] = df['colors'].apply(lambda x: cmap.colors[x % len(cmap.colors)])
        
        index_best = df.groupby(level=['group_unique', 'ns'])[metric].idxmax() if metric in ['f', 'f_w', 'f_micro', 'f_micro_w'] else df.groupby(['group_unique', 'ns'])[metric].idxmin()
        
        # Filter the dataframe for the best methods
        df_methods = df.reset_index('tau').loc[[ele[:-1] for ele in index_best], ['tau', 'cov', 'colors'] + cols + [metric]].sort_index()

        # Makes the curves monotonic. Cumulative max on the last column of the cols variable, e.g. "pr" --> precision
        if cumulate:
            if metric in ['f', 'f_w', 'f_micro', 'f_micro_w']:
                df_methods[cols[-1]] = df_methods.groupby(level=['label', 'ns'])[cols[-1]].cummax()
            else:
                df_methods[cols[-1]] = df_methods.groupby(level=['label', 'ns'])[cols[-1]].cummin()


        # Save to file
        df_methods.drop(columns=['colors']).to_csv('{}/fig_{}.tsv'.format(out_folder, metric), float_format="%.3f", sep="\t")
        
        # Add first last points to precision and recall curves to improve APS calculation
        def add_points(df_):
            df_ = pd.concat([df_.iloc[0:1], df_])
            df_.iloc[0, df_.columns.get_indexer(['tau', cols[0], cols[1]])] = [0, 1, 0]  # tau, rc, pr
            df_ = pd.concat([df_, df_.iloc[-1:]])
            df_.iloc[-1, df_.columns.get_indexer(['tau', cols[0], cols[1]])] = [1.1, 0, 1]
            return df_

        if metric.startswith('f') and add_extreme_points:
            df_methods = df_methods.reset_index().groupby(['group_unique', 'label', 'ns'], as_index=False).apply(add_points).set_index(['group_unique', 'label', 'ns'])
        
        # Filter the dataframe for the best method and threshold
        df_best = df.loc[index_best, ['cov', 'colors'] + cols + [metric]]
        
        # Calculate average precision score 
        if metric.startswith('f'):
            df_best['aps'] = df_methods.groupby(level=['group_unique', 'label', 'ns'])[[cols[0], cols[1]]].apply(lambda x: (x[cols[0]].diff(-1).shift(1) * x[cols[1]]).sum())

        # Calculate the max coverage across all thresholds
        df_best['max_cov'] = df_methods.groupby(level=['group_unique', 'label', 'ns'])['cov'].max()
        
        # Set a label column for the plot legend
        df_best['label'] = df_best.index.get_level_values('label')
        if 'aps' not in df_best.columns:
            df_best['label'] = df_best.agg(lambda x: f"{x['label']} ({metric.upper()}={x[metric]:.3f} C={x['max_cov']:.3f})", axis=1)
        else:
            df_best['label'] = df_best.agg(lambda x: f"{x['label']} ({metric.upper()}={x[metric]:.3f} APS={x['aps']:.3f} C={x['max_cov']:.3f})", axis=1)
        
        # Generate the figures
        plt.rcParams.update({'font.size': 22, 'legend.fontsize': 18})

        # F-score contour lines
        x = np.arange(0.01, 1, 0.01)
        y = np.arange(0.01, 1, 0.01)
        X, Y = np.meshgrid(x, y)
        Z = 2 * X * Y / (X + Y)

        
        for ns, df_g in df_best.groupby(level='ns'):
            fig, ax = plt.subplots(figsize=(15, 15))

             # Contour lines. At the moment they are provided only for the F-score
            if metric.startswith('f'):
                CS = ax.contour(X, Y, Z, np.arange(0.1, 1.0, 0.1), colors='gray')
                ax.clabel(CS, inline=True) #, fontsize=10)

            cnt = 0
            # Iterate methods
            for i, (index, row) in enumerate(df_g.sort_values(by=[metric, 'max_cov'], ascending=[False if metric.startswith('f') else True, False]).iterrows()):
                
                #data = df_methods.loc[index[:-1]]
                data = df_methods.loc[index[:-2]]
                print(row[cols[0]], row[cols[1]])

                # Precision-recall or mi-ru curves
                ax.plot(data[cols[0]], data[cols[1]], color=row['colors'], label=row['label'], lw=2, zorder=500-i)

                # F-max or S-min dots
                ax.plot(row[cols[0]], row[cols[1]], color=row['colors'], marker='o', markersize=12, mfc='none', zorder=1000-i)
                ax.plot(row[cols[0]], row[cols[1]], color=row['colors'], marker='o', markersize=6, zorder=1000-i)

                cnt+=1
                if n_curves and cnt >= n_curves:
                    break
                
            # Set axes limit
            if metric.startswith('f'):
                plt.xlim(0, 1)
                plt.ylim(0, 1)

            # plt.xlim(0, max(1, df_best.loc[:,:,ns,:][cols[0]].max()))
            # plt.ylim(0, max(1, df_best.loc[:,:,ns,:][cols[1]].max()))

            # Set titles
            ax.set_title(file)
            ax.set_xlabel(axis_title_dict[cols[0]], labelpad=20)
            ax.set_ylabel(axis_title_dict[cols[1]], labelpad=20)

            # Legend
            # ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            leg = ax.legend(markerscale=6, title=file)
            for legobj in leg.get_lines():
                legobj.set_linewidth(10.0)
                
            leg.set_bbox_to_anchor((1.05, 1))  

            # Save figure on disk
            plt.savefig("{}/fig_{}_{}.png".format(out_folder, metric, ns), bbox_inches='tight', dpi=300, transparent=True)
            # plt.clf()



]]

## Test Clara's function against Damiano's parser

In [None]:
ont_file = '/data/rashika/CAFA4/uniprot/go_2024_03_28/go-basic.obo' 
data = '/data/rashika/CAFA4/extracted_goa/t1_preprocessed.csv'

In [None]:
# Clara's

In [None]:
ont_graph = clean_ontology_edges(obonet.read_obo(ont_file))
subontologies = {aspect: fetch_aspect(ont_graph, roots[aspect]) for aspect in roots}

ann = pd.read_csv(data, sep="\t")
ann.columns = ['EntryID', 'term', 'aspect']
aspect_mapping = {
    'C': 'CCO',
    'F': 'MFO',
    'P': 'BPO'}
    
ann['aspect'] = ann['aspect'].map(aspect_mapping)
ann_prop = propagate_terms(ann, subontologies)

In [None]:
display(ann_prop)

In [None]:
# Damiano's

In [None]:
ia = None
no_orphans = False
# Parse the OBO file and creates a different graphs for each namespace
ontologies = obo_parser(ont_file, ("is_a", "part_of"), ia, not no_orphans)

# Parse ground truth file
#gt = gt_parser(data, ontologies)

In [None]:
x = gt['biological_process']

In [None]:
np.size(gt['biological_process'].matrix, 0)

In [None]:
np.size(gt['biological_process'].matrix, 1)

In [None]:
ontologies[ns].terms_dict

In [None]:
ontologies = obo_parser(ont_file, valid_rel=("is_a", "part_of"), ia_file=None, orphans=True)
ground_truth = gt_parser(data, ontologies)

In [None]:

def gt_as_df(gt, ontologies):
        dfs = []
        term_info = []
        for ns in ontologies:
            for index, p_id in enumerate(gt[ns].ids):
                GO_terms = list(ontologies[ns].terms_dict.keys())
                GO_terms = pd.DataFrame(GO_terms, columns = ['term'])
                GO_terms_In_p = gt[ns].matrix[index]==True
                GO_terms = GO_terms.loc[GO_terms_In_p]
                GO_terms['aspect'] = ns
                GO_terms['EntryID'] = p_id
                dfs.append(GO_terms)
        dfs = pd.concat(dfs)
        dfs = dfs.loc[:, ["EntryID", "term", "aspect"]].copy()
        return dfs

# Example usage:

#ground_truth = ground_truth['biological_process']


dfs = gt_as_df(ground_truth,ontologies)
    #for protein_id, term_id in term_info:
        #print("Protein ID: {}, Term ID: {}".format(protein_id, term_id))

In [None]:
dfs

In [None]:
aspect_mapping = {
    'cellular_component' :'CCO',
    'molecular_function': 'MFO',
    'biological_process': 'BPO'}
    
dfs['aspect'] = dfs['aspect'].map(aspect_mapping)
display(dfs)

In [None]:
# Find elements in df1 that are not in df2
df1_unique = pd.concat([dfs, ann_prop]).drop_duplicates(keep=False)
df1_unique = df1_unique.dropna()  # Drop rows with NaN values, if any

# Find elements in df2 that are not in df1
df2_unique = pd.concat([ann_prop, dfs]).drop_duplicates(keep=False)
df2_unique = df2_unique.dropna()  # Drop rows with NaN values, if any


In [None]:
df1_unique

In [None]:
df2_unique

In [None]:
gt_info = get_term_info(x)

In [None]:
gt_info 
i = 0
terms = []
for term_id, score, aspect in gt_info:
    print("Term ID: {}, Score: {}, Aspect: {}".format(term_id, score, aspect))
    i+=1
    terms.append(term_id)
i

In [None]:
len(np.unique(terms))

In [None]:
gt['biological_process']

In [None]:
ls ../../CAFA-evaluator/src/cafaeval/

Bootstrapping Development on sample


In [None]:
ont_file = '/data/rashika/CAFA4/uniprot/goa_2020_Jan_03/go-basic.obo' # data-version: releases/2020-01-01
#pred_dir = '/data/rashika/CAFA4/pred_sample/' # 20 methods
pred_dir ='/home/rashika/CAFA4/one/' # 1 method
BM_GO_file = '/data/rashika/CAFA4/eval/BM_GO/cco_all_type1.txt'
IA_file =  "/data/rashika/CAFA4/eval/IA/IA.txt"
out_dir = "/data/rashika/CAFA4/test_b"


cmd = "python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py "+ ont_file +" "+ pred_dir + " " + BM_GO_file + " -out_dir " + out_dir + ' -ia ' + IA_file + " -prop max -th_step 0.01  -no_orphans -b 10" + " &"
print(cmd)
#os.system(cmd)

In [None]:
/Users/rashi/Documents/Academics/Research/CAFA4_eval/Damiano_code/test_data/go-basic.obo /Users/rashi/Documents/Academics/Research/CAFA4_eval/Damiano_code/test_data/one/ /Users/rashi/Documents/Academics/Research/CAFA4_eval/Damiano_code/test_data/cco_all_type1.txt -out_dir /Users/rashi/Documents/Academics/Research/CAFA4_eval/Damiano_code/output -ia /Users/rashi/Documents/Academics/Research/CAFA4_eval/Damiano_code/test_data/IA.txt -prop max -th_step 0.01  -no_orphans -b 10 

In [None]:
roots = {'BPO': 'GO:0008150', 'CCO': 'GO:0005575', 'MFO': 'GO:0003674'}

work_dir = "/data/rashika/CAFA4/"
t0_mapped_path = work_dir + "mapped/t0.csv"
t1_mapped_path = work_dir + "/mapped/t1.csv"

t0_ont_file = '/data/rashika/CAFA4/uniprot/goa_2020_Jan_03/go-basic.obo' # data-version: releases/2020-01-01
t0_ont_graph = clean_ontology_edges(obonet.read_obo(t0_ont_file)) 
    
t1_mapped_ann =  "/data/rashika/CAFA4/mapped/t1_mapped.csv"
t1_ont_graph = clean_ontology_edges(obonet.read_obo( "/data/rashika/CAFA4/uniprot/goa_2024-02-09/go-basic.obo")) # data-version: releases/2024-01-17
    
#Prop t0 and t1 in their respective ontologies
t0_prop = process_raw_annot(t0_mapped_path, t0_ont_graph, roots, remove_roots = False)
t1_prop = process_raw_annot(t1_mapped_path, t1_ont_graph, roots, remove_roots = False)
    
# Keep common terms
t0_common, t1_common =  keep_common_go_terms(t0_prop, t1_prop, t0_ont_graph, t1_ont_graph)
t0_common.to_csv('/data/rashika/CAFA4/common/t0.tsv', sep = '\t', header = False, index = False)
t1_common.to_csv('/data/rashika/CAFA4/common/t1.tsv', sep = '\t', header = False, index = False)

In [None]:
# Obtain the plots
# Results of 5 methods
data_path = result_path
plots_path = "/data/rashika/CAFA4/eval/" + "plots/"
plots_path_f_w = plots_path+'f_w/'
plots_path_f = plots_path+'f/'
plots_path_f_micro_w = plots_path+'f_micro_w/'
plots_path_s_w = plots_path+'s_w/'
register = '/data/rashika/CAFA4/file_map.tsv'

metric, cols = ('f_w', ['rc_w', 'pr_w'])
#create_plots(results_path, metric, cols,out_path='/home/rashika/CAFA4/eval/plots/', n_curves = None, names_file = None):
create_plots(data_path, metric, cols, out_path = plots_path_f_w, n_curves = 10, names_file = register)

metric, cols = ('f', ['rc', 'pr'])
create_plots(data_path, metric, cols, out_path = plots_path_f, n_curves = 10, names_file = register)

metric, cols =  ('f_micro_w', ['rc_micro_w', 'pr_micro_w'])
create_plots(data_path, metric, cols, out_path = plots_path_f_micro_w, n_curves = 10, names_file = register)

metric, cols = ('s_w', ['ru_w', 'mi_w'])
create_plots(data_path, metric, cols, out_path = plots_path_s_w, n_curves = 10, names_file = register)



In [None]:
# Bootstrapping

In [None]:
roots = {'BPO': 'GO:0008150', 'CCO': 'GO:0005575', 'MFO': 'GO:0003674'}

t0_ont_file = '/data/rashika/CAFA4/uniprot/goa_2020_Jan_03/go-basic.obo' # data-version: releases/2020-01-01
t0_ont_graph = clean_ontology_edges(obonet.read_obo(t0_ont_file)) 
shawn_t0_mapped_ann = "/data/rashika/CAFA4/CAFA4_gt/t0_mapped.csv"

t1_mapped_ann = "/data/rashika/CAFA4/CAFA4_gt/t1_mapped.csv"
t1_ont_graph = clean_ontology_edges(obonet.read_obo( "/data/rashika/CAFA4/uniprot/goa_2024-02-09/go-basic.obo")) # data-version: releases/2024-01-17

BM_path = "/home/rashika/CAFA4/eval/benchmarks/"
BM_GO_path = "/home/rashika/CAFA4/eval/benchmarks_GO/"

# Create BM lists
#t1_eval = create_bm_lists(shawn_t0_mapped_ann, t1_mapped_ann, t0_ont_graph, t1_ont_graph, roots, BM_path)




In [None]:
import os

In [None]:
def generate_image_code(directory):
        image_code = ''
        for root, dirs, files in os.walk(directory):
            for sub_dir in dirs:
                print(sub_dir)
                print(directory +sub_dir)
                for _, _, files_sub_dir in os.walk(directory +sub_dir):
                    for file in files_sub_dir:
                        print(str(file))
                        if file.endswith(".png"):
                            image_path = os.path.join(root, file)
                            folder = os.path.basename(root)
                            image_code += "\\begin{subfigure}[b]{0.3\\textwidth}\n"
                            image_code += "\\centering\n"
                            image_code += "\\includegraphics[width=\\textwidth]{" + image_path + "}\n"
                            image_code += "\\caption{Figure in \\texttt{" + folder + "}}\n"
                            image_code += "\\label{fig:" + folder + ":" + file + "}\n"
                            image_code += "\\end{subfigure}\n"
            return image_code



In [None]:
# Call function to generate image code for current directory
latex_image_code = generate_image_code('/home/rashika/CAFA4/eval/plots_ALL/f/')

In [None]:
latex_image_code