In [1]:
import numpy as np
import pandas as pd
import gzip
import csv
import io
import argparse

import subprocess
import multiprocessing
import time

import sys
sys.path.append("/home/rashika/CAFA4/InformationAccretion/")
sys.path.append("/home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/")
from parser import *
from ia import *
from make_benchmarks import *


import os
import shutil
import concurrent.futures

In [9]:
"""
    Read the processed annotation file, map each primary ID through the mapping file, and keep the rows that can be mapped.

    Parameters:
    - processed_file_path: Path to the file containing processed GOA annotations
    - mapping_file: Path to the mapping file 
    - out_path: Path to the output file.
    """
def goa_to_CAFA4ID(processed_file_path, mapping_file, out_path):
    """
    Join processed GOA annotations with an ID mapping and write the mapped rows.

    Parameters:
    - processed_file_path: Path to the tab-separated file of processed GOA
      annotations (first row is the header; needs a 'DB Object ID' column).
    - mapping_file: Path to the tab-separated mapping file (first row is the
      header; needs a 'DB Object ID' column).
    - out_path: Path to the output file. Missing parent directories are created.

    The joined table must provide the columns 'CAFA_ID', 'GO ID' and 'Aspect';
    only those three are written, tab-separated, without header or index.
    """
    ann = pd.read_csv(processed_file_path, sep='\t', header=0)
    mapping = pd.read_csv(mapping_file, sep='\t', header=0)

    # Inner join: annotation rows whose DB Object ID has no mapping entry are dropped
    mapped = pd.merge(ann, mapping, on='DB Object ID', how='inner')

    # Keep the required columns, in output order
    mapped = mapped[["CAFA_ID", "GO ID", "Aspect"]]

    # to_csv does not create directories, so make sure the target folder exists
    # (dirname is empty when out_path is a bare file name)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # header=False is the documented way to suppress the header row
    mapped.to_csv(out_path, sep="\t", index=False, header=False)

In [5]:
def get_preprocess_cmd(gaf_path, out_path):
    """
    Build the argument list that invokes preprocess_gaf.py on one GAF file.

    Parameters:
    - gaf_path: Path to the input file, passed as the script's positional argument.
    - out_path: Value passed after the --out_path flag.

    Returns a list of strings; nothing is executed here.
    """
    launcher = ["python3", "preprocess_gaf.py"]
    flags = ["--highTP", "--out_path", out_path]
    # Not enabled at present (kept for reference):
    #   "--evidence_codes", "EXP", "IDA"
    #   "--extract_col_list", "DB Object ID", "Qualifier"
    return launcher + [gaf_path] + flags

def run_process(command, log_file, dry_run=True):
    """
    Print a command and, unless dry_run is set, execute it with output sent to a log file.

    Parameters:
    - command: List of argument strings (program first).
    - log_file: Path of the file that receives the command's stdout and stderr.
      Only used when dry_run is False.
    - dry_run: When True (the default, matching the previous behaviour) the
      command is only printed and nothing is executed.

    Returns None for a dry run, otherwise the command's exit code.
    """
    printable = " ".join(command)
    print(printable)
    if dry_run:
        return None

    # Pass the argument list directly (no shell) so paths containing spaces or
    # shell metacharacters reach the program unchanged.
    with open(log_file, "w") as log:
        result = subprocess.run(command, stdout=log, stderr=subprocess.STDOUT)
    if result.returncode != 0:
        print(f"Error running command: {printable}. Check {log_file}")
    return result.returncode

In [13]:
if __name__ == "__main__":
    # Root of the working tree; the raw, preprocessed, log and mapped paths hang off it
    work_dir = "/data/rashika/CAFA4/"

    # ---- CAFA2 (GOA snapshot 2014-01-21) ----
    t0_CAFA2_2014 = os.path.join(work_dir, 'uniprot/goa_2014-01-21/gene_association.goa_uniprot.127.gz')
    t0_CAFA2_2014_processed = os.path.join(work_dir, 'extracted_goa/goa_2014-01-21/preprocessed.csv')
    t0_CAFA2_2014_log = os.path.join(work_dir, 'log/goa_2014-01-21.txt')
    mapping_file_CAFA2 = "/data/common/CAFA4/h2h/AC2CAFA2ID.map"
    mapped_path_CAFA2 = os.path.join(work_dir, "mapped/CAFA2_IDs/2014-01-21/t0_CAFA2_2014.csv")

    # ---- CAFA3 (GOA snapshot 2017-01-17) ----
    t0_CAFA3_2017 = os.path.join(work_dir, 'uniprot/goa_2017-01-17/goa_uniprot_all.gaf.162.gz')
    t0_CAFA3_2017_processed = os.path.join(work_dir, 'extracted_goa/goa_2017-01-17/preprocessed.csv')
    t0_CAFA3_2017_log = os.path.join(work_dir, 'log/goa_2017-01-17.txt')
    mapping_file_CAFA3 = "/data/common/CAFA4/h2h/AC2CAFA3ID.map"
    mapped_path_CAFA3 = os.path.join(work_dir, "mapped/CAFA3_IDs/2017-01-17/t0_CAFA3_2017.csv")

    # Preprocessing command lines (built here, not executed)
    cmd_preprocess_t0_CAFA2_2014 = get_preprocess_cmd(t0_CAFA2_2014, t0_CAFA2_2014_processed)
    cmd_preprocess_t0_CAFA3_2017 = get_preprocess_cmd(t0_CAFA3_2017, t0_CAFA3_2017_processed)

    # Step 1 (currently disabled): run the preprocessing commands
    #run_process(cmd_preprocess_t0_CAFA2_2014, t0_CAFA2_2014_log)
    #run_process(cmd_preprocess_t0_CAFA3_2017, t0_CAFA3_2017_log)

    # Step 2 (currently disabled): map the preprocessed annotations to CAFA IDs
    #goa_to_CAFA4ID(t0_CAFA2_2014_processed, mapping_file_CAFA2, mapped_path_CAFA2)
    #goa_to_CAFA4ID(t0_CAFA3_2017_processed, mapping_file_CAFA3, mapped_path_CAFA3)

## Write new code to generate Naive baseline


In [91]:
def get_naive_pred(gt_file, obo_file, out_file_path):
    """
    Write a naive baseline prediction file derived from a ground-truth file.

    For each ontology aspect that has a ground-truth entry, every term gets one
    score (its column sum in the ground-truth matrix divided by the number of
    term columns), and that same term/score list is emitted for every protein
    ID of the aspect.

    Parameters:
    - gt_file: Path to the ground-truth file. Its base name, prefixed with
      'naive_', becomes the output file name.
    - obo_file: Path to the ontology OBO file.
    - out_file_path: Directory to write into; created if it does not exist.

    Output: tab-separated rows (Protein, GO_term, score), no header, no index.

    Raises:
    - ValueError: if none of the ontology aspects has a ground-truth entry.
    """
    # Parse the OBO file; the result is keyed by aspect (is_a / part_of relations)
    ontologies = obo_parser(obo_file, ("is_a", "part_of"))

    # Parse ground truth file
    gt = gt_parser(gt_file, ontologies)

    ont_preds = []
    for aspect, ontology in ontologies.items():
        # Skip aspects with no ground-truth entry instead of failing with KeyError
        try:
            aspect_gt = gt[aspect]
        except KeyError:
            continue

        matrix = aspect_gt.matrix
        # Per-term column sums divided by the number of term columns.
        # NOTE(review): a naive baseline is usually the fraction of proteins
        # annotated with a term (divide by the number of rows); the original
        # denominator is kept here unchanged - confirm which one is intended.
        naive_score = np.asarray(matrix).sum(axis=0) / len(matrix[0])

        terms_dict = ontology.terms_dict
        terms = np.asarray(list(terms_dict), dtype=object)
        scores = np.asarray([naive_score[terms_dict[t]['index']] for t in terms], dtype=float)
        proteins = np.asarray(list(aspect_gt.ids), dtype=object)

        # Cross product built in one shot: each protein is paired with the full
        # term list (protein-major order), avoiding one DataFrame per protein.
        ont_preds.append(pd.DataFrame({
            "Protein": np.repeat(proteins, len(terms)),
            "GO_term": np.tile(terms, len(proteins)),
            "score": np.tile(scores, len(proteins)),
        }))

    if not ont_preds:
        raise ValueError(f"No ontology aspect has ground-truth annotations in {gt_file}")

    Final_pred = pd.concat(ont_preds, ignore_index=True)

    # to_csv does not create directories
    if out_file_path:
        os.makedirs(out_file_path, exist_ok=True)
    out_file = os.path.join(out_file_path, 'naive_' + os.path.basename(gt_file))
    Final_pred.to_csv(out_file, sep="\t", header=False, index=False)

In [94]:
# Inputs for the naive baseline, one entry per CAFA challenge:
# ground-truth file, ontology (OBO) file and the directory to write into.
cafa_file_map = {
    'cafa4': {
        'gt_file': '/data/rashika/CAFA4/mapped/2019-12-17/t0_2019.csv',
        'obo_file': "/data/rashika/CAFA4/obo/go_2019_10_07/go-basic.obo",
        'out_file_path': "/data/rashika/CAFA4/baselines/naive/cafa4/",
    },
    'cafa3': {
        'gt_file': "/data/rashika/CAFA4/mapped/CAFA3_IDs/2017-01-17/t0_CAFA3_2017.csv",
        'obo_file': "/data/rashika/CAFA4/obo/go_2016-06-01/go-basic.obo",
        'out_file_path': "/data/rashika/CAFA4/baselines/naive/cafa3/",
    },
    'cafa2': {
        'gt_file': "/data/rashika/CAFA4/mapped/CAFA2_IDs/2014-01-21/t0_CAFA2_2014.csv",
        'obo_file': "/data/rashika/CAFA4/obo/go_2013-09-01/go-basic.obo",
        'out_file_path': "/data/rashika/CAFA4/baselines/naive/cafa2/",
    },
}

In [96]:
# Generate one naive baseline file per configured CAFA challenge,
# echoing the inputs before each run and a marker after it.
for cafa in cafa_file_map:
    files = cafa_file_map[cafa]
    print(files)
    get_naive_pred(
        gt_file=files['gt_file'],
        obo_file=files['obo_file'],
        out_file_path=files['out_file_path'],
    )
    print('done', cafa)

{'gt_file': '/data/rashika/CAFA4/mapped/2019-12-17/t0_2019.csv', 'obo_file': '/data/rashika/CAFA4/obo/go_2019_10_07/go-basic.obo', 'out_file_path': '/data/rashika/CAFA4/baselines/naive/cafa4/'}
done cafa4
{'gt_file': '/data/rashika/CAFA4/mapped/CAFA3_IDs/2017-01-17/t0_CAFA3_2017.csv', 'obo_file': '/data/rashika/CAFA4/obo/go_2016-06-01/go-basic.obo', 'out_file_path': '/data/rashika/CAFA4/baselines/naive/cafa3/'}
done cafa3
{'gt_file': '/data/rashika/CAFA4/mapped/CAFA2_IDs/2014-01-21/t0_CAFA2_2014.csv', 'obo_file': '/data/rashika/CAFA4/obo/go_2013-09-01/go-basic.obo', 'out_file_path': '/data/rashika/CAFA4/baselines/naive/cafa2/'}
done cafa2
