1. Make an intersection of the top 10 methods

In [11]:
import os
import pandas as pd

In [19]:
results_path = '/data/rashika/CAFA4/eval_final/eval_results/'
names_file = '/data/rashika/CAFA4/file_map.tsv'
coverage_threshold = 0.3
metric, cols = ('f_w', ['rc_w', 'pr_w'])

### Collect the top n (n = 10) methods based on each metric

In [129]:
def get_top_methods(results_path, metric = 'f', cols = ['rc_w', 'pr'], names_file = '/data/rashika/CAFA4/file_map.tsv', coverage_threshold = 0.3):
    dir_list = os.listdir(results_path)
    top_n = {}
    for ns in ['cellular_component', 'biological_process', 'molecular_function']:
        top_n[ns] = {}
    dir_list.remove('bpo_all_type3')
    for file in dir_list:
            df_file = results_path + file +"/evaluation_all.tsv"
            df = pd.read_csv(df_file, sep="\t")

            # Set method information (optional)
            if names_file is None:
                df['group'] = df['filename']
                df['label'] = df['filename']
                df['is_baseline'] = False
            else:
                methods = pd.read_csv(names_file, sep = "\t", header=0)
                df = pd.merge(df, methods, on='filename', how='left')
                df['group'].fillna(df['filename'], inplace=True)
                df['label'].fillna(df['filename'], inplace=True)
                if 'is_baseline' not in df:
                    df['is_baseline'] = False
                else:
                    df['is_baseline'].fillna(False, inplace=True)
                # print(methods)
            #df = df.drop(columns='filename').set_index(['group', 'label', 'ns', 'tau'])
            df = df.set_index(['group_unique', 'label', 'ns', 'filename','tau'])

            # Filter by coverage
            df = df[df['cov'] >= coverage_threshold]


            index_best = df.groupby(level=['group_unique', 'ns'])[metric].idxmax() if metric in ['f', 'f_w', 'f_micro', 'f_micro_w'] else df.groupby(['group_unique', 'ns'])[metric].idxmin()

            # Filter the dataframe for the best methods
            df_methods = df.reset_index('tau').loc[[ele[:-1] for ele in index_best], ['tau', 'cov'] + [metric]].sort_index()

            # Filter the dataframe for the best method and threshold
            df_best = df.loc[index_best, ['cov'] + cols + [metric]]

            # Calculate the max coverage across all thresholds
            df_best['max_cov'] = df_methods.groupby(level=['group_unique', 'label', 'ns'])['cov'].max()

            n = 10
            for ns, df_g in df_best.groupby(level='ns'):
                top_n[ns][file] = df_g.sort_values(by=[metric, 'max_cov'], ascending=[False if metric.startswith('f') else True, False])[:n]
                top_n[ns][file] = list(top_n[ns][file].reset_index('filename')['filename'])
                #print(top_n[ns][fi`le].columns)
    return top_n


In [134]:
metric_cols = {}
metric_cols['f_w'] = ['rc_w', 'pr_w']
metric_cols['f'] = ['rc', 'pr']
metric_cols['f_micro_w'] = ['rc_micro_w', 'pr_micro_w']
metric_cols['f_micro'] = ['rc_micro', 'pr_micro']
metric_cols['s_w'] = ['ru_w', 'mi_w']

top_n_metric = {}
for metric in metric_cols.keys():
    top_n_metric[metric] = get_top_methods(results_path, metric, metric_cols[metric], names_file = '/data/rashika/CAFA4/file_map.tsv', coverage_threshold = 0.3)

## Collect the union of the methods that show up in top 10

In [142]:
top_methods = set()
for metric in metric_cols.keys():
    for ns in ['cellular_component', 'biological_process', 'molecular_function']:
        for key in top_n_metric[metric][ns].keys():
            top_methods = top_methods.union(set(top_n_metric[metric][ns][key]))

In [191]:
top_n_metric.keys()

dict_keys(['f_w', 'f', 'f_micro_w', 'f_micro', 's_w'])

Get the 'benchmark_wise' union of methods, add blast and naive to methods in each list

In [220]:
Benchmarks = ['cco_all_type1', 'cco_all_type2', 'cco_all_type3', 'cco_all_type12', 'bpo_all_type1', 'bpo_all_type2', 'bpo_all_type3', 'bpo_all_type12', 'mfo_all_type1', 'mfo_all_type2', 'mfo_all_type3', 'mfo_all_type12']
top_by_bm = {}
for bm in Benchmarks:
    top_by_bm[bm] = []
    
for metric in top_n_metric.keys():
    for aspect in top_n_metric[metric].keys():
        for bm in top_n_metric[metric][aspect].keys():
            if top_n_metric[metric][aspect][bm]:
                top_by_bm[bm] = list(set(top_by_bm[bm]).union(set(top_n_metric[metric][aspect][bm])).union(set(['blast', 'naive'])))

In [225]:
x = 0
unique_winner = set()
for bm in Benchmarks:
    unique_winners = unique_winners.union(set(top_by_bm[bm]))
    x +=len(top_by_bm[bm])
x

239

In [226]:
len(unique_winners)

52

Methods to be bootstrapped = union of top methods and blast, naive

Copy the files4bs to a different directory, so that bootstrapping can be done on the entire directory

In [234]:
pred_dir = "/data/yisupeng/sharing/cafa4/all_models/"
bs_dir = '/data/rashika/CAFA4/winner_methods/'
for bm in Benchmarks:
    bm_dir = bs_dir+bm
    if not os.path.exists(bm_dir):
        os.mkdir(bm_dir)
    for method in top_by_bm[bm]:
        cmd = "cp " + pred_dir + method + " " + bm_dir
        print(cmd)
        os.system(cmd)

cp /data/yisupeng/sharing/cafa4/all_models/M120 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M038 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M088 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M066 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M112 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M118 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M115 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/naive /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M086 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M050 /data/rashika/CAFA4/winner_methods/cco_all_type1
cp /data/yisupeng/sharing/caf

cp /data/yisupeng/sharing/cafa4/all_models/blast /data/rashika/CAFA4/winner_methods/cco_all_type12
cp /data/yisupeng/sharing/cafa4/all_models/M024 /data/rashika/CAFA4/winner_methods/cco_all_type12
cp /data/yisupeng/sharing/cafa4/all_models/M040 /data/rashika/CAFA4/winner_methods/cco_all_type12
cp /data/yisupeng/sharing/cafa4/all_models/M006 /data/rashika/CAFA4/winner_methods/cco_all_type12
cp /data/yisupeng/sharing/cafa4/all_models/M022 /data/rashika/CAFA4/winner_methods/cco_all_type12
cp /data/yisupeng/sharing/cafa4/all_models/M088 /data/rashika/CAFA4/winner_methods/bpo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M038 /data/rashika/CAFA4/winner_methods/bpo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M066 /data/rashika/CAFA4/winner_methods/bpo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M118 /data/rashika/CAFA4/winner_methods/bpo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M082 /data/rashika/CAFA4/winner_methods/bpo_all_type1
cp /data/yisupeng/sharin

cp /data/yisupeng/sharing/cafa4/all_models/M043 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M119 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M074 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M110 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M068 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/blast /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M112 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M006 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M131 /data/rashika/CAFA4/winner_methods/mfo_all_type1
cp /data/yisupeng/sharing/cafa4/all_models/M088 /data/rashika/CAFA4/winner_methods/mfo_all_type2
cp /data/yisupeng/sharing/caf

In [235]:
def run_bootstrap(BM_GO_path, ont_file, pred_dir = '/data/rashika/CAFA4/winner_methods/', IA_file = '/data/rashika/CAFA4/eval/IA/IA.txt', result_path = '/data/rashika/CAFA4/eval_final/bootstrap', log_path = '/home/rashika/CAFA4/eval/log/', thresh_step = 0.001):
    dir_list = os.listdir(BM_GO_path) # out_path is the path to the folder containing the target GO sets

    if not os.path.exists(result_path):
        os.mkdir(result_path)
        
    if not os.path.exists(log_path):
        os.mkdir(log_path)
        
    for file in dir_list:

        print("Evaluating: " + file)
        out_dir = result_path + file.split(".")[0] + '/'
        pred_dir_bs = pred_dir + file.split(".")[0] + '/'
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        log_dir = log_path + file.split(".")[0] + '/'
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        log_file = log_dir + 'run_bs.log'
        
        # cafaeval go-basic.obo prediction_dir test_terms.tsv -ia IA.txt -prop fill -norm cafa -th_step 0.001 -max_terms 500
        #cmd = 'cafaeval /data/yisupeng/sharing/cafa4/gene_ontology_edit.obo.2020-01-01 /data/yisupeng/sharing/cafa4/all_models/ ' + '/data/yisupeng/sharing/cafa4/t1_truth.csv' + ' -out_dir '+ out_dir + ' -prop max -th_step 0.01  -no_orphans -log_level info > '+ log_path + file.split(".")[0] + '.log'+ ' &'
        #cmd = 'cafaeval '+ ont_file + pred_dir + BL_GO_path+file +' -out_dir '+ out_dir + ' -prop max -th_step 0.01  -no_orphans -log_level info > '+ log_path + file.split(".")[0] + '.log'+ ' &'
        #With IA
        cmd = "python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py "+ ont_file +" "+ pred_dir_bs + " " + BM_GO_path+file + " -out_dir " + out_dir + ' -ia ' + IA_file + " -prop max -th_step " + str(thresh_step) + " -no_orphans > "+ log_file+  " &"
        #Without IA
        #cmd = "python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py "+ ont_file +" "+ pred_dir + " " + BM_GO_path+file + " -out_dir " + out_dir + " -prop max -th_step 0.01  -no_orphans " + " &"
        
        
        print(cmd)
        #os.system(cmd)

In [185]:
eval_path = '/data/rashika/CAFA4/eval_final/'
BM_GO_path = eval_path + "BM_GO/"
pred_dir = '/data/rashika/CAFA4/winner_models/'
t_minus_ont_file =  "/data/rashika/CAFA4/uniprot/go_2019_10_07/go-basic.obo"
IA_file =  eval_path + "IA.txt"
bs_result_path = eval_path + 'bootstrap/'
thresh_step = 0.001
B = 3
run_bootstrap(BM_GO_path, bs_dir, t_minus_ont_file, IA_file = IA_file, result_path = bs_result_path,  thresh_step = thresh_step, B = B)

['bpo_all_type1.txt', 'cco_all_type1.txt', 'mfo_all_type1.txt', 'bpo_all_type2.txt', 'cco_all_type2.txt', 'mfo_all_type2.txt', 'bpo_all_type3.txt', 'cco_all_type3.txt', 'mfo_all_type3.txt', 'bpo_all_type12.txt', 'cco_all_type12.txt', 'mfo_all_type12.txt']
/data/rashika/CAFA4/eval_final/bootstrap/
Evaluating: bpo_all_type1.txt
python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py /data/rashika/CAFA4/uniprot/go_2019_10_07/go-basic.obo /data/rashika/CAFA4/winner_models/ /data/rashika/CAFA4/eval_final/BM_GO/bpo_all_type1.txt -out_dir /data/rashika/CAFA4/eval_final/bootstrap/bpo_all_type1/ -ia /data/rashika/CAFA4/eval_final/IA.txt -prop max -th_step 0.001 -no_orphans -b 3 > /home/rashika/CAFA4/eval/log/bpo_all_type1/run.log &
Evaluating: cco_all_type1.txt
python3 /home/rashika/CAFA4/CAFA-evaluator/src/cafaeval/__main__.py /data/rashika/CAFA4/uniprot/go_2019_10_07/go-basic.obo /data/rashika/CAFA4/winner_models/ /data/rashika/CAFA4/eval_final/BM_GO/cco_all_type1.txt -out_dir /da

2024-06-27 23:12:47,024 [INFO ] Prediction: /data/rashika/CAFA4/winner_models/M038, evaluated
2024-06-27 23:13:37,957 [INFO ] Prediction: /data/rashika/CAFA4/winner_models/M088, biological_process, proteins 1318, annotations 457626, replaced alt. ids 0
