In [None]:
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
# Directory scanned for the per-tool *.tsv inputs; relative to the notebook's
# working directory.
data_path = "../data/processed/evalue_bins/"

In [None]:
# Every TSV file sitting directly inside data_path (order is whatever glob returns).
file_paths = glob.glob(data_path + "/*.tsv")

In [None]:
# A tool's name is whatever precedes the first "_B" in a file's basename;
# collect the distinct names in alphabetical order.
tools = sorted({os.path.basename(path).split("_B")[0] for path in file_paths})
# To restrict the analysis to specific tools, override the list by hand, e.g.:
#tools = ["cif_cut", "reseek", "mm"]

In [None]:
# Map each tool name to the sorted list of its TSV files.
# Tool names are the part of the file name before "_B", so the pattern is
# anchored on "_B": a bare "{tool}*.tsv" prefix match would also pick up the
# files of any other tool whose name merely starts with this one
# (e.g. a tool "mm" would swallow the files of a tool "mm_cut").
tools_paths = {x: sorted(glob.glob(f"{data_path}/{x}_B*.tsv")) for x in tools}

In [None]:
def calc_evalue_vs_fp(df):
    """Average cumulative false-positive counts per query at each e-value bin.

    For every ``evalue_bin`` the result holds the number of false positives
    accumulated over all bins up to and including that one, averaged over the
    distinct values of ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``query``, ``evalue_bin``, ``fp_pfam`` and
        ``fp_clan``. Rows may come in any order; rows whose ``query`` is
        missing are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``evalue_bin`` (ascending) with the columns
        ``evalue_bin``, ``fp_pfam``, ``fp_clan``, ``fp_pfam_log`` and
        ``fp_clan_log``; the ``*_log`` columns are the log10 of the averages
        (``-inf`` where the average is 0). An input without any query yields
        an empty frame with the same columns.
    """
    fp_cols = ["fp_pfam", "fp_clan"]

    n_queries = df["query"].nunique()
    if n_queries == 0:
        # Nothing to average over; return the expected schema instead of failing.
        return pd.DataFrame(
            columns=["evalue_bin", *fp_cols, "fp_pfam_log", "fp_clan_log"],
            dtype=float,
        )

    # Summing the per-query cumulative counts at a bin is the same as taking
    # the cumulative sum of the per-bin totals over all queries. Computing it
    # this way (a) orders the accumulation by evalue_bin rather than by input
    # row order and (b) lets a query that has no row for some bin keep
    # contributing its running count instead of dropping to zero there.
    scored = df[df["query"].notna()]
    per_bin_totals = scored.groupby("evalue_bin")[fp_cols].sum()  # sorted by bin
    fps = (per_bin_totals.cumsum() / n_queries).reset_index()

    # Bins with no false positives so far give log10(0) = -inf; that value is
    # kept, only the divide-by-zero RuntimeWarning is silenced.
    with np.errstate(divide="ignore"):
        for col in fp_cols:
            fps[f"{col}_log"] = np.log10(fps[col])
    return fps

In [None]:
# Load all TSV files of each tool into one DataFrame, keyed by tool name.
tools_df = {}
for tool in tools:
    if not tools_paths[tool]:
        # pd.concat on an empty list fails with a generic
        # "No objects to concatenate"; name the tool so that a wrong entry in
        # `tools` is easy to spot.
        raise FileNotFoundError(f"No .tsv files found for tool {tool!r}")
    dfs = [pd.read_csv(x, sep="\t") for x in tools_paths[tool]]
    concat_df = pd.concat(dfs)
    # Recent versions of the input store log2 of the e-values; dividing by
    # log2(10) converts the bins to log10.
    concat_df["evalue_bin"] = concat_df["evalue_bin"]/np.log2(10)
    tools_df[tool] = concat_df

In [None]:
# Legend label for each tool key.
tools_name = dict(cif_cut="Foldseek", reseek="Reseek")

# Per-tool table of averaged cumulative false positives per e-value bin.
# The tuple fixes the order in which the tools are inserted (and later drawn).
eval_vs_fpepq = {
    tool: calc_evalue_vs_fp(tools_df[tool])
    for tool in ("reseek", "cif_cut")
}

In [None]:
# Open a new pyplot figure at 300 dpi; the pyplot calls that follow draw on it.
plt.figure(dpi=300)

def format_func(value, tick_number):
    """Render a log10 tick value as the mathtext label ``$10^{value}$``.

    Has the ``(value, position)`` signature expected by
    ``matplotlib.ticker.FuncFormatter``; ``tick_number`` is not used.

    The exponent is printed as-is (``-2.5`` -> ``$10^{-2.5}$``) rather than
    truncated to an integer, so a tick placed at a non-integer position is not
    labelled with the wrong power of ten.
    """
    exponent = value + 0.0  # turns -0.0 into 0.0 so the label never reads "-0"
    return f'$10^{{{exponent:g}}}$'

# Draw on the current pyplot figure through its Axes object.
ax = plt.gca()

# One curve per tool, limited to bins between -10 and 1 (inclusive) on the
# log10 e-value axis.
for tool, fps in eval_vs_fpepq.items():
    subdf = fps[fps["evalue_bin"].between(-10, 1)]
    ax.plot(subdf["evalue_bin"], subdf["fp_pfam_log"], label=tools_name[tool])

# Reference diagonal y = x, drawn between -3 and 1.
ideal = [-3, -2, -1, 0, 1]
ax.plot(ideal, ideal, label="Ideal", color='black', linestyle='dashed')

ax.set_xlabel('Reported e-value')
ax.set_ylabel('FPEPQ')
# Both axes hold log10 values; label the ticks as powers of ten.
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
ax.yaxis.set_major_formatter(FuncFormatter(format_func))
ax.legend()

# savefig does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing the image.
figure_path = "../figures/fpepq_vs_evalue.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
ax.figure.savefig(figure_path)
plt.show()