In [None]:
import glob
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
from headers import headers

In [None]:
# Display name for each benchmark run, keyed by the run identifier used in the
# result dictionaries of this notebook. The values are used verbatim as plot
# legend labels.
tools_full_names = {
    # Foldseek
    "fs_10": "Foldseek",
    "fs_exh": "Foldseek (without prefilter)",
    # HMMER
    "hmmscan_10": "HMMER",
    "hmmscan_exh": "HMMER (without prefilter)",
    # MMseqs
    "mm_10": "MMseqs",
    "mm_exh": "MMseqs (without prefilter)",
    # Reseek
    "reseek_10_fast": "Reseek (fast)",
    "reseek_10_sens": "Reseek (sensitive)",
    "reseek_exh": "Reseek (without prefilter)",
    # TM-align
    "tm_exh": "TM-align",
}

In [None]:
# Five hex colour codes. NOTE(review): not referenced by any cell visible in
# this part of the notebook - confirm it is still needed.
colors = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # purple
]

In [None]:
# Load the precomputed F1 results. The object is indexed by tool identifier
# (e.g. "fs_10") in the cells below.
# Unpickling executes arbitrary code, so only load files produced by this project.
f1_pickle_path = "../data/processed/f1_data.pkl"
with open(f1_pickle_path, "rb") as pickle_handle:
    f1_data = pickle.load(pickle_handle)

In [None]:
# For every tool, print its identifier followed by the "max_f1_evalue_bin"
# entry stored for it.
for tool, tool_results in f1_data.items():
    print(tool)
    print(tool_results["max_f1_evalue_bin"])

In [None]:
# Print the lower and then the upper F1 confidence-interval entry stored for fs_10.
for ci_key in ('f1_ci_lower', 'f1_ci_upper'):
    print(f1_data["fs_10"][ci_key])

In [None]:
# Inspect which fields are stored for a single tool (fs_10 used as the example).
f1_data["fs_10"].keys()

In [None]:
# Apply add_performance4different_evalue_cutoffs to every entry of
# method_df_dict, skipping any tool whose key contains "tm".
# NOTE(review): neither add_performance4different_evalue_cutoffs nor
# method_df_dict is defined or imported in the cells visible here; on a fresh
# kernel this cell raises NameError unless they are provided elsewhere - confirm.
discrete_perf_dict = {
    tool: add_performance4different_evalue_cutoffs(tool_df)
    for tool, tool_df in method_df_dict.items()
    if "tm" not in tool
}

In [None]:
# Directory that saved figures are written to. It is created up front
# (including missing parents); an already existing directory is not an error.
fig_dir = "../figures/"
os.makedirs(name=fig_dir, exist_ok=True)

In [None]:
# One curve per tool in discrete_perf_dict (precision on x, recall on y), with
# a star on the point where that tool's F1 score is highest. The figure is
# saved to fig_dir and then shown.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

# F1 series per tool, kept so later cells can reuse them.
all_f1_scores = {}
for key, df in discrete_perf_dict.items():
    # F1 is the harmonic mean of precision and recall. Rows where both are 0
    # evaluate to 0/0 = NaN; idxmax() and max() skip NaN by default.
    f1_scores = 2 * (df["precision"] * df["recall"]) / (df["precision"] + df["recall"])
    all_f1_scores[key] = f1_scores

    # Row label and value of the best F1, plus the precision/recall at that row.
    max_f1_idx = f1_scores.idxmax()
    max_f1_value = f1_scores.max()
    max_precision = df.loc[max_f1_idx, "precision"]
    max_recall = df.loc[max_f1_idx, "recall"]

    # Draw the curve; the legend entry reports the best F1 for the tool.
    (curve,) = ax.plot(
        df["precision"],
        df["recall"],
        label=f'{tools_full_names[key]} (F1={max_f1_value:.3f})',
    )

    # Mark the best-F1 point. The colour is taken from the curve explicitly so
    # the star always matches its line instead of relying on the line and
    # scatter colour cycles advancing in lockstep.
    ax.scatter(
        max_precision,
        max_recall,
        marker='*',
        s=100,
        zorder=5,
        color=curve.get_color(),
    )

ax.legend()
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.grid(True, alpha=0.3)
# os.path.join avoids the doubled separator that an f-string with "/" produces
# when fig_dir already ends in "/".
fig.savefig(os.path.join(fig_dir, "precision_recall_split_vs_split.png"))
plt.show()


In [None]:
# Build one table per tool with find_prec_recall_vs_evalues, keep only the rows
# whose "row_num" is at least MIN_ROW_NUM, and plot precision (x) against
# recall (y) for every tool, including those excluded from discrete_perf_dict.
# NOTE(review): find_prec_recall_vs_evalues and method_df_dict are not defined
# or imported in the cells visible here - confirm where they come from.

# Rows below this "row_num" are dropped before plotting.
# NOTE(review): presumably removes the unstable start of each curve - confirm
# the rationale for 500.
MIN_ROW_NUM = 500

prec_recall_unfiltered = {
    tool: find_prec_recall_vs_evalues(tool_df)
    for tool, tool_df in method_df_dict.items()
}
prec_recall_evalue = {
    tool: table[table["row_num"] >= MIN_ROW_NUM]
    for tool, table in prec_recall_unfiltered.items()
}

fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

# all_f1_scores is deliberately not reset here: this cell never uses it, and
# clearing it would discard the scores collected by the previous plot cell.
for key, df in prec_recall_evalue.items():
    ax.plot(df["precision"], df["recall"], label=f'{tools_full_names[key]}')

ax.legend()
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.grid(True, alpha=0.3)
# Saving stays disabled: the file name below is the one the previous figure is
# written to, so enabling it as-is would overwrite that figure.
#plt.savefig(f"{fig_dir}/precision_recall_split_vs_split.png")
plt.show()

In [None]:
# TODO - what to do next:
#   * add_f1: add an F1 column to each precision/recall table
#   * find the max F1 for each df and its e-value threshold
#   * interpolate recall from precision using each bootstrap
#   * return the interpolated line
# (These notes were free text inside the code cell, which made the cell fail
# with a SyntaxError; as comments the cell runs.)
prec_recall_evalue["tm_exh"]

In [None]:
# Precision/recall table for reseek_exh with an added "f1" column
# (harmonic mean of precision and recall; NaN where both are 0).
# assign() returns a new DataFrame, so the filtered table stored in
# prec_recall_evalue is not modified. Setting the column directly on that
# filtered slice triggers pandas' SettingWithCopyWarning on pandas < 3.
df = prec_recall_evalue["reseek_exh"].assign(
    f1=lambda table: 2 * table["precision"] * table["recall"]
    / (table["precision"] + table["recall"])
)

In [None]:
# Integer position (not index label) of the row with the highest F1.
f1_column = df["f1"]
argmax_row = f1_column.argmax()
print(argmax_row)

In [None]:
# Show the full row at the best-F1 position; argmax_row is a position, hence iloc rather than loc.
df.iloc[argmax_row]

In [None]:
# Highest F1 value in the table (NaN entries are skipped by max()).
df["f1"].max()