In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

In [None]:
# One SFFP table per tool variant, read from the rescoring output directory.
default_and_bitscore_paths = [
    f"../tmp/sffp_rescoring/{tool}.tsv"
    for tool in ("foldseek", "foldseek_bitscore", "reseek", "reseek_bitscore")
]
# Key every table by its file stem (file name without the ".tsv" extension).
sffp_dict = {
    os.path.splitext(os.path.basename(tsv_path))[0]: pd.read_csv(tsv_path, sep="\t")
    for tsv_path in default_and_bitscore_paths
}

In [None]:
def cum_sffp(sffp_df, col2sortby):
    """Rank the rows of an SFFP table and attach each row's cumulative query fraction.

    Parameters
    ----------
    sffp_df : pandas.DataFrame
        Table containing the column named by ``col2sortby``.
    col2sortby : str
        Column to rank by; rows are ordered from its highest to its lowest value.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with a fresh 0..n-1 index and
        an added ``fraction_of_queries`` column holding ``(position + 1) / n``.
    """
    ranked = sffp_df.sort_values(by=col2sortby, ascending=False).reset_index(drop=True)
    n_rows = len(ranked)
    # Positions 1..n divided by n: the last row always gets fraction 1.0.
    ranked["fraction_of_queries"] = np.arange(1, n_rows + 1) / n_rows
    return ranked

In [None]:
# Legend display names, keyed by the result-table names used in sffp_dict.
name_mapping = dict(
    foldseek="Foldseek",
    foldseek_bitscore="Foldseek (PS)",
    reseek="Reseek",
    reseek_bitscore="Reseek (PS)",
)

In [None]:
# Cumulative SFFP curve per table in sffp_dict, with the area under each curve in the legend.
plt.figure(figsize=(10, 6), dpi=300)

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever
# the installed NumPy provides so the cell runs on both old and new versions.
trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

col2sortby = "sffp_pfam"
for tool, tool_df in sffp_dict.items():
    cum_df = cum_sffp(tool_df, col2sortby)
    x, y = cum_df["fraction_of_queries"], cum_df[col2sortby]
    auc = trapezoid(y, x)
    plt.plot(x, y, label=f'{name_mapping[tool]}, AUC={auc:.2f}')

# Axis labels and legend
plt.xlabel('Fraction of queries')
plt.ylabel('SFFP')
plt.legend()
plt.savefig("../figures/default_vs_profile_ranking_sffp.png")
plt.show()

In [None]:
# Seed properties are loaded so that performance differences between rankings can be related to them.
base_dir = ".."
data_dir = f"{base_dir}/data/"

# Load the per-seed feature tables (pi = percentage identity, cn = contact number).
pi_df, ss_info_df, cn_df, plddt_df = (
    pd.read_csv(f"{data_dir}/processed/{file_name}", sep="\t")
    for file_name in (
        "avg_intra_fam_pident.tsv",
        "ss_info_pfam.tsv",
        "avg_contact_num.tsv",
        "pfam_avg_plddt.tsv",
    )
)

# size = (third "-"-separated field of seed_id) - (second field) + 1
seed_id_parts = plddt_df["seed_id"].str.split("-", expand=True)
plddt_df["size"] = seed_id_parts[2].astype(int) - seed_id_parts[1].astype(int) + 1

# Join all feature tables on seed_id (default inner join: only seeds present in every table remain).
protperties_df = pi_df
for feature_df in (ss_info_df, cn_df, plddt_df):
    protperties_df = protperties_df.merge(feature_df, on="seed_id")

In [None]:
# Names of the tables whose key does not contain "_bitscore".
tools = [table_name for table_name in sffp_dict if "_bitscore" not in table_name]

In [None]:
# Display the selected tool names (sffp_dict keys without "_bitscore").
tools

In [None]:
# For each tool, split the queries by which ranking reaches the higher SFFP and
# print the mean of every seed characteristic for the two groups.

# Every column of the property table after the first is treated as a seed
# characteristic. This does not depend on the tool, so it is computed once.
seed_characteristics = list(protperties_df.columns[1:])

for tool in tools:
    print(f"Section started for {tool}\n##########################")
    ori_df = sffp_dict[tool]               # Use the ones with original order
    bit_df = sffp_dict[f"{tool}_bitscore"] # Use the ones sorted by psiblast scoring scheme
    # Pair both rankings per query, then attach the seed properties (inner joins).
    next2next = ori_df.merge(bit_df, on="query", suffixes=("_ori", "_bit")).rename(columns={"query": "seed_id"})
    next2next = next2next.merge(protperties_df, on="seed_id")
    # Ties (equal SFFP under both rankings) belong to neither group.
    ori_better = next2next[next2next["sffp_pfam_ori"] > next2next["sffp_pfam_bit"]]
    bit_better = next2next[next2next["sffp_pfam_ori"] < next2next["sffp_pfam_bit"]]
    print(f"For {len(ori_better)} queries, the original ranking has a higher SFFP")
    print(f"For {len(bit_better)} queries, the profile based ranking has a higher SFFP")
    print("Information is presented first for those which default ranking had a higher performance followed by those which rescored raking had a higher performance")
    # The loop variable is deliberately not called `property`: a notebook-level
    # loop variable with that name would rebind the builtin for the whole kernel.
    for characteristic in seed_characteristics:
        print(characteristic)
        print(ori_better[characteristic].mean())
        print(bit_better[characteristic].mean())

# Find the high performing tool for each query

In [None]:
# For each tool, report the Spearman correlation between every seed characteristic
# and the SFFP difference (original ranking minus "_bitscore" ranking).
# `spearmanr` comes from the notebook's top import cell.

# Every column of the property table after the first is treated as a seed
# characteristic. This does not depend on the tool, so it is computed once.
seed_characteristics = list(protperties_df.columns[1:])

for tool in tools:
    print(f"Section started for {tool}\n##########################")
    ori_df = sffp_dict[tool]               # Use the ones with original order
    bit_df = sffp_dict[f"{tool}_bitscore"] # Use the ones sorted by psiblast scoring scheme
    # Pair both rankings per query, then attach the seed properties (inner joins).
    next2next = ori_df.merge(bit_df, on="query", suffixes=("_ori", "_bit")).rename(columns={"query": "seed_id"})
    next2next = next2next.merge(protperties_df, on="seed_id")
    # Positive perf_diff: the original ranking has the higher SFFP for that query.
    next2next["perf_diff"] = next2next["sffp_pfam_ori"] - next2next["sffp_pfam_bit"]

    # The loop variable is deliberately not called `property`: a notebook-level
    # loop variable with that name would rebind the builtin for the whole kernel.
    for characteristic in seed_characteristics:
        # nan_policy='omit' ignores NaN values instead of returning NaN.
        rho, p = spearmanr(next2next['perf_diff'], next2next[characteristic], nan_policy='omit')
        print(f"The correlation coefficient between {characteristic} and the performance difference is {rho}, and p-value is {p}")

# Conclusion:

PSI-BLAST scoring appears to work better for short domains. As a next step, we train a model that scores the hits from the psiblast_bitscore, the pwa_aligner bitscore, and the target length, and check whether it improves performance.

In [None]:
# Read every "*_exc_b12.tsv" table, keyed by its file name with ".tsv" removed.
exc_b12_tables = {
    os.path.basename(tsv_path).replace(".tsv", ""): pd.read_csv(tsv_path, sep="\t")
    for tsv_path in glob.glob("../tmp/sffp_rescoring/*_exc_b12.tsv")
}
# Keep only the tables whose name mentions neither "rescored" nor "bitscore".
sffp_dict = {
    table_name: table
    for table_name, table in exc_b12_tables.items()
    if not ("rescored" in table_name or "bitscore" in table_name)
}


def cum_sffp(sffp_df, col2sortby):
    """Rank the rows of an SFFP table and attach each row's cumulative query fraction.

    NOTE(review): this repeats the ``cum_sffp`` definition from an earlier cell
    with the same body; the later definition silently replaces the earlier one.

    Parameters
    ----------
    sffp_df : pandas.DataFrame
        Table containing the column named by ``col2sortby``.
    col2sortby : str
        Column to rank by; rows are ordered from its highest to its lowest value.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with a fresh 0..n-1 index and
        an added ``fraction_of_queries`` column holding ``(position + 1) / n``.
    """
    ranked = sffp_df.sort_values(by=col2sortby, ascending=False).reset_index(drop=True)
    n_rows = len(ranked)
    # Positions 1..n divided by n: the last row always gets fraction 1.0.
    ranked["fraction_of_queries"] = np.arange(1, n_rows + 1) / n_rows
    return ranked


# Cumulative SFFP curve per table in sffp_dict, with the area under each curve in the legend.
plt.figure(figsize=(10, 6))

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever
# the installed NumPy provides so the cell runs on both old and new versions.
trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

col2sortby = "sffp_pfam"
for tool, tool_df in sffp_dict.items():
    cum_df = cum_sffp(tool_df, col2sortby)
    x, y = cum_df["fraction_of_queries"], cum_df[col2sortby]
    auc = trapezoid(y, x)
    plt.plot(x, y, label=f'{tool}, AUC = {auc:.2f}')

# Axis labels name the plotted quantities (x = fraction_of_queries, y = sffp_pfam).
plt.xlabel('Fraction of queries')
plt.ylabel('SFFP')
plt.axhline(0, color='black', lw=0.5, ls='--')  # Reference line at y=0
plt.axvline(0, color='black', lw=0.5, ls='--')  # Reference line at x=0

plt.legend()
plt.savefig("../figures/sffp_ml_equal_tp_fp_evalue_as_threshold_four_size_bins.png")
plt.show()

In [None]:
# Result-table paths keyed by table name; every file lives in the rescoring output directory.
file_path_dict = {
    table_name: f"../tmp/sffp_rescoring/{table_name}.tsv"
    for table_name in ("foldseek", "reseek", "foldseek_norm_score", "reseek_norm_score")
}

In [None]:
# Read each listed table, keyed by its file name with ".tsv" removed.
sffp_dict = dict(
    (os.path.basename(tsv_path).replace(".tsv", ""), pd.read_csv(tsv_path, sep="\t"))
    for tsv_path in file_path_dict.values()
)


def cum_sffp(sffp_df, col2sortby):
    """Rank the rows of an SFFP table and attach each row's cumulative query fraction.

    NOTE(review): this repeats the ``cum_sffp`` definition from earlier cells
    with the same body; the later definition silently replaces the earlier ones.

    Parameters
    ----------
    sffp_df : pandas.DataFrame
        Table containing the column named by ``col2sortby``.
    col2sortby : str
        Column to rank by; rows are ordered from its highest to its lowest value.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with a fresh 0..n-1 index and
        an added ``fraction_of_queries`` column holding ``(position + 1) / n``.
    """
    ranked = sffp_df.sort_values(by=col2sortby, ascending=False).reset_index(drop=True)
    n_rows = len(ranked)
    # Positions 1..n divided by n: the last row always gets fraction 1.0.
    ranked["fraction_of_queries"] = np.arange(1, n_rows + 1) / n_rows
    return ranked


# Cumulative SFFP curve per table in sffp_dict, with the area under each curve in the legend.
plt.figure(figsize=(10, 6))

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever
# the installed NumPy provides so the cell runs on both old and new versions.
trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

col2sortby = "sffp_pfam"
for tool, tool_df in sffp_dict.items():
    cum_df = cum_sffp(tool_df, col2sortby)
    x, y = cum_df["fraction_of_queries"], cum_df[col2sortby]
    auc = trapezoid(y, x)
    plt.plot(x, y, label=f'{tool}, AUC = {auc:.2f}')

# Axis labels name the plotted quantities (x = fraction_of_queries, y = sffp_pfam).
plt.xlabel('Fraction of queries')
plt.ylabel('SFFP')
plt.axhline(0, color='black', lw=0.5, ls='--')  # Reference line at y=0
plt.axvline(0, color='black', lw=0.5, ls='--')  # Reference line at x=0

plt.legend()
# Saved under its own file name: an earlier cell already writes
# "sffp_ml_equal_tp_fp_evalue_as_threshold_four_size_bins.png", and reusing that
# path here would overwrite that figure.
# NOTE(review): the file name below is a suggestion; rename to match project conventions.
plt.savefig("../figures/default_vs_norm_score_sffp.png")
plt.show()