# Introduction

In this notebook, the performance of residue-level alignments is plotted. Plotting is done for all residues of the target, or for residues with specific characteristics, such as pocket-forming residues or conserved residues.

Plots are made either by considering each query-target pair as an entry, or by averaging the fraction of correctly aligned residues over all pairwise comparisons in a family.

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Short identifiers of the alignment tools being compared. They are used as
# sub-directory names when globbing the count files and as keys of `db_name`.
tools = ["mm", "fs", "fs3di", "rs", "tm"]

# Residue subsets analysed in this notebook.
residue_types = ["all", "conserved", "pocket"]
# Backward-compatible alias for the original (misspelled) name.
reisdue_types = residue_types

In [None]:
# Directory referenced by the (currently disabled) savefig calls.
fig_dir = "../figures/"

# Human-readable tool names shown in the plot legends, keyed by tool identifier.
db_name = {
    "mm": "MMseqs",
    "fs": "Foldseek",
    "fs3di": "Foldseek (3Di)",
    "rs": "Reseek",
    "tm": "TM-align",
}

In [None]:
def accumulate_res_alignment_data_instance_level(file_paths):
    """Count alignment instances per fraction of correctly aligned residues.

    Every row of every input file is treated as one entry. Rows whose ``all``
    value is not positive are ignored; for the remaining rows the fraction
    ``correct / all`` is rounded to two decimals, and the number of rows per
    rounded fraction is accumulated over all files.

    Args:
        file_paths: Iterable of paths (anything ``pandas.read_csv`` accepts)
            to tab-separated files that contain at least the columns
            ``correct`` and ``all``.

    Returns:
        A DataFrame with the columns ``correctly_aligned_fraction`` and
        ``count`` (float), one row per distinct rounded fraction, ordered by
        ascending fraction. When no usable row is found the frame is empty
        but still has both columns.
    """
    column_names = ["correctly_aligned_fraction", "count"]
    fractions_per_file = []
    for path in file_paths:
        df = pd.read_csv(path, sep="\t")
        # Derive the fraction as a separate Series instead of assigning a
        # column onto the filtered frame, which raises SettingWithCopyWarning.
        usable = df[df["all"] > 0]
        fractions = (usable["correct"] / usable["all"]).round(2)
        if not fractions.empty:
            fractions_per_file.append(fractions)

    if not fractions_per_file:
        # Keep the output schema stable so downstream code can still sort on
        # "correctly_aligned_fraction".
        return pd.DataFrame({name: pd.Series(dtype=float) for name in column_names})

    all_fractions = pd.concat(fractions_per_file, ignore_index=True)
    # Grouping by the values themselves counts the rows per rounded fraction
    # (NaN fractions are dropped by groupby).
    counts = all_fractions.groupby(all_fractions).size()
    # Counts are returned as floats, matching what callers received before.
    return (
        counts.astype(float)
        .rename("count")
        .rename_axis("correctly_aligned_fraction")
        .reset_index()
    )

In [None]:
Note: I must have 3 levels of investigation: 
* row level
* instance level
* family level

In [None]:
def accumulate_res_alignment_data_family_level(file_paths):
    """Count input files per mean fraction of correctly aligned residues.

    Each input file contributes a single entry: the mean of ``correct / all``
    over its rows with a positive ``all`` value, rounded to two decimals.
    Files without any such row (mean is NaN) are skipped.

    Args:
        file_paths: Iterable of paths (anything ``pandas.read_csv`` accepts)
            to tab-separated files that contain at least the columns
            ``correct`` and ``all``.

    Returns:
        A DataFrame with the columns ``correctly_aligned_fraction`` and
        ``count`` (float), one row per distinct rounded mean, ordered by
        ascending fraction. When no file yields a mean the frame is empty
        but still has both columns.
    """
    file_means = []
    for path in file_paths:
        df = pd.read_csv(path, sep="\t")
        # Work on a derived Series rather than assigning a column onto the
        # filtered frame, which raises SettingWithCopyWarning.
        usable = df[df["all"] > 0]
        avg_res = round((usable["correct"] / usable["all"]).mean(), 2)
        if not np.isnan(avg_res):
            file_means.append(avg_res)

    means = pd.Series(file_means, dtype=float)
    # Grouping by the values themselves counts the files per rounded mean;
    # an empty Series simply yields an empty result with the same schema.
    counts = means.groupby(means).size()
    # Counts are returned as floats, matching what callers received before.
    return (
        counts.astype(float)
        .rename("count")
        .rename_axis("correctly_aligned_fraction")
        .reset_index()
    )

In [None]:
def get_cumulative_res_alignment(df):
    """Return ``df`` ordered by descending fraction with a cumulative share.

    The input must provide the columns ``correctly_aligned_fraction`` and
    ``count``. The result is a new frame, sorted from the highest to the
    lowest fraction, with an extra ``cum_frac`` column holding the running
    sum of ``count`` divided by the total of ``count``. The input frame is
    left unchanged.
    """
    ranked = df.sort_values(by="correctly_aligned_fraction", ascending=False)
    running_total = ranked["count"].cumsum()
    grand_total = ranked["count"].sum()
    return ranked.assign(cum_frac=running_total / grand_total)

In [None]:
def plot_cum_res_alignment_plots(cum_df, tool_name):
    """Draw one tool's cumulative alignment curve on the current axes.

    The x axis is the cumulative fraction of entries (``cum_frac``) and the
    y axis the fraction of correctly aligned residues
    (``correctly_aligned_fraction``). The area under the curve is computed
    with the trapezoidal rule and shown in the legend label.

    Args:
        cum_df: DataFrame with the columns ``correctly_aligned_fraction`` and
            ``cum_frac``.
        tool_name: Key into the module-level ``db_name`` mapping; the mapped
            display name is used in the legend label.
    """
    # Prepend the point (x=0, y=1) so that the curve, and therefore the AUC,
    # starts at y=1 for the smallest x values.
    y_axis = np.insert(cum_df["correctly_aligned_fraction"].to_numpy(), 0, 1)
    x_axis = np.insert(cum_df["cum_frac"].to_numpy(), 0, 0)
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on older NumPy versions.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    auc = integrate(y_axis, x_axis)
    plt.plot(x_axis, y_axis, label=f"{db_name[tool_name]} (AUC = {auc:.2f})")

## Residue-level alignment at the instance level (all residues)

In [None]:
# Residue subset analysed in this section.
res_type = "all"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, counting every row of every file as an entry.
cum_data_all_seed = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_instance_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the instance-level curves (all residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_all_seed.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()

## Residue-level alignment at the family level (all residues)

In [None]:
# Residue subset analysed in this section.
res_type = "all"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, with each file contributing its mean fraction.
cum_data_all_fam = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_family_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the family-level curves (all residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_all_fam.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()

## Residue-level alignment at the instance level (pocket residues)

In [None]:
# Residue subset analysed in this section.
res_type = "pocket"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, counting every row of every file as an entry.
cum_data_pocket_seed = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_instance_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the instance-level curves (pocket residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_pocket_seed.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()

## Residue-level alignment at the family level (pocket residues)

In [None]:
# Residue subset analysed in this section.
res_type = "pocket"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, with each file contributing its mean fraction.
cum_data_pocket_fam = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_family_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the family-level curves (pocket residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_pocket_fam.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()

## Residue-level alignment at the instance level (conserved residues)

In [None]:
# Residue subset analysed in this section.
res_type = "conserved"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, counting every row of every file as an entry.
cum_data_conserved_seed = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_instance_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the instance-level curves (conserved residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_conserved_seed.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()

## Residue-level alignment at the family level (conserved residues)

In [None]:
# Residue subset analysed in this section.
res_type = "conserved"

# Per-tool list of TSV count files for the chosen residue subset.
file_paths_dict = {
    tool: glob.glob(f"../tmp/intrafam_residue_alignment_counts/{tool}/{res_type}/*.tsv")
    for tool in tools
}

# Per-tool cumulative curve data, with each file contributing its mean fraction.
cum_data_conserved_fam = {
    tool: get_cumulative_res_alignment(
        accumulate_res_alignment_data_family_level(paths)
    )
    for tool, paths in file_paths_dict.items()
}

In [None]:
# Start a new high-resolution figure for the family-level curves (conserved residues).
plt.figure(dpi=300)

# One cumulative curve per tool; the legend label carries the tool's display
# name and its AUC.
for tool, cum_df in cum_data_conserved_fam.items():
    plot_cum_res_alignment_plots(cum_df, tool)

plt.xlabel("Fraction of entries")
plt.ylabel("Fraction of correctly aligned residues")
plt.legend()
# NOTE(review): saving is disabled. Every plotting cell in this notebook uses
# the same output name (res.png), so give each figure its own file name before
# re-enabling, otherwise the figures overwrite one another.
#plt.savefig(f"{fig_dir}/res.png")
#plt.show()