In [None]:
import json
import glob
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.interpolate import interp1d
import xlsxwriter
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_squared_error  
import seaborn as sns
import matplotlib.pyplot as plt

## Overall residue alignment

In [None]:
# Locations of the figure output and data directories, relative to base_dir.
base_dir = ".."
# Maps the short tool keys used in the performance data to the display names
# shown in plot legends and in the exported tables.
tool_name = {"fs3di": "Foldseek (3Di)", "fs": "Foldseek", "mm": "MMseqs", "rs": "Reseek", "tm": "TM-align", "hmmscan": "Hmmscan"}
fig_dir = f"{base_dir}/figures/"
data_dir = f"{base_dir}/data/"

# The plotting cells save PNGs into fig_dir; create it up front so savefig
# does not raise FileNotFoundError when the directory is missing.
os.makedirs(fig_dir, exist_ok=True)

# NOTE(security): unpickling can execute arbitrary code. Only load a pickle
# that was produced by this project's own pipeline.
with open(f"{data_dir}/processed/residue_alignment_performance.pkl", 'rb') as f:
    res_ali_data = pickle.load(f)

# One shared workbook: each plotting cell writes its AUC table to its own
# sheet, and the last cell of the notebook calls writer.close().
writer = pd.ExcelWriter(f'{data_dir}/processed/residue_alignment_auc_ci.xlsx', engine='xlsxwriter')

# Overall residue alignment

In [None]:
# Draw one curve per tool for the "overall" residue-alignment results and
# collect each tool's AUC with its lower/upper confidence bounds.
fig, ax = plt.subplots(dpi=300)

overall_res_data = res_ali_data["overall"]
ci_rows = []
for key, tool_data in overall_res_data.items():
    label = tool_name[key]
    auc = tool_data['auc']
    ax.plot(tool_data['x_axis'], tool_data['y_axis'], label=f"{label} (AUC = {auc:.2f})")
    ci_rows.append([label, auc, tool_data['lower_ci_auc'], tool_data['upper_ci_auc']])

ax.set_xlabel("Fraction of queries")
ax.set_ylabel("Fraction of correctly aligned residues")
ax.legend()
fig.savefig(f"{fig_dir}/overall_residue_alignment_split_vs_split.png")
plt.show()

# Table of AUCs, sorted ascending, with a fresh 0..n-1 index.
ci_df = (
    pd.DataFrame(ci_rows, columns=["Tool", "AUC", "Lower confidence interval", "Upper confidence interval"])
    .sort_values(by="AUC")
    .reset_index(drop=True)
)

# index=False: to_excel documents a boolean here; the previous None only
# suppressed the index column because it happens to be falsy.
ci_df.to_excel(writer, sheet_name="Overall residue alignment", index=False)

# Conserved residue alignment

In [None]:
# Draw one curve per tool for the "conserved" residue-alignment results and
# collect each tool's AUC with its lower/upper confidence bounds.
fig, ax = plt.subplots(dpi=300)

# Named after the subset it holds (the original reused "overall_res_data").
conserved_res_data = res_ali_data["conserved"]
ci_rows = []
for key, tool_data in conserved_res_data.items():
    label = tool_name[key]
    auc = tool_data['auc']
    ax.plot(tool_data['x_axis'], tool_data['y_axis'], label=f"{label} (AUC = {auc:.2f})")
    ci_rows.append([label, auc, tool_data['lower_ci_auc'], tool_data['upper_ci_auc']])

ax.set_xlabel("Fraction of queries")
ax.set_ylabel("Fraction of correctly aligned residues")
ax.legend()
fig.savefig(f"{fig_dir}/conserved_residue_alignment_split_vs_split.png")
plt.show()

# Table of AUCs, sorted ascending, with a fresh 0..n-1 index.
ci_df = (
    pd.DataFrame(ci_rows, columns=["Tool", "AUC", "Lower confidence interval", "Upper confidence interval"])
    .sort_values(by="AUC")
    .reset_index(drop=True)
)

# index=False: to_excel documents a boolean here; the previous None only
# suppressed the index column because it happens to be falsy.
ci_df.to_excel(writer, sheet_name="Conserved residue alignment", index=False)

# Predicted active sites residue alignment

In [None]:
# Draw one curve per tool for the "active_sites" residue-alignment results and
# collect each tool's AUC with its lower/upper confidence bounds.
fig, ax = plt.subplots(dpi=300)

# Named after the subset it holds (the original reused "overall_res_data").
active_sites_res_data = res_ali_data["active_sites"]
ci_rows = []
for key, tool_data in active_sites_res_data.items():
    label = tool_name[key]
    auc = tool_data['auc']
    ax.plot(tool_data['x_axis'], tool_data['y_axis'], label=f"{label} (AUC = {auc:.2f})")
    ci_rows.append([label, auc, tool_data['lower_ci_auc'], tool_data['upper_ci_auc']])

ax.set_xlabel("Fraction of queries")
ax.set_ylabel("Fraction of correctly aligned residues")
ax.legend()
fig.savefig(f"{fig_dir}/active_sites_residue_alignment_split_vs_split.png")
plt.show()

# Table of AUCs, sorted ascending, with a fresh 0..n-1 index.
ci_df = (
    pd.DataFrame(ci_rows, columns=["Tool", "AUC", "Lower confidence interval", "Upper confidence interval"])
    .sort_values(by="AUC")
    .reset_index(drop=True)
)

# index=False: to_excel documents a boolean here; the previous None only
# suppressed the index column because it happens to be falsy.
ci_df.to_excel(writer, sheet_name="Active site residue alignment", index=False)

# Residue alignment for predicted binding sites

In [None]:
# Draw one curve per tool for the "binding_sites" residue-alignment results
# and collect each tool's AUC with its lower/upper confidence bounds.
fig, ax = plt.subplots(dpi=300)

# Named after the subset it holds (the original reused "overall_res_data").
binding_sites_res_data = res_ali_data["binding_sites"]
ci_rows = []
for key, tool_data in binding_sites_res_data.items():
    label = tool_name[key]
    auc = tool_data['auc']
    ax.plot(tool_data['x_axis'], tool_data['y_axis'], label=f"{label} (AUC = {auc:.2f})")
    ci_rows.append([label, auc, tool_data['lower_ci_auc'], tool_data['upper_ci_auc']])

ax.set_xlabel("Fraction of queries")
ax.set_ylabel("Fraction of correctly aligned residues")
ax.legend()
fig.savefig(f"{fig_dir}/binding_sites_residue_alignment_split_vs_split.png")
plt.show()

# Table of AUCs, sorted ascending, with a fresh 0..n-1 index.
ci_df = (
    pd.DataFrame(ci_rows, columns=["Tool", "AUC", "Lower confidence interval", "Upper confidence interval"])
    .sort_values(by="AUC")
    .reset_index(drop=True)
)

# index=False: to_excel documents a boolean here; the previous None only
# suppressed the index column because it happens to be falsy.
ci_df.to_excel(writer, sheet_name="Binding site residue alignment", index=False)

A direct comparison of the previous plots shows that conserved residues are more likely to be aligned with each other correctly than background residues. One hypothesis is that the structures around the conserved sites are modeled with high accuracy, which can be judged from the pLDDT scores. In the next step, we compare the pLDDT of the conserved residues with that of the background.

# Residue alignment for seeds with low confidence conserved sites

In [None]:
# Draw one curve per tool for the "low_confidence_conserved_sites" results and
# collect each tool's AUC with its lower/upper confidence bounds.
fig, ax = plt.subplots(dpi=300)

# Named after the subset it holds (the original reused "overall_res_data").
low_conf_res_data = res_ali_data["low_confidence_conserved_sites"]
ci_rows = []
for key, tool_data in low_conf_res_data.items():
    label = tool_name[key]
    auc = tool_data['auc']
    ax.plot(tool_data['x_axis'], tool_data['y_axis'], label=f"{label} (AUC = {auc:.2f})")
    ci_rows.append([label, auc, tool_data['lower_ci_auc'], tool_data['upper_ci_auc']])

ax.set_xlabel("Fraction of queries")
ax.set_ylabel("Fraction of correctly aligned residues")
ax.legend()
fig.savefig(f"{fig_dir}/low_confidence_conserved_sites_residue_alignment_split_vs_split.png")
plt.show()

# Table of AUCs, sorted ascending, with a fresh 0..n-1 index.
ci_df = (
    pd.DataFrame(ci_rows, columns=["Tool", "AUC", "Lower confidence interval", "Upper confidence interval"])
    .sort_values(by="AUC")
    .reset_index(drop=True)
)

# index=False: to_excel documents a boolean here; the previous None only
# suppressed the index column because it happens to be falsy.
ci_df.to_excel(writer, sheet_name="Low conf cons res alignment", index=False)
writer.close()  # All CIs have been written