# Bias of Pathogenicity Scores Towards Specific Amino Acids and SASA

### Important: SASA - paper identified that pathogenicity scores are higher in buried aminoacids than in surface accessible a.a.

In [None]:
# Compute the average pathogenicity scores in both models for each amino acid type
     # 1. for N-out Proteome
     # 2. for Multispan proteome
     # 3. for Human proteome
     # 2.1,2,3 for their differences - In what amino acid types do the models differ?

# Look at hydrophobic amino acids 
     # N-out Proteome
     # Multispan proteome
     # Human proteome

# plot a 20x20 Heatmap with average rank scores per mutation, and then average score per amino acid reference type

In [1]:
# --- Project Setup ---
from setup_notebook import setup_project_root
setup_project_root()

from src.project_config import get_paths, get_paths_protein, get_aa_list
import pandas as pd
import os
from tqdm.notebook import tqdm
import numpy as np
from os import mkdir

In [6]:
# Human Proteome
# 1. Iterate over every file in both folders
# 2. For each file, read all the rank scores of each amino acid variant type (row)
# 3. iterate over each amino acid type 20x 
# 4. Compute the average rank score for each amino acid type


# intersection proteins for ESM and AM
# protein_ids = pd.read_csv(get_paths()["protein_ids_intersection"])   all human proteome
#protein_ids = pd.read_csv(get_paths()["processed"] / "N_out_proteome_cleaned.csv") # # N-out Proteome
protein_ids = pd.read_csv(get_paths()["processed"] / "Multispan_proteome_cleaned.csv")  # Multispan Proteome

aa_list = get_aa_list()

# Task 1: Store global averages per variant type
variant_global_avg = {}


# Initialize matrix with zeros; we'll fill it row-by-row
# Initialize final 20x20 matrices
esm_matrix = pd.DataFrame(0.0, index=aa_list, columns=aa_list)
am_matrix = pd.DataFrame(0.0, index=aa_list, columns=aa_list)


# Loop over each variant amino acid 
for variant in tqdm(aa_list, desc="Outer loop over variant AAs"):

    # Collect all raw scores for this variant (task 1)
    esm_all_variant_scores = []
    am_all_variant_scores = []

    # Task 2: collect per-reference scores
    esm_ref_scores_by_aa = {ref_aa: [] for ref_aa in aa_list}
    am_ref_scores_by_aa = {ref_aa: [] for ref_aa in aa_list}

    # Iterate over each protein in the intersection and collect all scores for one variant type (rows)
    for protein in tqdm(protein_ids["Entry"], desc=f"Inner loop over proteins. Variant: {variant}", leave=False):

        # Load data
        esm_file = pd.read_csv(get_paths_protein(protein)["esm_path"], index_col=0)
        am_file = pd.read_csv(get_paths_protein(protein)["am_path"], index_col=0)
        
        # Extract the row for the variant
        esm_variant_row = esm_file.loc[variant]
        am_variant_row = am_file.loc[variant]

        # Flatten and clean scores
        esm_values = esm_variant_row.dropna().values
        am_values = am_variant_row.dropna().values
        esm_all_variant_scores.extend(esm_values)
        am_all_variant_scores.extend(am_values)

        # Iterate over each position (column) to extract the reference AA
        for col in esm_variant_row.index:
            if pd.isna(esm_variant_row[col]):
                continue
            try:
                ref_aa = col.split()[0]  # e.g., "M 1" -> "M"
                if ref_aa in aa_list:
                    esm_ref_scores_by_aa[ref_aa].append(esm_variant_row[col])
            except Exception:
                continue

        for col in am_variant_row.index:
            if pd.isna(am_variant_row[col]):
                continue
            try:
                ref_aa = col.split()[0]
                if ref_aa in aa_list:
                    am_ref_scores_by_aa[ref_aa].append(am_variant_row[col])
            except Exception:
                continue

    # Compute average variant score and fill diagonal with 0
    esm_avg_scores = {
        ref: np.mean(scores) if ref != variant and scores else 0.0
        for ref, scores in esm_ref_scores_by_aa.items()
    }

    am_avg_scores = {
        ref: np.mean(scores) if ref != variant and scores else 0.0
        for ref, scores in am_ref_scores_by_aa.items()
    }

    # Fill matrices
    for ref_aa in aa_list:
        esm_matrix.loc[variant, ref_aa] = esm_avg_scores.get(ref_aa, 0.0)
        am_matrix.loc[variant, ref_aa] = am_avg_scores.get(ref_aa, 0.0)

# Optional: Save the matrices
#os.mkdir(get_paths()["processed"] / "5.6.Bias", exist_ok=True)
esm_matrix.to_csv(get_paths()["processed"] / "5.6.Bias" / "esm_20x20_matrix_multispan.csv")
am_matrix.to_csv(get_paths()["processed"] / "5.6.Bias" / "am_20x20_matrix_multispan.csv")
print("Saved ESM and AM 20x20 matrices.")

Outer loop over variant AAs:   0%|          | 0/20 [00:00<?, ?it/s]

Inner loop over proteins. Variant: A:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: V:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: L:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: I:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: M:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: F:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: W:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: S:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: T:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: N:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: Q:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: Y:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: C:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: K:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: R:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: H:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: D:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: E:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: G:   0%|          | 0/2822 [00:00<?, ?it/s]

Inner loop over proteins. Variant: P:   0%|          | 0/2822 [00:00<?, ?it/s]

Saved ESM and AM 20x20 matrices.
