# Adding 2 Columns of rank score for esm1b and alphamissense to compare those models
 - To facilitate comparison between scores, we added rank scores for most functional prediction scores and conservation scores, and replacing the  "converted" scores in the previous versions.

 - AlphaMissense_rankscore: AlphaMissense scores were ranked among all AlphaMissense scores in dbNSFP. The rankscore is the ratio of the rank of the AlphaMissense_score over the total number of scores in dbNSFP.
 - ESM1b_rankscore: ESM1b scores were firstly negated (i.e., -ESM1b_score), then ranked among all -ESM1b_score scores
in dbNSFP. The rankscore is the ratio of the rank of the -ESM1b_score over the total number of scores in dbNSFP.

Steps to do:

0. ESM1b - first negate scores -ESM1b LLR score
1. get all scores from esm1b and alphamissense
2. rank them 
3. divide over total number of scores 

In [5]:
import os
import pandas as pd
from scipy.stats import rankdata
import numpy as np
from glob import glob
from tqdm.notebook import tqdm

In [None]:
# Build AlphaMissense rank-score matrices, one CSV per protein.
#
# Pass 1 reads only the "am_pathogenicity" column of every input CSV and ranks
# all scores together (ties share their average rank). Each rank is divided by
# the total number of scores and rounded to 5 decimals.
# Pass 2 re-reads the same files in the same order, attaches the rank scores to
# the rows by position, and writes one matrix per "uniprot_id" whose rows are
# the values of "variation" (reindexed to `aa_list`) and whose columns are
# "<residue> <residue_position>" labels.

# Paths
input_dir = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/raw/AlphaMissense_csv"
output_dir = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/processed/AlphaMissense_rank_csv"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Glob once and sort: both passes must visit the files in the same order,
# because rank scores are matched back to rows purely by position.
csv_files = sorted(glob(os.path.join(input_dir, "*.csv")))

# Standard amino acids (excluding stop codon or non-standard residues);
# this is also the row order of every output matrix.
aa_list = [
    "A", "V", "L", "I", "M", "F", "W",  # Hydrophobic
    "S", "T", "N", "Q", "Y", "C",      # Polar uncharged
    "K", "R", "H",                     # Positively charged
    "D", "E",                          # Negatively charged
    "G", "P"                           # Special
]

# Columns read from each input CSV in the second pass
columns_to_keep = [
    "uniprot_id", "variation", "am_pathogenicity",
    "residue", "residue_position"]


# First pass: collect all scores.
# One float32 array per file is kept and concatenated afterwards; extending a
# Python list element by element would box every score as a separate object.
print("Collecting scores...")
score_chunks = []
for csv_file in tqdm(csv_files):
    df = pd.read_csv(csv_file, usecols=["am_pathogenicity"])
    score_chunks.append(df["am_pathogenicity"].astype("float32").to_numpy())

# np.concatenate raises on an empty list, so handle "no input files" explicitly.
if score_chunks:
    all_scores = np.concatenate(score_chunks)
else:
    all_scores = np.empty(0, dtype="float32")
del score_chunks  # release the per-file arrays before ranking

# Compute global rank
print("Computing global ranks...")
ranks = rankdata(all_scores, method="average")
normalized_ranks = np.round(ranks / len(all_scores), 5)

# Second pass: process file-by-file
print("Processing per-file and writing outputs...")
score_index = 0
for csv_file in tqdm(csv_files):
    df = pd.read_csv(csv_file, usecols=columns_to_keep)

    # Rows of this file occupy the next n_rows slots of the global rank array.
    n_rows = len(df)
    df["rank_score"] = normalized_ranks[score_index:score_index + n_rows]
    score_index += n_rows

    df["pos_label"] = df["residue"] + " " + df["residue_position"].astype(str)

    # Process each protein in this file
    for protein_id, group in df.groupby("uniprot_id"):
        pivot = group.pivot(index="variation", columns="pos_label", values="rank_score")
        pivot = pivot.reindex(aa_list)

        # Sort columns like "M 1", "R 2", ... by their numeric position.
        # Sorting stays best-effort, but a failure is reported instead of
        # being swallowed silently.
        try:
            ordered_columns = sorted(
                pivot.columns, key=lambda label: int(label.split()[1])
            )
        except (AttributeError, IndexError, TypeError, ValueError) as err:
            print(
                f"Warning: could not sort position columns for {protein_id} "
                f"({err}); keeping original column order."
            )
        else:
            pivot = pivot[ordered_columns]

        output_file = os.path.join(output_dir, f"{protein_id}_rank.csv")
        pivot.to_csv(output_file)

print("Done.")

In [None]:
import shutil

# Copy every ESM1b LLR CSV whose file name contains no "-" into a separate
# directory; files with a hyphen in the name are left out.

# Source directory with all ESM1b LLR CSV files
esm_pathway = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/raw/ALL_hum_isoforms_ESM1b_LLR"

# Destination directory for the files without "-" in their name
esm_no_isoform_pathway = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/processed/ESM1b_no_isoforms"
os.makedirs(esm_no_isoform_pathway, exist_ok=True)

# All CSV files in the source directory
all_csv_files = glob(os.path.join(esm_pathway, "*.csv"))

# Keep only paths whose base name has no hyphen
no_isoform_files = []
for candidate in all_csv_files:
    if "-" in os.path.basename(candidate):
        continue
    no_isoform_files.append(candidate)

# Copy each kept file under its original name
for src in tqdm(no_isoform_files, desc="Copying files without '-'"):
    shutil.copy(src, os.path.join(esm_no_isoform_pathway, os.path.basename(src)))

print("Done: copied all non-isoform files.")

In [None]:
# Now rank scores for the ESM1b pathogenicity (LLR) scores.
#
# Pass 1 loads every LLR matrix, keeps the entries that are neither NaN nor
# exactly 0, and ranks the negated values of all files together (ties share
# their average rank). Each rank is divided by the total number of ranked
# scores and rounded to 5 decimals.
# Pass 2 writes, for every input matrix, a matrix of the same shape holding the
# rank scores; entries that were excluded from ranking stay NaN.

# Paths
input_dir = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/processed/ESM1b_no_isoforms"
output_dir = "/Users/doma/Documents/Bachelor_Arbeit/Code/data/processed/ESM1b_rank_csv"
os.makedirs(output_dir, exist_ok=True)

# Canonical amino acids (row order of the output matrices)
aa_list = [
    "A", "V", "L", "I", "M", "F", "W",
    "S", "T", "N", "Q", "Y", "C",
    "K", "R", "H",
    "D", "E",
    "G", "P"
]

# First pass: collect all LLR values.
# One array per file is kept and concatenated afterwards; extending a Python
# list element by element would box every score as a separate object.
print("Collecting LLR scores...")
score_chunks = []
file_map = []  # (path, loaded matrix) pairs, reused by the second pass
csv_files = glob(os.path.join(input_dir, "*.csv"))

for file in tqdm(csv_files):
    df = pd.read_csv(file, index_col=0)
    flat_scores = df.values.flatten()
    # Exclude NaNs and zeros (zeros are presumably the wild-type entries --
    # TODO confirm against the LLR file format).
    valid_mask = (~np.isnan(flat_scores)) & (flat_scores != 0)
    score_chunks.append(flat_scores[valid_mask])
    file_map.append((file, df))

# Global ranking
print("Computing global ranks...")
# np.concatenate raises on an empty list, so handle "no input files" explicitly.
if score_chunks:
    all_scores = np.concatenate(score_chunks).astype("float32")
else:
    all_scores = np.empty(0, dtype="float32")
del score_chunks  # release the per-file arrays before ranking

# Rank the negated scores, so the most negative LLR gets the highest rank.
ranks = rankdata(-all_scores, method="average")
normalized_ranks = np.round(ranks / len(all_scores), 5)

# Second pass: rebuild each matrix with rank scores
print("Writing ranked matrices...")
score_index = 0

for file, df in tqdm(file_map):
    flat_llrs = df.values.flatten()
    flat_ranks = np.full_like(flat_llrs, np.nan, dtype="float32")

    # Same mask as in the first pass, so the valid entries of this file map
    # onto the next n_valid slots of the global rank array.
    valid_mask = (~np.isnan(flat_llrs)) & (flat_llrs != 0)
    n_valid = int(valid_mask.sum())
    flat_ranks[valid_mask] = normalized_ranks[score_index:score_index + n_valid]
    score_index += n_valid

    rank_matrix = pd.DataFrame(
        flat_ranks.reshape(df.shape),
        index=df.index,
        columns=df.columns
    )

    # Optional: reorder rows to standard AA list
    rank_matrix = rank_matrix.reindex(aa_list)

    # Build "<name>_rank.csv" from the stem only; str.replace would also
    # rewrite a ".csv" occurring earlier in the file name.
    stem, _ = os.path.splitext(os.path.basename(file))
    outpath = os.path.join(output_dir, f"{stem}_rank.csv")
    rank_matrix.to_csv(outpath, float_format="%.5f")

print("All ESM1b rank matrices saved.")

: 