In [86]:
import pandas as pd, re, os, matplotlib.pyplot as plt, numpy as np, collections, itertools
from matplotlib import rcParams
# Render all matplotlib text with the "P052-Roman" font family.
rcParams['font.family'] = "P052-Roman"
# IPython-only setup: enable the autoreload extension (mode 2) and request SVG inline figures.
# NOTE(review): the `except SyntaxError` presumably exists for running this file outside IPython,
# but a syntax error in the magic lines would be raised while the file is compiled, before the
# `try` statement executes, so this handler likely never fires -- confirm and consider removing it.
try:
    # NOTE(review): the recorded cell output shows this line printing an "already loaded" notice
    # when the cell is re-run; that same output suggests `%reload_ext autoreload` instead.
    %load_ext autoreload
    %autoreload 2
    %config InlineBackend.figure_format = 'svg'
except SyntaxError:
    pass

# Change the working directory so the relative paths used by later cells resolve from here.
# NOTE(review): this is an absolute, user-specific path; the notebook will fail on any other
# machine unless the path is made configurable or derived from the repository location.
os.chdir("/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/discovery/nature_atlas/")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
# CSV of our kinase/site scores; later read with `pd.read_csv` and expected to provide the kinase
# and site gene-name / Uniprot-accession columns, "Site Location" and "Score".
# The path is relative to the working directory set by the `os.chdir` call above.
# NOTE(review): the backticks and "@" in the file name look like an encoded timestamp -- confirm.
OUR_RESULTS_FILE = "../../out/results_2023-02-22@22`44`25.2@+00`00.csv"
# CSV of the reference results we compare against; later read with `pd.read_csv` and indexed by
# its "Uniprot Primary Accession" column.
# NOTE(review): presumably a supplementary table of the published atlas -- record its provenance.
THEIR_RESULTS_FILE = "./41586_2022_5575_MOESM5_ESM.csv"

In [88]:
def correctly_sort_index(index):
    """Build sort keys for site labels of the form ``gene|accession|locations``.

    Intended for use as the ``key=`` callable of ``DataFrame.sort_index``.

    Parameters
    ----------
    index : iterable of str
        Labels with at least three ``|``-separated fields. The third field must be the
        text of a Python list literal whose items are strings made of a one-character
        prefix followed by an integer position, e.g. ``"['S10', 'T20']"``.

    Returns
    -------
    list of tuple
        One ``(gene, accession, mean_position)`` tuple per label, in input order, so that
        labels sort by gene, then accession, then numerically by mean site position
        instead of lexicographically by the location text.

    Raises
    ------
    ValueError, SyntaxError
        If the third field is not a plain Python literal.
    """
    import ast  # local import keeps this cell self-contained

    sort_keys = []
    for label in index:
        fields = label.split("|")  # split once instead of once per field
        # SECURITY: the labels originate from CSV files, so parse the list with
        # `ast.literal_eval` (literals only) rather than `eval`, which would execute
        # arbitrary expressions embedded in the data.
        locations = ast.literal_eval(fields[2])
        # Drop the leading one-character prefix and keep the numeric position.
        positions = [int(location[1:]) for location in locations]
        sort_keys.append((fields[0], fields[1], np.mean(positions)))
    return sort_keys

In [89]:
# Load our scores and label every kinase as "gene|accession" and every site as
# "gene|accession|location".
results = pd.read_csv(OUR_RESULTS_FILE)
results["KinSymbol"] = results["Kinase Gene Name"] + "|" + results["Kinase Uniprot Accession"]
results["SiteSymbol"] = (
    results["Site Gene Name"] + "|" + results["Site Uniprot Accession"] + "|" + results["Site Location"]
)
results = results[["KinSymbol", "SiteSymbol", "Score"]]

# Flat {(kinase, site): score} lookup.
kin_to_site_to_score = results.set_index(["KinSymbol", "SiteSymbol"]).to_dict()["Score"]

# Regroup into one "<kinase>^Score" column per kinase (in order of first appearance), each
# mapping site -> score. The suffixed keys are built directly rather than renamed afterwards.
orig_keys = list(results["KinSymbol"].unique())
cols = {kin + "^Score": {} for kin in orig_keys}
for (kin, site), score in kin_to_site_to_score.items():
    cols[kin + "^Score"][site] = score
derived_df = pd.DataFrame.from_dict(cols)  # rows: sites, columns: "<kinase>^Score"

# Rank the kinases within each site (row-wise); tied scores share the highest rank of the tie.
# NOTE(review): a site with no score for some kinase presumably yields a NaN rank, which
# `.astype(int)` cannot convert -- every kinase/site pair is expected to be present.
ranked = (
    derived_df.rank(axis=1, method="max")
    .rename(columns={c: str(c).replace("^Score", "^Rank") for c in derived_df.columns})
    .astype(int)
)

# Scores and ranks side by side with columns in sorted label order. The previous explicit
# Score/Rank interleaving was dropped: the column sort immediately overwrote that ordering.
our_df_to_compare = pd.concat([derived_df, ranked], axis=1).sort_index(axis="columns")

# Keep only the rank columns. Match the exact "^Rank" suffix rather than the substring "Rank",
# so a kinase label that happens to contain "Rank" cannot pull in its "^Score" column.
our_ranks = our_df_to_compare[[x for x in our_df_to_compare.columns if str(x).endswith("^Rank")]].copy()
# Order the sites by gene, accession and numeric position instead of raw label text.
our_ranks = our_ranks.sort_index(axis="index", key=correctly_sort_index)
our_ranks.to_csv("./our_ranks.csv")

In [None]:
# Load the reference results, indexed by Uniprot accession, and the lookup from the reference
# "Matrix_name" of each kinase to its "Uniprot id".
their_results = pd.read_csv(THEIR_RESULTS_FILE).set_index("Uniprot Primary Accession")
matrix_name_to_uniprot_id: dict[str, str] = (
    pd.read_csv("./41586_2022_5575_MOESM3_ESM.csv").set_index("Matrix_name").to_dict()["Uniprot id"]
)

# Group rows by their flanking sequence ("SITE_+/-7_AA"): for every sequence collect, in parallel
# lists, the phosphosite locations, the Uniprot accessions and the gene names of all rows sharing it.
# `list[str]` is used as the default factory; calling it produces an ordinary empty list.
their_site_flk_seqs_to_locs = collections.defaultdict(list[str])
their_site_flk_seqs_to_upids = collections.defaultdict(list[str])
their_site_flk_seqs_to_genes = collections.defaultdict(list[str])
for i, r in their_results.iterrows():
    # `i` is the row's index value, i.e. its "Uniprot Primary Accession".
    flk_seq = r["SITE_+/-7_AA"]  # flk_seq.split("|")
    their_site_flk_seqs_to_locs[flk_seq].append(r["Phosphosite"])
    their_site_flk_seqs_to_upids[flk_seq].append(str(i))  # AKA Uniprot ID
    # Rows without a gene name get a placeholder built from the accession.
    their_site_flk_seqs_to_genes[flk_seq].append(r["Gene"] if not pd.isna(r["Gene"]) else f"<UNK>UnipAc:{i}")

# Argsort each of the defaultdicts by the uniprot id
# The same permutation is applied to all three lists so they stay aligned entry by entry.
# NOTE(review): `np.argsort` is called with its default sort kind, so the relative order of
# entries with identical accessions is presumably not guaranteed -- confirm this cannot matter.
for flk_seq in their_site_flk_seqs_to_locs:
    argst = np.argsort(their_site_flk_seqs_to_upids[flk_seq])
    their_site_flk_seqs_to_locs[flk_seq] = [their_site_flk_seqs_to_locs[flk_seq][i] for i in argst]
    their_site_flk_seqs_to_upids[flk_seq] = [their_site_flk_seqs_to_upids[flk_seq][i] for i in argst]
    their_site_flk_seqs_to_genes[flk_seq] = [their_site_flk_seqs_to_genes[flk_seq][i] for i in argst]

# One label per flanking sequence, "genes|accessions|locations", where each field is the text
# form of the corresponding Python list (e.g. "['A', 'B']").
their_sites_to_symbols = {
    flk: f"""{their_site_flk_seqs_to_genes[flk]}|{their_site_flk_seqs_to_upids[flk]}|{their_site_flk_seqs_to_locs[flk]}"""
    for flk in their_site_flk_seqs_to_locs
}
# Replace the accession index with these labels; rows that share a flanking sequence receive
# the same label, so the new index can contain duplicates.
new_idx = [their_sites_to_symbols[r["SITE_+/-7_AA"]] for _, r in their_results.iterrows()]
their_results.index = pd.Index(new_idx)

# Map every "<matrix name>_rank" column to the label "['<matrix name>']|['<uniprot id>']^Rank".
# Only the trailing "_rank" is stripped (once per column): the previous `re.sub` removed every
# occurrence of "_rank", which would corrupt a matrix name containing that text elsewhere.
their_kins_to_symbols = {
    tk: f"['{matrix_name}']|['{matrix_name_to_uniprot_id[matrix_name]}']^Rank"
    for tk, matrix_name in ((col, col.removesuffix("_rank")) for col in map(str, their_results.columns))
    if tk.endswith("_rank")
}
# Keep only the renamed rank columns.
their_df_to_compare = their_results.rename(columns=their_kins_to_symbols)[list(their_kins_to_symbols.values())]
# Restrict to the kinases present in our results, in our column order; a kinase missing from the
# reference table raises KeyError here.
their_ranks = their_df_to_compare[our_ranks.columns].copy()
# Re-rank row-wise over just those kinases; tied values share the highest rank of the tie.
their_ranks = their_ranks.rank(axis=1, method="max").astype(int)
their_ranks = their_ranks.sort_index(axis="index")
their_ranks.to_csv("./their_ranks.csv")


In [None]:
def normalize_ranks(obj_to_norm, starting_ranks, ending_ranks):
    """Linearly rescale ranks from the range 1..starting_ranks onto 1..ending_ranks.

    Rank 1 maps to 1 and rank ``starting_ranks`` maps to ``ending_ranks``. ``obj_to_norm``
    can be anything that supports the arithmetic below (a number, an array, a DataFrame).
    ``starting_ranks`` must not be 1, since the result is divided by ``starting_ranks - 1``.
    """
    source_span = starting_ranks - 1
    scaled = obj_to_norm * (ending_ranks - 1) / source_span
    offset = (starting_ranks - ending_ranks) / source_span
    return scaled + offset

In [None]:
# Both rank tables must describe exactly the same set of site labels before they are compared.
# On failure, report how many labels are unique to each side (with one example) rather than
# raising a bare AssertionError.
our_site_symbols = set(our_ranks.index)
their_site_symbols = set(their_ranks.index)
assert our_site_symbols == their_site_symbols, (
    f"Site labels differ: {len(our_site_symbols - their_site_symbols)} only in our_ranks "
    f"(e.g. {next(iter(our_site_symbols - their_site_symbols), None)!r}), "
    f"{len(their_site_symbols - our_site_symbols)} only in their_ranks "
    f"(e.g. {next(iter(their_site_symbols - our_site_symbols), None)!r})"
)