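### This notebook will produce the following:
- `our_ranks.csv`
- `their_ranks.csv`
- `our_percentiles.csv`
- `their_percentiles.csv`
- `kinase_sites_ST_<sample size>.csv`
- `our_raw_scores.csv`
- `PSP_site_list.csv`
- HIPK2-specific files: `site_list_hipk2.csv`, `kin_list_hipk2.csv`, `kin-info_hipk2.json`, `hipk2_sites_for_kin_lib_<number of sites>.txt`, and `raw_data_<number of rows>_formatted_65.csv` (written to the DeepKS `data/` directory)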

#### Data Science Imports

In [18]:
import pandas as pd, re, os, matplotlib.pyplot as plt, numpy as np, collections, itertools, tqdm, random, textwrap as tw, json
from matplotlib.markers import MarkerStyle
from matplotlib import rcParams
# Render all matplotlib text in the "P052" font family.
rcParams['font.family'] = "P052"
# IPython-only setup: reload edited modules automatically and emit inline figures as SVG.
# NOTE(review): a SyntaxError caused by the `%` magics would be raised when the cell is
# compiled, i.e. before this `try` executes, so the handler presumably never runs; the
# leading `pass` presumably keeps the `try` body non-empty if the magics are stripped — confirm.
try:
    pass
    %load_ext autoreload
    %autoreload 2
    %config InlineBackend.figure_format = 'svg'
except SyntaxError:
    pass

# Every relative "./" path used later in the notebook resolves against this directory.
# NOTE(review): this path and OUR_RESULTS_FILE are absolute, machine-specific locations.
os.chdir("/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/discovery/nature_atlas/")
# CSV with our predictions; read by the "Preparing PSP Results" cell.
OUR_RESULTS_FILE = "/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/out/results_2023-03-13@23`32`26.0@+00`00.csv"
# Atlas results CSV (relative to the directory set above); read by the "Preparing Atlas Results" cell.
THEIR_RESULTS_FILE = "./41586_2022_5575_MOESM5_ESM.csv"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Defining Functions

In [6]:
def correctly_sort_index(index):
    """Build sort keys for site-symbol index labels.

    Each label is expected to look like ``"<genes>|<accessions>|<locations>"``,
    where ``<locations>`` is the text of a Python list literal of strings such
    as ``"['S10', 'T20']"``.

    Parameters
    ----------
    index : iterable of str
        The labels to build keys for (e.g. the ``Index`` pandas passes to a
        ``sort_index(key=...)`` callable).

    Returns
    -------
    list of tuple
        One ``(genes, accessions, mean_position)`` tuple per label, in input
        order. ``mean_position`` is the mean of the integers obtained by
        dropping the first character of every location string, so labels sort
        numerically by position rather than lexicographically.

    Raises
    ------
    ValueError, SyntaxError
        If the third field is not a plain Python literal.
    IndexError
        If a label has fewer than three ``|``-separated fields.
    """
    # Local import: the notebook's import cell does not bring in `ast`.
    import ast

    sort_keys = []
    for label in index:
        fields = label.split("|")
        # The labels originate from CSV files, so parse the list literal with
        # literal_eval instead of eval: it accepts literals only and cannot
        # execute arbitrary expressions embedded in the data.
        locations = ast.literal_eval(fields[2])
        mean_position = np.mean([int(loc[1:]) for loc in locations])
        sort_keys.append((fields[0], fields[1], mean_position))
    return sort_keys

def efficient_get_pctl(arr, round_to=1):
    """Return, for every element, the percentage of elements strictly smaller than it.

    Parameters
    ----------
    arr : 1-D array-like
        Values to convert (list, tuple or ``np.ndarray``).
    round_to : int, default 1
        Number of decimals passed to ``np.round``.

    Returns
    -------
    np.ndarray
        Float array with the same length and order as ``arr``. Equal values
        share one percentile; the smallest value always gets ``0.0``.

    Notes
    -----
    NaN never compares equal to anything, so every NaN forms its own group.
    ``np.argsort`` places NaN after all numbers, which means NaN entries end
    up with the highest percentiles.
    """
    arr = np.asarray(arr)  # accept lists/tuples as well as ndarrays
    tot_len = len(arr)
    if tot_len == 0:
        return np.round(np.array([]), round_to)

    sorted_order = np.argsort(arr)
    arr_sorted = arr[sorted_order]

    # True at every sorted position that starts a new run of equal values.
    starts_run = np.ones(tot_len, dtype=bool)
    starts_run[1:] = arr_sorted[1:] != arr_sorted[:-1]

    # The index at which an element's run starts equals the number of
    # elements strictly smaller than it; propagate that index along the run.
    num_smaller = np.maximum.accumulate(np.where(starts_run, np.arange(tot_len), 0))
    pctl_sorted = num_smaller * 100 / tot_len

    # argsort of the sort permutation is its inverse: it restores input order.
    return np.round(pctl_sorted[np.argsort(sorted_order)], round_to)

def efficient_get_pctl_2d(arr, round_to=1):
    """Apply ``efficient_get_pctl`` to every column of a 2-D array.

    ``arr`` is transposed so that each column is processed on its own; the
    per-column results are stacked and transposed back, so the output has the
    same layout as the input.
    """
    per_column = [efficient_get_pctl(column, round_to) for column in arr.T]
    return np.asarray(per_column).T

#### Preparing PSP Results

In [16]:
# Load our predictions and reduce them to (kinase label, site label, score) triples.
results = pd.read_csv(OUR_RESULTS_FILE)
# Kinase label: "<gene name>|<accession>"; site label: "<gene name>|<accession>|<location>".
results["KinSymbol"] = results["Kinase Gene Name"] + "|" + results["Kinase Uniprot Accession"]
results["SiteSymbol"] = (
    results["Site Gene Name"] + "|" + results["Site Uniprot Accession"] + "|" + results["Site Location"]
)
results = results[["KinSymbol", "SiteSymbol", "Score"]]
# Flat mapping (KinSymbol, SiteSymbol) -> Score.
kin_to_site_to_score = results.set_index(["KinSymbol", "SiteSymbol"]).to_dict()["Score"]
# Regroup into one inner dict per kinase: cols[kin][site] = score.
cols = {x: {} for x in results["KinSymbol"].unique()}
for (kin, site), score in kin_to_site_to_score.items():
    cols[kin][site] = score
# Suffix every kinase key with "^Score" so score columns can be told apart from the
# "^Pctl" columns added below.
orig_keys = list(cols.keys())
for c in orig_keys:
    cols[c + "^Score"] = cols.pop(c)
# One "<KinSymbol>^Score" column per kinase, one row per site label.
derived_df = pd.DataFrame.from_dict(cols)
# Percentile of every score within its own column (i.e. per kinase, across sites).
# NOTE(review): a site missing for some kinase is NaN in `derived_df`; efficient_get_pctl
# appears to give NaN entries the highest percentiles — confirm that is acceptable.
percentiled: np.ndarray = efficient_get_pctl_2d(derived_df.values)
# Append a "<KinSymbol>^Score^Pctl" column for each score column; get_loc(col) is the score
# column's position, which is also its column in `percentiled`.
for col in derived_df.columns:
    assert isinstance(col, str)
    derived_df[col + "^Pctl"] = percentiled[:, derived_df.columns.get_loc(col)]
# Order rows by (gene field, accession field, mean site position) instead of plain text order.
derived_df.sort_index(axis='index', key=correctly_sort_index, inplace=True)
# Raw scores = columns whose name contains "Score" but not "Pctl".
our_raw_scores = derived_df[[x for x in derived_df.columns if "Score" in str(x) and "Pctl" not in str(x)]].copy()
our_raw_scores.to_csv("./our_raw_scores.csv")
# From here on `derived_df` holds only the "...^Score^Pctl" percentile columns.
derived_df = derived_df.drop(columns=[col for col in derived_df.columns if "^Pctl" not in str(col)], inplace=False)
# Rank the kinases within each row (axis=1) by percentile; ties share the lowest rank.
# Renaming "^Score" -> "^Rank" yields columns named "<KinSymbol>^Rank^Pctl".
# NOTE(review): rank() is ascending by default, so the kinase with the LOWEST percentile
# in a row receives rank 1 — confirm this is the intended direction.
ranked = (
    derived_df.rank(axis=1, method="min")
    .rename(columns={c: str(c).replace("^Score", "^Rank") for c in derived_df.columns})
    .astype(int)
)
# Percentile and rank columns side by side, interleaved per kinase (the first half of the
# concatenated columns are the percentile columns).
our_df_to_compare = pd.concat([derived_df, ranked], axis=1)
our_df_to_compare = our_df_to_compare[
    list(
        itertools.chain(
            *[
                [base_c, str(base_c).replace("^Score", "^Rank")]
                for base_c in our_df_to_compare.columns[: len(our_df_to_compare.columns) // 2]
            ]
        )
    )
]
# NOTE(review): sorting the columns here replaces the interleaved order built just above.
our_df_to_compare = our_df_to_compare.sort_index(axis='columns')
# Rank-only view; its column names and row labels are what the Atlas cell aligns against.
our_ranks = our_df_to_compare[[x for x in our_df_to_compare.columns if "Rank" in str(x)]].copy()
# Write percentiles and ranks, each with rows ordered by correctly_sort_index.
derived_df.sort_index(axis='index', key=correctly_sort_index, inplace=True)
derived_df.to_csv("./our_percentiles.csv")
our_ranks.sort_index(axis='index', key=correctly_sort_index, inplace=True)
our_ranks.to_csv("./our_ranks.csv")


#### Preparing Atlas Results

In [17]:
# Atlas results, indexed by the "Uniprot Primary Accession" column.
their_results = pd.read_csv(THEIR_RESULTS_FILE).set_index("Uniprot Primary Accession")
# Lookup from the Atlas "Matrix_name" of a kinase to its "Uniprot id".
matrix_name_to_uniprot_id: dict[str, str] = (
    pd.read_csv("./41586_2022_5575_MOESM3_ESM.csv").set_index("Matrix_name").to_dict()["Uniprot id"]
)

# Group the Atlas rows by flanking sequence: for each sequence collect, in parallel lists,
# the phosphosite, the accession (row index) and the gene of every row that carries it.
their_site_flk_seqs_to_locs = collections.defaultdict(list[str])
their_site_flk_seqs_to_upids = collections.defaultdict(list[str])
their_site_flk_seqs_to_genes = collections.defaultdict(list[str])
for i, r in their_results.iterrows():
    flk_seq = r["SITE_+/-7_AA"]  # flk_seq.split("|")
    their_site_flk_seqs_to_locs[flk_seq].append(r["Phosphosite"])
    their_site_flk_seqs_to_upids[flk_seq].append(str(i))  # AKA Uniprot ID
    # Rows without a gene name get the placeholder "?UnipAc:<accession>".
    their_site_flk_seqs_to_genes[flk_seq].append(r["Gene"] if not pd.isna(r["Gene"]) else f"?UnipAc:{i}")

# Argsort each of the defaultdicts by the uniprot id
# (the three lists are reordered together so they stay aligned and the label is deterministic).
for flk_seq in their_site_flk_seqs_to_locs:
    argst = np.argsort(their_site_flk_seqs_to_upids[flk_seq])
    their_site_flk_seqs_to_locs[flk_seq] = [their_site_flk_seqs_to_locs[flk_seq][i] for i in argst]
    their_site_flk_seqs_to_upids[flk_seq] = [their_site_flk_seqs_to_upids[flk_seq][i] for i in argst]
    their_site_flk_seqs_to_genes[flk_seq] = [their_site_flk_seqs_to_genes[flk_seq][i] for i in argst]

# Site label per flanking sequence: "<genes list>|<accessions list>|<locations list>", each
# part being the Python repr of a list — the layout correctly_sort_index splits and parses.
their_sites_to_symbols = {
    flk: f"""{their_site_flk_seqs_to_genes[flk]}|{their_site_flk_seqs_to_upids[flk]}|{their_site_flk_seqs_to_locs[flk]}"""
    for flk in their_site_flk_seqs_to_locs
}

# Re-label every row with its site label; rows sharing a flanking sequence now share a
# label, which is why duplicated index entries are dropped further down.
new_idx = [their_sites_to_symbols[r["SITE_+/-7_AA"]] for _, r in their_results.iterrows()]
their_results.index = pd.Index(new_idx)

# Column renames: "<matrix>_rank" -> "['<matrix>']|['<uniprot id>']^Rank^Pctl" and
# "<matrix>_percentile" -> "...^Pctl^Pctl" (columns containing "median" are skipped).
# These names are used below to select columns by `our_ranks.columns`, so they must match
# the names produced by the PSP cell.
their_kins_to_symbols = {
    tk: f"['{re.sub(r'_rank', r'', tk)}']|['{matrix_name_to_uniprot_id[re.sub(r'_rank', r'', tk)]}']^Rank^Pctl"
    for tk in [str(x) for x in their_results.columns]
    if tk.endswith("_rank")
}
their_kins_to_symbols_pctls = {
    tk: f"['{re.sub(r'_percentile', r'', tk)}']|['{matrix_name_to_uniprot_id[re.sub(r'_percentile', r'', tk)]}']^Pctl^Pctl"
    for tk in [str(x) for x in their_results.columns]
    if tk.endswith("_percentile") and "median" not in tk
}

# Keep only the renamed rank / percentile columns.
their_df_to_compare = their_results.rename(columns=their_kins_to_symbols)[list(their_kins_to_symbols.values())]
their_df_percentiles = their_results.rename(columns=their_kins_to_symbols_pctls)[list(their_kins_to_symbols_pctls.values())]
# Restrict the percentiles to the kinases present in `our_ranks` (same column order), drop
# repeated site labels (first occurrence kept), order the rows and write them out.
their_df_percentiles = their_df_percentiles[[str(x).replace("^Rank", "^Pctl") for x in our_ranks.columns]]
their_df_percentiles = their_df_percentiles[~their_df_percentiles.index.duplicated()]
their_df_percentiles.sort_index(axis="index", key=correctly_sort_index, inplace=True)
their_df_percentiles.to_csv("./their_percentiles.csv")

# Atlas ranks for the same kinases, re-ranked within each row (axis=1) over just those
# columns; tied values share the highest rank of their group.
their_ranks = their_df_to_compare[our_ranks.columns].copy()
their_ranks = their_ranks.rank(axis=1, method="max").astype(int)
their_ranks = their_ranks[~their_ranks.index.duplicated()] # TODO: handle repeats better
# Align the rows with ours; .loc raises KeyError if one of our site labels is absent here.
their_ranks = their_ranks.loc[our_ranks.index]
assert set(our_ranks.index) == set(their_ranks.index), "The set of indices (symbols) are not the same between our ranks and their ranks"
# Temporarily expose the index as a column so drop_duplicates only removes rows that repeat
# in both label and values, then remove the helper column again.
their_ranks.insert(0, column='fake_idx', value = their_ranks.index, allow_duplicates=True)
their_ranks = their_ranks.drop_duplicates(keep='first').drop(columns=['fake_idx'])
their_ranks.sort_index(axis="index", key=correctly_sort_index, inplace=True)
their_ranks.to_csv("./their_ranks.csv")

#### Obtain sample of PSP sites that go along with Atlas kinases (S/T)

In [5]:
# Number of S/T sites to draw for the Atlas comparison.
SAMPLE_SIZE = 150
# PhosphoSitePlus download; parsed exactly once below (it used to be read twice).
PSP_DOWNLOAD_FILE = "/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/data/raw_data/PSP_script_download.xlsx"

psp = pd.read_excel(PSP_DOWNLOAD_FILE)[["SITE_+/-7_AA", "KIN_ACC_ID"]]

# Upper-case every flanking sequence and insert "*" after its 8th character.
psp_marked_sites = [re.sub(r"^(.{8})", r"\1*", x.upper()) for x in psp["SITE_+/-7_AA"]]
pd.Series(psp_marked_sites).to_csv("./PSP_site_list.csv", header=False, index=False)

# Kinase accession -> marked sites listed for that kinase in the download.
kin_uniprot_to_sites = collections.defaultdict(list[str])
for kin_acc, marked_site in zip(psp["KIN_ACC_ID"], psp_marked_sites):
    kin_uniprot_to_sites[kin_acc].append(marked_site)

# Uniprot ids of the kinases that have a "<matrix name>_rank" column in the Atlas results.
mod_cols = set([matrix_name_to_uniprot_id[re.sub(r"^([0-9A-Z]+)_rank.*", r"\1", x)] for x in their_results.columns if x.endswith("_rank")])

# Drop kinases the Atlas does not cover; iterate over a copy's keys because the dict is
# modified inside the loop.
kin_uniprot_to_sites_keys = kin_uniprot_to_sites.copy().keys()
for kin in kin_uniprot_to_sites_keys:
    if kin not in mod_cols:
        kin_uniprot_to_sites.pop(kin)

# Invert to site -> kinases, skipping sites whose starred (8th) residue is Y.
site_to_kin_uniprots = collections.defaultdict(list[str])
for kin, sites in kin_uniprot_to_sites.items():
    for site in sites:
        if "Y*" not in site:
            site_to_kin_uniprots[site].append(kin)

# Reproducible sample: fixed seed and a sorted population. random.sample raises ValueError
# if fewer than SAMPLE_SIZE sites are available.
random.seed(42)
k = SAMPLE_SIZE
rs = random.sample(sorted(site_to_kin_uniprots.keys()), k=k)
kinase_sites_file = f"./kinase_sites_ST_{k}.csv"
with open(kinase_sites_file, "w") as f:
    for s in rs:
        f.write(f"{s}\n")

# Read the file back and verify that no Y-centred site was written.
with open(kinase_sites_file, "r") as f:
    rs = [x.strip() for x in f.readlines()]
    assert all(["Y*" not in x for x in rs])

In [73]:
# Load raw_data_45176_formatted_65.csv (absolute, machine-specific path).
formatted_df = pd.read_csv('/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/data/raw_data_45176_formatted_65.csv')

In [76]:
# Keep only the rows whose "Original Kinase Gene Name" is exactly "HIPK2|Q9H2X6".
hipk2_rows = formatted_df[formatted_df["Original Kinase Gene Name"] == "HIPK2|Q9H2X6"]

In [77]:
# Show the selected rows as the cell output.
hipk2_rows

Unnamed: 0,Original Kinase Gene Name,lab_name,Kinase Gene Name (possibly deranged),seq,class,num_seqs
8931,HIPK2|Q9H2X6,HIPK2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,AAERPGGTPTPVIAY,1,68
8932,HIPK2|Q9H2X6,HIPK2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,AAPPAPPTPPPPTLP,1,68
8933,HIPK2|Q9H2X6,HIPK2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,ADREAASSPAGEPLR,1,68
8934,HIPK2|Q9H2X6,HIPK2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,AHPPHAPSPGQTVKP,1,68
8935,HIPK2|Q9H2X6,HIPK2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,AHPSQAPSPNQPTKH,1,68
...,...,...,...,...,...,...
40319,HIPK2|Q9H2X6,PRKCA,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,KEEDKKRSSGTPSGG,0,68
43001,HIPK2|Q9H2X6,RPS6KB1,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,RCAMVHSSPACSTSV,0,68
43032,HIPK2|Q9H2X6,RPS6KB2,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,AHPPHAPSPGQTVKP,0,68
44352,HIPK2|Q9H2X6,STK4,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,SDSGTCQSPSLEEPM,0,68


In [41]:
# Save the selection; the number of selected rows is embedded in the file name.
# NOTE(review): to_csv writes the index by default, which shows up as an extra unnamed
# leading column when the file is read back — confirm that is wanted.
hipk2_rows.to_csv(f"/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/data/raw_data_{len(hipk2_rows)}_formatted_65.csv")

In [70]:
# Write the "seq" column (one value per line, no header) and the distinct kinase sequences.
# NOTE(review): this cell reads "seq" while the cell that builds `sites_for_atlas` reads
# "Site Sequence" — confirm which column name the loaded table actually has.
hipk2_rows["seq"].to_csv("./site_list_hipk2.csv", header=False, index=False)
hipk2_kin_seqs = hipk2_rows["Kinase Sequence"].drop_duplicates(keep="first")
hipk2_kin_seqs.to_csv("./kin_list_hipk2.csv", header=False, index=False)

# Lookup from the "Uniprot" column to the "Group" column of the kinase table.
kin_to_grp_table = pd.read_csv("/Users/druc594/Library/CloudStorage/OneDrive-PNNL/Desktop/DeepKS_/DeepKS/data/preprocessing/kin_to_fam_to_grp_826.csv")
kin_symb_to_grp = kin_to_grp_table.set_index("Uniprot").to_dict()['Group']

# One record per distinct kinase sequence (first occurrence kept), keyed by that sequence.
hipk2_by_kin_seq = hipk2_rows.set_index("Kinase Sequence")
hipk2_unique_kin_records = (
    hipk2_by_kin_seq[~hipk2_by_kin_seq.index.duplicated(keep="first")]
    .rename(columns={"Original Kinase Gene Name": "Gene Name"}, inplace=False)
    .to_dict(orient="index")
)

# The "Gene Name" value is "<gene>|<accession>"; split it once and look the group up by
# accession. Every value is wrapped in a single-element list.
to_add_group = {
    kin_seq: {
        "Gene Name": [symbol_parts[0]],
        "Uniprot Accession ID": [symbol_parts[1]],
        "Known Group": [kin_symb_to_grp[symbol_parts[1]]],
    }
    for kin_seq, symbol_parts in (
        (seq, record["Gene Name"].split("|")) for seq, record in hipk2_unique_kin_records.items()
    )
}

with open("kin-info_hipk2.json", "w") as fp:
    json.dump(to_add_group, fp, indent=3)

In [32]:
# Insert "*" after the 8th character of every site sequence and turn "X" into "_".
starred_sites = [
    (site[:8] + "*" + site[8:]).replace("X", "_") for site in hipk2_rows['Site Sequence']
]
sites_for_atlas = list(set(starred_sites))
# Positions (in the de-duplicated list) of sites whose starred residue is Y.
to_del = {idx for idx, site in enumerate(sites_for_atlas) if "Y*" in site}
# Drop those positions and return the remaining sites in sorted order.
sites_for_atlas = sorted(site for idx, site in enumerate(sites_for_atlas) if idx not in to_del)
assert not any("Y*" in site for site in sites_for_atlas)

In [34]:
# One site per line, no trailing newline; the site count is part of the file name.
hipk2_sites_file = f"hipk2_sites_for_kin_lib_{len(sites_for_atlas)}.txt"
with open(hipk2_sites_file, "w") as f:
    f.write("\n".join(sites_for_atlas))