In [1]:
# Imports and formatting

import pandas as pd, numpy as np, re, os, sys, collections, requests, asyncio, aiohttp, itertools, tqdm, time
from pprint import pprint
# NumPy array printing: 3 decimal places, 10 edge items per dimension, 180-character lines.
np.set_printoptions(precision=3, edgeitems=10, linewidth=180)
# pandas display limits so that wide or long frames are shown with less truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 180)
pd.set_option('display.min_rows', 50)
# Number of gene names combined into a single UniProt query URL (used when building gene_queries).
UNIPROT_REQUEST_SIZE = 400

In [2]:
# Load the phosphosite feature names and group the site labels by gene.
# Each feature name is split on "-": the first piece is used as the gene name and the
# second piece as a comma-separated list of site labels.
# NOTE(review): a gene name that itself contains "-" would be split in the wrong place — confirm the input format.
base_series: list[str] = pd.read_csv("phosphosites_base.txt", sep="\t")['feature_names'].tolist()

sites_by_gene = collections.defaultdict(set)
for feature_name in base_series:
    pieces = feature_name.split("-")
    sites_by_gene[pieces[0]].update(pieces[1].split(","))

# Final mapping: genes in sorted order, each with its unique site labels sorted.
all_phos_sites_to_loc = collections.OrderedDict(
    (gene, sorted(sites_by_gene[gene])) for gene in sorted(sites_by_gene)
)

In [None]:
# Build the UniProt REST query URLs used to fetch the sequence of each protein.
# Gene names are batched UNIPROT_REQUEST_SIZE at a time and OR-ed together in one query.

organism = "9606"  # NCBI taxonomy ID for human

query_part_1 = f"https://rest.uniprot.org/uniprotkb/stream?format=fasta&uncompressed=true&query=reviewed:true+AND+organism_id:{organism}+AND+"

# Materialize the key list once instead of rebuilding it for every chunk.
gene_names = list(all_phos_sites_to_loc.keys())
gene_queries = [
    query_part_1
    + "("
    + "+OR+".join(f"gene:{x}" for x in gene_names[i : i + UNIPROT_REQUEST_SIZE])
    + ")"
    for i in range(0, len(gene_names), UNIPROT_REQUEST_SIZE)
]


In [None]:
from async_timeout import timeout

MAX_TRIES = 4

async def get_url(url, session: aiohttp.ClientSession):
    """Fetch ``url`` through ``session`` and return the response body as text.

    Up to ``MAX_TRIES`` attempts are made. Any exception counts as a failed
    attempt, including the ``requests.HTTPError`` raised below for a non-200
    status. After a failed attempt the coroutine waits ``attempt * 10``
    seconds before trying again.

    Args:
        url: The URL to request.
        session: An open aiohttp client session used to issue the request.

    Returns:
        The response body decoded as text.

    Raises:
        RuntimeError: If every attempt fails.
    """
    # Explicit timeout object; passing a bare number to ``timeout`` is deprecated in aiohttp.
    request_timeout = aiohttp.ClientTimeout(total=10)
    for tries in range(1, MAX_TRIES + 1):
        try:
            print("About to request.")
            async with session.get(url, timeout=request_timeout) as r:
                if r.status != 200:
                    print("---- RESPONSE TEXT -------")
                    print(await r.text())
                    print("--------------------------")
                    raise requests.HTTPError(f"Request failed with status code {r.status}. Response text above.")
                return await r.text()

        except Exception as e:
            print(str(e.__class__.__name__) + ":", e)
            if tries < MAX_TRIES:
                delay = tries * 10
                print(f"Retrying in {delay} seconds.")
                # Actually wait for the announced back-off, without blocking the event loop.
                await asyncio.sleep(delay)
    raise RuntimeError(f"Failed to get URL after {MAX_TRIES} tries.")

In [None]:
MAX_CONNECTIONS = 1
SLEEP = 1

fasta_pages = []
async with aiohttp.ClientSession() as session:
    for i in tqdm.tqdm(range(0, len(gene_queries), MAX_CONNECTIONS)):
        if i != 0:
            time.sleep(SLEEP)
        fasta_page = await asyncio.gather(*[get_url(url, session) for url in gene_queries[i : i + MAX_CONNECTIONS]])
        fasta_pages += fasta_page

In [None]:
# Concatenate the downloaded response texts into one string so it can be parsed in a single pass.
all_fasta_string = "".join(fasta_pages)

In [None]:
# Extract per-record fields from the concatenated FASTA text.
# Each regex scans the whole string, so the four lists line up only if every record
# yields exactly one match per pattern; the assertion below guards that.
names = re.findall(r"GN=([^\s]+)", all_fasta_string)  # value following "GN="
sequences = [x.replace("\n", "") for x in re.findall(r">.*\n([^>]+)", all_fasta_string)]  # body lines, joined
ids = re.findall(r">.*?\|(.*?)\|", all_fasta_string)  # text between the first two "|" of a header
names_long = re.findall(r">.*?\|.*?\|(.*?) OS=", all_fasta_string)  # text between the second "|" and " OS="
# Check all four lists (the last operand previously repeated len(names) and skipped names_long).
assert len(names) == len(sequences) == len(ids) == len(names_long), "Lengths of fasta information are not equal."

sequences_table = pd.DataFrame({"Gene Name": names, "Sequence": sequences, "Uniprot ID": ids, "Name": names_long})
# "Symbol" is the gene name and the UniProt ID joined by "|".
sequences_table['Symbol'] = sequences_table['Gene Name'] + "|" + sequences_table['Uniprot ID']
sequences_table.to_csv("sequences_table.csv", index=False)

In [3]:
# Load the sequence table from the CSV file written by the FASTA-parsing cell
# (presumably so the download cells do not have to be re-run — confirm).
sequences_table = pd.read_csv("sequences_table.csv")

In [4]:
# Lookup tables keyed by gene name. If a gene name occurs more than once, the last row wins.
sequences_by_gene_name = sequences_table.set_index("Gene Name")
gene_name_to_uniprot = sequences_by_gene_name["Uniprot ID"].to_dict()
gene_name_to_sequence = sequences_by_gene_name["Sequence"].to_dict()

In [5]:
# Report every phosphosite gene for which no sequence was retrieved, numbering the warnings.
count = 0
for g in all_phos_sites_to_loc:
    if g in gene_name_to_sequence:
        continue
    count += 1
    print(f"{count}. WARNING: Gene {g} not found in reviewed Uniprot database.")



In [6]:
# Number of residues taken on each side of a site; each window is 2 * FLANKING_OFFSET + 1 long.
FLANKING_OFFSET = 7

# Maps "gene|uniprot_id" to the list of flanking windows, one per usable site of that gene.
symbol_to_flanking_sites = collections.defaultdict(list[str])
for gene_name in all_phos_sites_to_loc.keys():
    if gene_name not in gene_name_to_sequence:
        continue
    sequence = gene_name_to_sequence[gene_name]
    # Pad both ends with "X" once per protein so windows near either terminus keep full length.
    padded_sequence = "X" * FLANKING_OFFSET + sequence + "X" * FLANKING_OFFSET
    symbol = gene_name + "|" + gene_name_to_uniprot[gene_name]
    for site in all_phos_sites_to_loc[gene_name]:
        # A site label is one leading character followed by a 1-based position.
        position_text = site[1:]
        # isdecimal() accepts only strings that int() can parse; other labels are skipped.
        if not position_text.isdecimal():
            continue
        middle = int(position_text)
        # A position outside the sequence has no residue to centre on; previously this
        # yielded all-padding or wrong-length windows, so skip it and report instead.
        if not 1 <= middle <= len(sequence):
            print(
                f"WARNING: Site {site} of gene {gene_name} lies outside the sequence "
                f"(length {len(sequence)}); skipping."
            )
            continue
        # Residue `middle` sits at index middle - 1 + FLANKING_OFFSET of the padded string,
        # so the window centred on it starts at index middle - 1.
        window = padded_sequence[middle - 1 : middle + 2 * FLANKING_OFFSET]
        symbol_to_flanking_sites[symbol].append(window)

# Map each window to its index in the flattened list; for duplicate windows the last index is kept.
site_to_site_id = {s: i for i, s in enumerate(itertools.chain.from_iterable(symbol_to_flanking_sites.values()))}

In [86]:
# Collect the unique "lab|uniprot_id" symbols of the rows whose organism is HUMAN.
raw_kinase_table = pd.read_csv("/home/ubuntu/DeepKS/data/raw_data/raw_data_22588.csv")
human_kinase_rows = raw_kinase_table[raw_kinase_table['organism'] == 'HUMAN']
human_kinase_symbols = human_kinase_rows['lab'] + "|" + human_kinase_rows['uniprot_id']
relevant_kinase_symbols = human_kinase_symbols.unique().tolist()

In [None]:
# Look up the sequence of every relevant kinase and write the site windows and kinase
# sequences to text files, one entry per line.
ks = pd.read_csv("/home/ubuntu/DeepKS/data/raw_data/kinase_seq_826.txt")
kinase_symbols = ks['gene_name'] + "|" + ks['kinase']
kinase_symbol_to_kinase_sequence = dict(zip(kinase_symbols, ks['kinase_seq']))
# Raises KeyError if a relevant symbol has no entry in the kinase sequence file.
kinase_list = [kinase_symbol_to_kinase_sequence[symbol] for symbol in relevant_kinase_symbols]
site_list = site_to_site_id.keys()

with open("site_list.txt", "w") as site_file, open("kinase_list.txt", "w") as kinase_file:
    site_file.write("\n".join(site_list))
    kinase_file.write("\n".join(kinase_list))


# site_X_kinase = [(x[0], kinase_symbol_to_kinase_sequence[x[1]]) for x in site_X_kinase_symbol]
# site_list = [x[0] for x in site_X_kinase]
# kinase_list = [x[1] for x in site_X_kinase]