In [1]:
import json
from typing import Dict
from urllib.request import urlopen 
import regex as re

In [2]:
# Helpers: read JSON from a URL or a local file, and strip PubMed citations from text

def read_json(url :str):
    """Fetch a URL and parse the response body as JSON.

    Args:
        url: Address passed straight to ``urllib.request.urlopen``.

    Returns:
        The decoded JSON document, as produced by ``json.loads``.
    """
    # Use the response as a context manager so the underlying connection is
    # released even if reading or JSON decoding raises.
    with urlopen(url) as response:
        return json.loads(response.read())


def read_json_to_list_of_proteins(path : str):
    """Load a JSON file and return the value stored under its "results" key.

    Example usage:
        proteins = read_json_to_list_of_proteins("uniprotkb_AND_reviewed_true_2023_10_30.json")

    Args:
        path: Path to a JSON file whose top-level object has a "results" key.

    Returns:
        The "results" value (presumably a list of protein entries, going by
        the function name -- confirm against the export format).

    Raises:
        KeyError: If the document has no "results" key.
    """
    # Close the file deterministically and decode as UTF-8 instead of relying
    # on the platform's locale encoding.
    with open(path, encoding="utf-8") as handle:
        uniprot_json = json.load(handle)
    return uniprot_json["results"]


def remove_pubchem_med_from_text(string : str):
    """Strip parenthesised PubMed citation groups from a text.

    Matches groups such as ``(PubMed:123)`` or ``(PubMed:123, PubMed:456)``.
    Whitespace directly in front of the group is removed as well, so that
    ``"foo (PubMed:1)."`` becomes ``"foo."`` rather than ``"foo ."``.

    Args:
        string: Text that may contain PubMed citation groups.

    Returns:
        The text with every citation group removed.
    """
    # Leading \s* swallows the separator space that would otherwise be left
    # dangling before the following punctuation.
    pattern = r'\s*\(PubMed:\d+(?:,\s*PubMed:\d+)*\)'
    return re.sub(pattern, '', string)

In [3]:
# Fetch one UniProtKB entry over the REST API and display its sequence string.
EXAMPLE_ENTRY_URL = "https://rest.uniprot.org/uniprotkb/A0A009IHW8.json"
example = read_json(EXAMPLE_ENTRY_URL)
example["sequence"]["value"]

'MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR'

### Example

In [12]:
# Comment types of interest. Every entry needs its own comma: Python silently
# concatenates adjacent string literals ("FUNCTION" "DOMAIN" -> "FUNCTIONDOMAIN").
CONSIDERED_COMMENTS = [
    "CATALYTIC ACTIVITY",
    "FUNCTION",
    "DOMAIN",
]


def extract_catalyzed_reactions(json_dict : Dict):
    """Collect function descriptions and catalyzed reactions from one entry.

    Args:
        json_dict: A single protein entry. Its optional "comments" list is
            scanned for items whose "commentType" is "FUNCTION" or
            "CATALYTIC ACTIVITY".

    Returns:
        A ``(sentences, reactions)`` tuple of lists, each de-duplicated and
        kept in first-seen order:
          * sentences: every "value" found under a FUNCTION comment's
            "texts", passed through ``remove_pubchem_med_from_text``.
          * reactions: the "reaction" -> "name" of each CATALYTIC ACTIVITY
            comment, with "=" replaced by "->".
    """
    sentences = []
    reactions = []

    # Entries without a "comments" key simply yield two empty lists.
    for comment in json_dict.get("comments", []):
        comment_type = comment.get("commentType")
        if comment_type == "FUNCTION":
            # Walk this comment's own texts; the comment's position inside
            # the entry says nothing about which text to read.
            for text in comment.get("texts", []):
                sentences.append(remove_pubchem_med_from_text(text["value"]))
        elif comment_type == "CATALYTIC ACTIVITY":
            reaction_name = comment.get("reaction", {}).get("name")
            if reaction_name is None:
                # No named reaction on this comment: nothing to record.
                continue
            ### Replace = with -> for latex
            reactions.append(reaction_name.replace("=", "->"))

    # dict.fromkeys drops duplicates while keeping insertion order, so the
    # output is reproducible across runs (iteration order of a set of str
    # is not).
    return list(dict.fromkeys(sentences)), list(dict.fromkeys(reactions))


def extract_binding_sites(json_dict : Dict):
    """Return the start positions of all "Binding site" features of an entry.

    Args:
        json_dict: A single protein entry. Its optional "features" list is
            filtered for items whose "type" is "Binding site".

    Returns:
        A list with the "location" -> "start" -> "value" of each matching
        feature, in the order they appear; empty if there are no features.
    """
    # Read from the argument, not from a notebook-level variable, so that
    # each entry reports its own binding sites.
    return [
        feature["location"]["start"]["value"]
        for feature in json_dict.get("features", [])
        if feature["type"] == "Binding site"
    ]


def extract_all(json_dict):
    """Build the nested summary record for one protein entry.

    Reads the entry's "sequence" -> "value", then gathers its reactions via
    ``extract_catalyzed_reactions`` and its binding-site positions via
    ``extract_binding_sites``.

    Returns:
        ``{"sequence": {<sequence>: {"reactions": ..., "binding sites": ...}}}``
        where reactions are joined with "|" and binding sites with ",".

    Raises:
        KeyError: If the entry has no "sequence" -> "value".
    """
    protein_sequence = json_dict["sequence"]["value"]
    # The function sentences are returned alongside the reactions but are not
    # part of the summary record.
    _sentences, reaction_names = extract_catalyzed_reactions(json_dict)
    site_positions = extract_binding_sites(json_dict)

    annotations = {
        "reactions" : '|'.join(map(str, reaction_names)),
        "binding sites" : ','.join(map(str, site_positions)),
    }
    return {"sequence" : {protein_sequence : annotations}}

In [13]:
# Sanity check: run the full extraction on the single `example` entry and display the record.
extract_all(example)

{'sequence': {'MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR': {'reactions': "NAD(+) -> 2'cADPR + H(+) + nicotinamide|H2O + NADP(+) -> ADP-D-ribose 2'-phosphate + H(+) + nicotinamide|H2O + NAD(+) -> ADP-D-ribose + H(+) + nicotinamide",
   'binding sites': '143,172,202,245'}}}

In [None]:
# Load all entries from the local export; the relative path is resolved
# against the notebook's working directory.
REVIEWED_EXPORT_PATH = "uniprotkb_AND_reviewed_true_2023_10_30.json"
proteins = read_json_to_list_of_proteins(REVIEWED_EXPORT_PATH)


In [None]:
# One extract_all() record per loaded protein entry, in input order.
data = [extract_all(prot_seq_json) for prot_seq_json in proteins]