In [None]:
import json
from typing import Dict, List, Tuple
from urllib.request import urlopen 
import regex as re
import tqdm
import pandas as pd

In [None]:
# Read json from url example

def read_json(url : str):
    """Fetch ``url`` and return its body parsed as JSON.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The object produced by ``json.loads`` on the response body.

    Example usage:
        entry = read_json("https://rest.uniprot.org/uniprotkb/A0A009IHW8.json")
    """
    # The context manager closes the response (and its underlying connection)
    # even when reading or JSON decoding raises.
    with urlopen(url) as response:
        return json.loads(response.read())


def read_json_to_list_of_proteins(path : str):
    """Load a JSON export and return the entries stored under "results".

    Args:
        path: Location of a JSON file whose top-level object has a
            "results" key.

    Returns:
        The value stored under "results" (expected to be a list of protein
        entries, one dict per protein).

    Raises:
        KeyError: If the top-level object has no "results" key.

    Example usage:
        proteins = read_json_to_list_of_proteins("uniprotkb_AND_reviewed_true_2023_10_30.json")
    """
    # Open inside a context manager so the handle is closed deterministically.
    # JSON is UTF-8 by specification, so do not depend on the platform default.
    with open(path, encoding="utf-8") as handle:
        uniprot_json = json.load(handle)
    return uniprot_json["results"]


def clean_domain(domain : str):
    """Return ``domain`` with every "(Probable)" marker removed.

    Whitespace directly in front of a marker is removed together with it.
    """
    probable_marker = re.compile(r'\s*\(Probable\)')
    return probable_marker.sub('', domain)


def remove_pubchem_med_from_text(string : str):
    """Strip parenthesised PubMed citation groups from ``string``.

    A group looks like "(PubMed:123)" or "(PubMed:123, PubMed:456)".
    Whitespace directly in front of a group is removed with it, so
    "Binds DNA (PubMed:1)." becomes "Binds DNA." rather than "Binds DNA .".

    Args:
        string: Free text that may contain citation groups.

    Returns:
        ``string`` without the citation groups.
    """
    # Leading \s* mirrors clean_domain and avoids leaving a stray space
    # before the punctuation that usually follows a citation.
    pattern = r'\s*\(PubMed:\d+(?:,\s*PubMed:\d+)*\)'
    return re.sub(pattern, '', string)

In [None]:
# Quick manual check of clean_domain on a sample string.
clean_domain("test some regex (Probable) some text")

In [None]:
# Fetch a single entry over the network (requires internet access) and
# display the value stored under "sequence" -> "value".
example = read_json("https://rest.uniprot.org/uniprotkb/A0A009IHW8.json")
example["sequence"]["value"]

### Example

In [44]:
# Labels of the annotation categories this notebook is interested in.
# NOTE(review): nothing in the visible part of the notebook reads this list;
# the extractor functions hard-code the strings they match, and binding
# sites are read from "features" rather than "comments". Confirm whether this
# constant is still needed.
CONSIDERED_COMMENTS = [
    "CATALYTIC ACTIVITY",
    "FUNCTION",
    "DOMAIN",
    "BINDING SITES"
]


def extract_catalyzed_reactions(json_dict : Dict) -> Tuple[List, List, List]:
    """Collect function descriptions, catalysed reactions and domain notes.

    Walks ``json_dict["comments"]`` and dispatches on ``commentType``:
      * "FUNCTION"           -> first text value, PubMed citations removed
      * "CATALYTIC ACTIVITY" -> reaction name with "=" replaced by "->"
      * "DOMAIN"             -> first text value, PubMed citations removed
    Comments of any other type are ignored.

    Args:
        json_dict: One protein entry; must contain a "comments" list.

    Returns:
        ``(sentences, reactions, domains)``. Each list is de-duplicated and
        keeps the order in which values first appear in the entry, so
        repeated runs over the same input give the same output.

    Raises:
        KeyError: If "comments" or an expected nested key is missing.
        IndexError: If a FUNCTION/DOMAIN comment has an empty "texts" list.
    """
    sentences : List = []
    reactions : List = []
    domains   : List = []

    for comment in json_dict["comments"]:
        comment_type = comment["commentType"]
        if comment_type == "FUNCTION":
            sentences.append(remove_pubchem_med_from_text(comment["texts"][0]["value"]))
        elif comment_type == "CATALYTIC ACTIVITY":
            # Replace = with -> for latex
            reactions.append(comment["reaction"]["name"].replace("=", "->"))
        elif comment_type == "DOMAIN":
            domains.append(remove_pubchem_med_from_text(comment["texts"][0]["value"]))

    # dict.fromkeys de-duplicates while preserving first-seen order. The
    # previous list(set(...)) returned the strings in an order that can change
    # from one interpreter run to the next (string hash randomisation), which
    # made the joined output columns non-reproducible.
    return (list(dict.fromkeys(sentences)),
            list(dict.fromkeys(reactions)),
            list(dict.fromkeys(domains)))


def extract_organisms(json_dict : Dict):
    """Return the scientific name recorded under ``json_dict["organism"]``."""
    organism = json_dict["organism"]
    return organism["scientificName"]

def extract_binding_sites(json_dict : Dict):
    """List the start position of every "Binding site" feature, in entry order.

    Positions are taken from ``feature["location"]["start"]["value"]`` and are
    not de-duplicated; features of any other type are skipped.
    """
    start_positions = []
    for feature in json_dict["features"]:
        if feature["type"] != "Binding site":
            continue
        start_positions.append(feature["location"]["start"]["value"])
    return start_positions


def extract_molecular_function(json_dict : Dict):
    """Placeholder: molecular-function extraction is not implemented yet.

    Returns:
        None, regardless of ``json_dict``.
    """
    return None

def extract_all(json_dict):
    """Flatten one protein entry into a single record.

    Returns:
        A dict with the keys "sequence", "reactions", "sentences",
        "organisms", "domains" and "binding sites". Reactions and domains are
        joined with "|", binding-site positions with ","; sentences stay a
        list.

    Exceptions raised while reading the entry or by the individual extractors
    are not caught here and propagate to the caller.
    """
    protein_sequence = json_dict["sequence"]["value"]
    sentences, reactions, domains = extract_catalyzed_reactions(json_dict)
    site_positions = extract_binding_sites(json_dict)
    organism_name = extract_organisms(json_dict)

    record = {}
    record["sequence"] = protein_sequence
    record["reactions"] = '|'.join(map(str, reactions))
    record["sentences"] = sentences
    record["organisms"] = organism_name
    record["domains"] = '|'.join(map(str, domains))
    record["binding sites"] = ','.join(map(str, site_positions))
    return record

### Run example

In [58]:
# Show the comma-joined binding-site positions extracted for one entry.
# NOTE(review): `proteins` is only assigned in the "Full dataset parsing"
# section further down, so on a fresh kernel this cell needs that load cell
# to have been run first.
extract_all(proteins[348])["binding sites"]

'328,328,332,332,472,476,480'

### Full dataset parsing

In [None]:
# Load all entries from the JSON export named below into memory
# (path is relative to the notebook's working directory).
proteins = read_json_to_list_of_proteins("uniprotkb_AND_reviewed_true_2023_10_30.json")

In [None]:
# Parse every entry. Parsing stays best-effort (one bad entry must not abort
# the whole run), but failures are recorded instead of being discarded
# silently so the amount and cause of data loss can be inspected.
data = []
failed_entries = []  # (position in `proteins`, repr of the raised exception)

for prot_id, prot_seq_json in enumerate(tqdm.tqdm(proteins)):
    try:
        data.append({prot_id : extract_all(prot_seq_json)})
    except Exception as exc:
        failed_entries.append((prot_id, repr(exc)))

print(f"parsed {len(data)} entries, skipped {len(failed_entries)}")

In [None]:
# Each element of `data` is a one-item {prot_id: record} mapping. Passing that
# list straight to pd.DataFrame creates one column per protein id, i.e. a
# len(data) x len(data) frame of mostly-missing cells. Build one row per
# protein instead, indexed by its id.
df = pd.DataFrame(
    [record for entry in data for record in entry.values()],
    index=[entry_id for entry in data for entry_id in entry],
)

In [59]:
# Merge the per-protein {prot_id: record} mappings into one lookup table,
# taking the first (key, value) pair of each mapping.
a = {}

for seq in data:
    first_key = next(iter(seq))
    a[first_key] = seq[first_key]

# One list per output column, all in the iteration order of `a`.
reactions = [record["reactions"] for record in a.values()]
sequences = [record["sequence"] for record in a.values()]
sentences = [record["sentences"] for record in a.values()]
domains = [record["domains"] for record in a.values()]
organisms = [record["organisms"] for record in a.values()]
binding_sites = [record["binding sites"] for record in a.values()]


In [61]:
# Assemble the output table in one step; columns follow the order in which
# they are listed here.
df = pd.DataFrame({
    "sequence": sequences,
    "sentences": sentences,
    "reactions": reactions,
    "domains": domains,
    "organisms": organisms,
    "binding_sites": binding_sites,
})

In [62]:
# Persist the assembled table next to the notebook. `index` is left at its
# default, so pandas also writes the row index as the first, unnamed column.
df.to_csv("reactions_sentences_domains_organisms_binding_sites.csv")


In [None]:
# Preview only: displaying the full list would put one line per parsed
# protein into the cell output.
organisms[:10]