In [1]:
import pandas as pd
import numpy as np
from numpy import nan
from collections import defaultdict 
import ast
import regex as re
import os

In [2]:
def get_rowdict(infile, encoding=None):
    """Find the line index at which each tracked section of a flat file starts.

    A section header is recognised when the text before the first double
    space on a line equals one of the tracked keywords. Scanning stops at
    the first ``REFERENCE`` line.

    Parameters
    ----------
    infile : str
        Path of the text file to scan.
    encoding : str, optional
        Passed straight to ``open``. ``None`` (the default) keeps the
        platform default, which is what the function always used before
        this parameter existed.

    Returns
    -------
    dict
        Keys ``"pathway"``, ``"module"``, ``"reaction"``, ``"network"``,
        ``"disease"``, ``"brite"``, ``"dblinks"``, ``"gene"`` and ``"ref"``.
        Each value is the 0-based line index of the matching header, or
        ``False`` when that header was not seen. If a header occurs more
        than once before the first ``REFERENCE`` line, the last one wins.
    """
    # Header keyword -> key in the returned dict (insertion order is the
    # order of the returned dict).
    header_to_key = {
        "PATHWAY": "pathway",
        "MODULE": "module",
        "REACTION": "reaction",
        "NETWORK": "network",
        "DISEASE": "disease",
        "BRITE": "brite",
        "DBLINKS": "dblinks",
        "GENES": "gene",
        "REFERENCE": "ref",
    }
    rows_dict = {name: False for name in header_to_key.values()}
    with open(infile, "r", encoding=encoding) as f:
        for row_ind, line in enumerate(f):
            header = line.split("  ")[0]
            if header not in header_to_key:
                continue
            rows_dict[header_to_key[header]] = row_ind
            if header == "REFERENCE":
                # Nothing after the first reference block is tracked.
                break
    return rows_dict

In [3]:
def parse_module_id(lines, row_dict, out_dict):
    """Collect the ``MODULE`` section of one entry as two delimiter-joined strings.

    Exactly one value is appended to ``out_dict["MODULE_ID"]`` and one to
    ``out_dict["MODULE_NAME"]``:

    * ``nan`` for both when ``row_dict["module"]`` is falsy (no section);
    * otherwise the ids and the names found in the section, each joined
      with ``"DELIMITER?XD"`` so they can be split into rows later.

    Parameters
    ----------
    lines : list of str
        All lines of the entry file. The list is not modified.
    row_dict : dict
        Section start rows as produced by ``get_rowdict``.
    out_dict : mapping of str to list
        Accumulator; must supply a list for a missing key (for example a
        ``defaultdict(list)``).

    Notes
    -----
    The section runs up to the first later section that is present. If
    none is present it runs up to, but not including, the last line of
    the file. A line is used only when, once stripped, it splits on a
    double space into exactly two fields; any other line is skipped.
    """
    key1, key2 = "MODULE_ID", "MODULE_NAME"
    header = "MODULE"
    module_row = row_dict["module"]
    if not module_row:
        out_dict[key1].append(nan)
        out_dict[key2].append(nan)
        return

    later_sections = ["reaction", "network", "disease", "brite", "dblinks", "gene", "ref"]
    # First later section that exists; fall back to the second-to-last
    # line when the module section is the last one.
    next_section_row = next(
        (row_dict[name] for name in later_sections if row_dict.get(name)),
        len(lines) - 1,
    )

    ids, names = [], []
    for offset, line in enumerate(lines[module_row:next_section_row]):
        if offset == 0:
            # Drop the header keyword on a local copy only, so the caller's
            # list stays intact and the function can be called again.
            line = line[len(header):]
        fields = line.strip().split("  ")
        if len(fields) != 2:
            continue
        module_id, name = fields
        ids.append(module_id)
        names.append(name)

    # get two columns, split into multiple rows later
    out_dict[key1].append("DELIMITER?XD".join(ids))
    out_dict[key2].append("DELIMITER?XD".join(names))
        
def parse_reaction_id(lines, row_dict, out_dict):
    """Collect the ``REACTION`` section of one entry as two delimiter-joined strings.

    Exactly one value is appended to ``out_dict["REACTION_ID"]`` and one
    to ``out_dict["REACTION_NAME"]``:

    * ``nan`` for both when ``row_dict["reaction"]`` is falsy (no section);
    * otherwise the ids and the names found in the section, each joined
      with ``"DELIMITER?XD"`` so they can be split into rows later.

    Parameters
    ----------
    lines : list of str
        All lines of the entry file. The list is not modified.
    row_dict : dict
        Section start rows as produced by ``get_rowdict``.
    out_dict : mapping of str to list
        Accumulator; must supply a list for a missing key (for example a
        ``defaultdict(list)``).

    Notes
    -----
    The section runs up to the first later section that is present. If
    none is present it runs up to, but not including, the last line of
    the file. A line is used only when, once stripped, it splits on a
    double space into exactly two fields; any other line is skipped.
    """
    key1, key2 = "REACTION_ID", "REACTION_NAME"
    header = "REACTION"
    reaction_row = row_dict["reaction"]
    if not reaction_row:
        out_dict[key1].append(nan)
        out_dict[key2].append(nan)
        return

    later_sections = ["network", "disease", "brite", "dblinks", "gene", "ref"]
    # First later section that exists; fall back to the second-to-last
    # line when the reaction section is the last one.
    next_section_row = next(
        (row_dict[name] for name in later_sections if row_dict.get(name)),
        len(lines) - 1,
    )

    ids, names = [], []
    for offset, line in enumerate(lines[reaction_row:next_section_row]):
        if offset == 0:
            # Drop the header keyword on a local copy only, so the caller's
            # list stays intact and the function can be called again.
            line = line[len(header):]
        fields = line.strip().split("  ")
        if len(fields) != 2:
            continue
        reaction_id, name = fields
        ids.append(reaction_id)
        names.append(name)

    # get two columns, split into multiple rows later
    out_dict[key1].append("DELIMITER?XD".join(ids))
    out_dict[key2].append("DELIMITER?XD".join(names))
        
def parse_name(name_row, lines, out_dict): # standard rows- 2:3
    """Append the name found on line ``name_row`` to ``out_dict["NAME"]``.

    The first four characters of the line (the length of ``"NAME"``) are
    dropped, the rest is stripped, and everything from the first ``" ["``
    onwards is discarded.
    """
    field = "NAME"
    payload = lines[name_row][len(field):].strip()
    name, _, _ = payload.partition(" [")
    out_dict[field].append(name)
    
def parse_entry(entry_row, lines, out_dict): # standard rows- 0:1
    """Append the entry id found on line ``entry_row`` to ``out_dict["ENTRY"]``.

    The first five characters of the line (the length of ``"ENTRY"``) are
    dropped and the rest is stripped; the id is the text before the first
    single space of what remains. The id is also returned.
    """
    field = "ENTRY"
    payload = lines[entry_row][len(field):].strip()
    entry_id = payload.partition(" ")[0]
    out_dict[field].append(entry_id)
    return entry_id

In [87]:
def parse_values(start_row, end_row, lines, out_dict):
    """Flatten the fields of ``lines[start_row:end_row]`` into one list.

    The section key is the text before the first single space on line
    ``start_row``. That key is trimmed from ``lines[start_row]`` in place
    (the caller's list is modified). Each line in the range is then
    stripped and split on double spaces, empty pieces are dropped, and
    the remaining pieces of all lines are appended as a single list to
    ``out_dict[key]``.
    """
    header_line = lines[start_row]
    key = header_line.split(" ")[0]
    lines[start_row] = header_line[len(key):]

    fields = []
    for raw in lines[start_row:end_row]:
        fields.extend(piece for piece in raw.strip().split("  ") if piece)
    out_dict[key].append(fields)
    
def parse_values2dict(start_row, end_row, lines, out_dict):
    """Parse ``lines[start_row:end_row]`` into one ``{first_field: second_field}`` dict.

    The section key is the text before the first single space on line
    ``start_row``. Every line in the range (with the key removed from the
    first one) is stripped and split on double spaces. Empty pieces are
    dropped, and the first two remaining pieces become a key/value pair.
    The resulting dict is appended to ``out_dict[key]``.

    Parameters
    ----------
    start_row : int
        Index of the section header line. A falsy value means the section
        is absent; the function then returns without touching ``out_dict``.
    end_row : int or None
        Index one past the last line of the section. A falsy value means
        "up to the end of ``lines``".
    lines : list of str
        All lines of the entry file. The list is not modified.
    out_dict : mapping of str to list
        Accumulator; must supply a list for a missing key (for example a
        ``defaultdict(list)``).

    Notes
    -----
    The section key used to be derived only when ``end_row`` was falsy,
    so any call with a real ``end_row`` failed on an unbound ``key``. It
    is now always derived. Lines that yield fewer than two fields (for
    example a trailing terminator line) are skipped instead of raising
    ``IndexError``.
    """
    if not start_row:
        return
    if not end_row:
        end_row = len(lines)

    key = lines[start_row].split(" ")[0]
    value_dict = {}
    for offset, line in enumerate(lines[start_row:end_row]):
        if offset == 0:
            # Trim the header keyword on a local copy; the caller's list is
            # left as it was.
            line = line[len(key):]
        fields = [field for field in line.strip().split("  ") if field]
        if len(fields) < 2:
            continue
        value_dict[fields[0]] = fields[1]
    out_dict[key].append(value_dict)
    
def remove_innermost_brackets(dict_str):
    """Strip the braces from every ``{...}`` group that contains no other brace.

    Only non-empty groups are affected; the text inside each group is
    kept as it is.
    """
    innermost_group = r"\{([^{}]+)\}"
    return re.sub(innermost_group, lambda match: match.group(1), dict_str)

def parse_json(row_dict, lines, out_dict): # construct a json string and convert to dictionary
    """Turn the indented ``BRITE`` section of one entry into nested Python containers.

    The section is rewritten as the text of a Python literal, one quoted
    label per line, and evaluated with ``ast.literal_eval``. A label
    followed by a more deeply indented line becomes a dict key whose
    value holds the lines below it. Labels with nothing nested beneath
    them end up side by side inside braces, so a group of leaves is
    returned as a ``set``.

    Exactly one value is appended to ``out_dict["BRITE"]``: the evaluated
    structure, or ``nan`` when ``row_dict["brite"]`` is falsy.

    Parameters
    ----------
    row_dict : dict
        Section start rows as produced by ``get_rowdict``.
    lines : list of str
        All lines of the entry file. The list is not modified.
    out_dict : mapping of str to list
        Accumulator; must supply a list for a missing key (for example a
        ``defaultdict(list)``).

    Notes
    -----
    * Double and single quotes in a label are replaced by ``~double~``
      and ``~single~`` so they cannot end the generated string literal
      early. This now applies to the header label too.
    * A section that consists of the header line alone evaluates to a
      one-element set; it used to fail on an unbound ``level``.
    * The section runs up to the first of ``dblinks`` / ``gene`` /
      ``ref`` that is present, otherwise up to (not including) the last
      line of the file.
    * ``ast.literal_eval`` can still raise (for example ``SyntaxError``)
      when the generated text is not a valid literal; that is not caught
      here.
    """
    key = "BRITE"
    start_row = row_dict["brite"]
    if not start_row:
        out_dict[key].append(nan)
        return

    end_row = next(
        (row_dict[name] for name in ("dblinks", "gene", "ref") if row_dict.get(name)),
        len(lines) - 1,  # brite is the last section: stop before the final line
    )

    def _as_label(text):
        # fix parsing bug: raw quotes would terminate the string literal
        return text.strip().replace("\"", "~double~").replace("\'", "~single~")

    # Header text with the keyword removed (local copy only).
    header = lines[start_row][len(key):]
    # get the number of characters preceding first json entry
    base_headsize = len(header) - len(header.lstrip()) + len(key)

    prev_level = 0
    level = 0  # stays 0 when the section has no lines below the header
    pieces = ['{"' + _as_label(header) + '"']
    for line in lines[start_row + 1:end_row]:
        cur_headsize = len(line) - len(line.lstrip())
        # level 0 would be base, level 1 would be 1 indent, so on
        level = cur_headsize - base_headsize
        label = _as_label(line)
        if level > prev_level:
            # Deeper than the previous line: open a nested container.
            pieces.append(': {"' + label + '"')
        else:
            # Same depth or shallower: close what was opened, add a sibling.
            ascents = prev_level - level
            pieces.append('}' * ascents + ', "' + label + '"')
        prev_level = level
    pieces.append('}' * (level + 1))

    entry = ast.literal_eval("".join(pieces))
    out_dict[key].append(entry)


In [63]:
# Scratch check for the quote problem in BRITE labels: the sample below
# contains labels with a raw double quote (3") and raw single quotes (3'').
# Both are swapped for plain-text tokens before the string is handed to
# ast.literal_eval, because a raw double quote would end the surrounding
# string literal early.
dict_str_test = """
{"KEGG Orthology (KO) [BR:ko00001]": {"09180 Brite Hierarchies": {"09183 Protein families: signaling and cellular processes": {"01504 Antimicrobial resistance genes": {"K00984  aadA; streptomycin 3"-adenylyltransferase"}}}}, "Enzymes [BR:ko01000]": {"2. Transferases": {"2.7  Transferring phosphorus-containing groups": {"2.7.7  Nucleotidyltransferases": {"2.7.7.47  streptomycin 3''-adenylyltransferase": {"K00984  aadA; streptomycin 3"-adenylyltransferase"}}}}}, "Antimicrobial resistance genes [BR:ko01504]": {"Gene variants": {"Aminoglycoside resistance genes": {"O-Nucleotidyltransferases": {"K00984  aadA; streptomycin 3"-adenylyltransferase"}}}}}
""".replace("3\"", "doubleprime").replace("3'", "3prime")
ast.literal_eval(dict_str_test)

{'KEGG Orthology (KO) [BR:ko00001]': {'09180 Brite Hierarchies': {'09183 Protein families: signaling and cellular processes': {'01504 Antimicrobial resistance genes': {'K00984  aadA; streptomycin doubleprime-adenylyltransferase'}}}},
 'Enzymes [BR:ko01000]': {'2. Transferases': {'2.7  Transferring phosphorus-containing groups': {'2.7.7  Nucleotidyltransferases': {"2.7.7.47  streptomycin 3prime'-adenylyltransferase": {'K00984  aadA; streptomycin doubleprime-adenylyltransferase'}}}}},
 'Antimicrobial resistance genes [BR:ko01504]': {'Gene variants': {'Aminoglycoside resistance genes': {'O-Nucleotidyltransferases': {'K00984  aadA; streptomycin doubleprime-adenylyltransferase'}}}}}

In [75]:
# Number of BRITE values collected so far.
# NOTE(review): full_out_dict is only created in a later cell of this notebook,
# so this cell fails on a fresh kernel when the notebook is run top to bottom.
len(full_out_dict["BRITE"])

26440

In [88]:
# K26474.txt requires encoding="latin-1" (not the default utf-8), so it is
# left out of the run below.

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# everything built from full_out_dict the same from run to run.
infiles = ["/data/luojaa/KO/" + file for file in sorted(os.listdir("/data/luojaa/KO/"))]
# Filtering (rather than list.remove) does not fail when the file is absent.
infiles = [path for path in infiles if path != "/data/luojaa/KO/K26474.txt"]
infiles_1000 = infiles[:1000]  # first 1000 paths; not used in this cell

# construct full sized dataframe dictionary
full_out_dict = defaultdict(list)
for infile in infiles:
    with open(infile, "r") as f:
        lines = f.readlines()
    # The handle is closed as soon as the lines are in memory; get_rowdict
    # opens the file again by path.
    row_dict = get_rowdict(infile)
    entry = parse_entry(0, lines, full_out_dict)
    # The calls below are disabled; each one would add further columns.
#     parse_name(2, lines, full_out_dict)
#     parse_module_id(lines, row_dict, full_out_dict)
#     parse_reaction_id(lines, row_dict, full_out_dict)
    parse_json(row_dict, lines, full_out_dict)
    

In [79]:
# NOTE(review): full_out_dict2 is not defined anywhere in the visible notebook.
# Unless it is created elsewhere, this cell depends on leftover kernel state and
# raises NameError on a fresh run. Confirm whether it is still needed.
full_out_dict["ENTRY"] = full_out_dict2["ENTRY"]

In [85]:
# Scratch check: the exact type of a dict literal compares equal to dict.
type({}) == dict

True

In [111]:
def jsondict2list(jdict):
    """Return every key of a nested dict, parents before their children.

    Keys are listed in insertion order and each key is followed directly
    by the keys found beneath it. Anything whose type is not exactly
    ``dict`` (a set of leaves, a string, ...) contributes nothing.
    """
    if type(jdict) is not dict:
        return []
    collected = []
    for label, subtree in jdict.items():
        collected.append(label)
        collected.extend(jsondict2list(subtree))
    return collected

In [117]:
def jd2l_wrapper(jdict):
    """List the distinct labels under the ``KEGG Orthology (KO) [BR:ko00001]`` tree.

    Parameters
    ----------
    jdict : dict or other
        One value from the ``BRITE`` column: the nested structure built
        by ``parse_json``, or the ``nan`` stored when an entry has no
        BRITE section.

    Returns
    -------
    list of str
        The keys of the KO tree as flattened by ``jsondict2list``, each
        label once, in first-seen order. The ``~double~`` / ``~single~``
        placeholders are turned back into quote characters. The list is
        empty when ``jdict`` is not a dict, or when it has no KO tree
        that is itself a dict.

    Notes
    -----
    Duplicates used to be removed with ``list(set(...))``, whose order
    can differ between runs; ``dict.fromkeys`` keeps the first-seen order
    so derived columns are reproducible.
    """
    if not isinstance(jdict, dict):
        # Entries without a BRITE section carry nan instead of a dict.
        return []
    ko_tree = jdict.get("KEGG Orthology (KO) [BR:ko00001]")
    if not isinstance(ko_tree, dict):
        return []
    unique_labels = dict.fromkeys(jsondict2list(ko_tree))
    return [
        label.replace("~double~", "\"").replace("~single~", "\'")
        for label in unique_labels
    ]

In [114]:
# Spot check: flatten the KO hierarchy of the first parsed entry.
d = full_out_dict["BRITE"][0]["KEGG Orthology (KO) [BR:ko00001]"]
jsondict2list(d)

['09100 Metabolism',
 '09101 Carbohydrate metabolism',
 '00010 Glycolysis / Gluconeogenesis',
 '00620 Pyruvate metabolism',
 '09103 Lipid metabolism',
 '00071 Fatty acid degradation',
 '09105 Amino acid metabolism',
 '00350 Tyrosine metabolism',
 '09108 Metabolism of cofactors and vitamins',
 '00830 Retinol metabolism',
 '09111 Xenobiotics biodegradation and metabolism',
 '00625 Chloroalkane and chloroalkene degradation',
 '00626 Naphthalene degradation',
 '00980 Metabolism of xenobiotics by cytochrome P450',
 '00982 Drug metabolism - cytochrome P450']

In [147]:
# One column per key collected in full_out_dict; all of its lists must have the
# same length for this constructor to succeed.
df_summary_full = pd.DataFrame(full_out_dict)

In [148]:
# Per entry: the flattened label list that jd2l_wrapper builds from the BRITE value.
df_summary_full["list"] = df_summary_full.BRITE.apply(jd2l_wrapper)

In [149]:
def extract_brite_id(lst):
    """Return, for each string in ``lst``, the text before its first space.

    A string without a space is returned whole.
    """
    return [label.partition(" ")[0] for label in lst]
def extract_brite_desc(lst):
    """Return, for each string in ``lst``, the text after its first space.

    A string without a space yields an empty string.
    """
    return [label.partition(" ")[2] for label in lst]

In [150]:
# Split every label of the "list" column at its first space: the leading token
# goes to brite_ids, the remainder to brite_descriptions. Both columns come from
# the same list, so they stay aligned element by element.
df_summary_full["brite_ids"] = df_summary_full.list.apply(extract_brite_id)
df_summary_full["brite_descriptions"] = df_summary_full.list.apply(extract_brite_desc)

In [164]:
# Two-column table: ENTRY plus that entry's brite ids joined with "|".
kog2briteids = df_summary_full.loc[:, ["ENTRY"]].assign(
    brite_ids=df_summary_full.brite_ids.apply(lambda ids: "|".join(ids))
)

In [167]:
# Write the ENTRY / brite_ids table as a tab-separated file.
kog2briteids.to_csv("/data/luojaa/kegg_stats/kog2briteids.tsv", sep = "\t", index = None)

In [170]:
# Lookup table of distinct (brite id, description) pairs: explode both list
# columns together, keep only those two columns, drop repeats, sort by id.
brite_mappings = (
    df_summary_full
    .explode(["brite_ids", "brite_descriptions"])
    .loc[:, ["brite_ids", "brite_descriptions"]]
    .drop_duplicates()
    .sort_values("brite_ids")
)

In [171]:
# Write the brite id / description lookup table as a tab-separated file.
brite_mappings.to_csv("/data/luojaa/kegg_stats/kog_brite_mappings.tsv", sep = "\t", index = None)

In [59]:
# df_summary_full = pd.DataFrame(full_out_dict)
# df_summary_full["REACTION_ID"] = df_summary_full["REACTION_ID"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)
# df_summary_full["REACTION_NAME"] = df_summary_full["REACTION_NAME"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)
# df_summary_full["MODULE_ID"] = df_summary_full["MODULE_ID"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)
# df_summary_full["MODULE_NAME"] = df_summary_full["MODULE_NAME"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)



In [67]:
# df_summary_rxn = df_summary_full.explode(["REACTION_ID", "REACTION_NAME"]).loc[:,["ENTRY", "REACTION_ID", "REACTION_NAME"]]
# df_summary_module = df_summary_full.explode(["MODULE_ID", "MODULE_NAME"]).loc[:,["ENTRY", "MODULE_ID", "MODULE_NAME"]]
# df_summary_rxn.to_csv('/data/luojaa/kegg/kegg_reactions.tsv', sep = "\t", index = False)  
# df_summary_module.to_csv('/data/luojaa/kegg/kegg_modules.tsv', sep = "\t", index = False)  
