In [2]:
import pandas as pd
import numpy as np
from numpy import nan
from collections import defaultdict 
import ast
import regex as re
import os

In [8]:
def get_rowdict(infile, encoding=None):
    """Locate the header row of each tracked section in a KEGG-style flat file.

    A header is a line that starts in column 0 with a section keyword
    (e.g. ``MODULE``); indented lines are continuation rows and are ignored.
    The keyword is matched on the first whitespace-delimited token, so a
    header that carries no payload (a bare ``REFERENCE`` line) is recognised
    as well as the usual ``KEYWORD     value`` form.

    Parameters
    ----------
    infile : str
        Path of the file to scan.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` keeps the platform default.

    Returns
    -------
    dict
        Maps "pathway", "module", "reaction", "network", "disease", "brite",
        "dblinks", "gene" and "ref" to the 0-based row index of that section's
        header, or ``False`` when the section was not seen. If a keyword occurs
        more than once, the last occurrence wins. Scanning stops at the first
        ``REFERENCE`` header, so nothing after it is recorded.
    """
    # File keyword -> key used in the returned dict (note GENES -> "gene").
    tracked_sections = {
        "PATHWAY": "pathway",
        "MODULE": "module",
        "REACTION": "reaction",
        "NETWORK": "network",
        "DISEASE": "disease",
        "BRITE": "brite",
        "DBLINKS": "dblinks",
        "GENES": "gene",
        "REFERENCE": "ref",
    }
    # False (not None) is the "absent" sentinel callers test with `not row`.
    rows_dict = dict.fromkeys(tracked_sections.values(), False)

    with open(infile, "r", encoding=encoding) as f:
        for row_ind, line in enumerate(f):
            # Continuation rows are indented; blank lines start with "\n".
            if line[0].isspace():
                continue
            section_key = tracked_sections.get(line.split(None, 1)[0])
            if section_key is None:
                continue
            rows_dict[section_key] = row_ind
            if section_key == "ref":
                # Nothing after the first reference block is needed.
                break
    return rows_dict

In [40]:
# Placeholder used to pack several values into one cell; the same literal is
# split on again when the summary frame is built.
_LIST_DELIMITER = "DELIMITER?XD"


def _parse_id_name_section(lines, row_dict, out_dict, section, later_sections, id_key, name_key):
    """Append the ids and names found in one ``ID  name`` section to out_dict.

    Parameters
    ----------
    lines : list of str
        All rows of the entry file. The list is not modified.
    row_dict : dict
        Section name -> header row index, with a falsy value when the section
        is absent (the mapping produced by ``get_rowdict``).
    out_dict : mapping of str -> list
        Column accumulator; exactly one value is appended to ``out_dict[id_key]``
        and one to ``out_dict[name_key]``.
    section : str
        Key of the section to parse; its upper-cased form is the header keyword
        that is cut from the first row.
    later_sections : sequence of str
        Section keys assumed to follow ``section`` in the file, in order. The
        first one that is present marks where this section ends.
    id_key, name_key : str
        Output column names.
    """
    start = row_dict[section]
    if not start:
        # Section missing: keep the columns aligned with one NaN each.
        out_dict[id_key].append(nan)
        out_dict[name_key].append(nan)
        return

    # End at the first later section that exists. When none does, stop before
    # the final line of the file (the last line is deliberately excluded).
    end = next(
        (row_dict.get(key) for key in later_sections if row_dict.get(key)),
        len(lines) - 1,
    )

    # Slicing copies, so trimming the header keyword leaves `lines` untouched.
    section_lines = lines[start:end]
    if section_lines:
        section_lines[0] = section_lines[0][len(section):]

    ids, names = [], []
    for raw_line in section_lines:
        # Split on the FIRST double space only, so a name that itself contains
        # a double space is kept instead of the whole record being dropped.
        item_id, separator, item_name = raw_line.strip().partition("  ")
        if not separator:
            continue  # no "id  name" pair on this row
        ids.append(item_id)
        names.append(item_name.strip())

    # One packed string per column; split into multiple rows later.
    out_dict[id_key].append(_LIST_DELIMITER.join(ids))
    out_dict[name_key].append(_LIST_DELIMITER.join(names))


def parse_module_id(lines, row_dict, out_dict): # if it exists, 3:4
    """Append the MODULE section's ids/names to out_dict["MODULE_ID"/"MODULE_NAME"].

    Appends NaN to both columns when row_dict["module"] is falsy; otherwise
    appends the ids and the names, each joined with "DELIMITER?XD".
    Returns None; `lines` is left unmodified.
    """
    _parse_id_name_section(
        lines, row_dict, out_dict,
        section="module",
        later_sections=("reaction", "network", "disease", "brite", "dblinks", "gene", "ref"),
        id_key="MODULE_ID",
        name_key="MODULE_NAME",
    )


def parse_reaction_id(lines, row_dict, out_dict): # if it exists, 3:4
    """Append the REACTION section's ids/names to out_dict["REACTION_ID"/"REACTION_NAME"].

    Appends NaN to both columns when row_dict["reaction"] is falsy; otherwise
    appends the ids and the names, each joined with "DELIMITER?XD".
    Returns None; `lines` is left unmodified.
    """
    _parse_id_name_section(
        lines, row_dict, out_dict,
        section="reaction",
        later_sections=("network", "disease", "brite", "dblinks", "gene", "ref"),
        id_key="REACTION_ID",
        name_key="REACTION_NAME",
    )
        
def parse_name(name_row, lines, out_dict): # standard rows- 2:3
    """Append the name found on row `name_row` to out_dict["NAME"].

    The leading "NAME" keyword is cut off by length, surrounding whitespace is
    stripped, and anything from the first " [" onwards is discarded.
    Returns None.
    """
    key = "NAME"
    payload = lines[name_row][len(key):].strip()
    # Keep only the text before the first " [" (the whole payload if absent).
    name_only, _, _ = payload.partition(" [")
    out_dict[key].append(name_only)
    
def parse_entry(entry_row, lines, out_dict): # standard rows- 0:1
    """Append the entry identifier on row `entry_row` to out_dict["ENTRY"] and return it.

    The leading "ENTRY" keyword is cut off by length, surrounding whitespace is
    stripped, and the text up to the first space is taken as the identifier.
    """
    key = "ENTRY"
    payload = lines[entry_row][len(key):].strip()
    # Everything before the first single space (the whole payload if none).
    entry_id = payload.partition(" ")[0]
    out_dict[key].append(entry_id)
    return entry_id

In [45]:
# Directory holding the KO entry files to parse.
KO_DIR = "/data/luojaa/KO/"
# K26474 requires encoding="latin-1", not the default utf-8. The files below are
# opened with the default encoding, so that one file is left out of the run.
SKIPPED_FILES = {"K26474.txt"}

# Build the file list here so the cell runs on a fresh kernel; sorting makes the
# row order of the resulting tables reproducible (os.listdir order is arbitrary).
infiles = sorted(
    os.path.join(KO_DIR, file_name)
    for file_name in os.listdir(KO_DIR)
    if file_name not in SKIPPED_FILES
)
infiles_1000 = infiles[:1000]  # first 1000 files, e.g. for a quick trial run

# construct full sized dataframe dictionary: column name -> one value per file
full_out_dict = defaultdict(list)
for infile in infiles:
    with open(infile, "r") as f:
        lines = f.readlines()
    # The file is fully read at this point; parsing does not need the handle.
    row_dict = get_rowdict(infile)
    # ENTRY is read from row 0 and NAME from row 2 (fixed positions).
    entry = parse_entry(0, lines, full_out_dict)
    parse_name(2, lines, full_out_dict)
    parse_module_id(lines, row_dict, full_out_dict)
    parse_reaction_id(lines, row_dict, full_out_dict)

In [48]:
# One row per parsed file, one column per key accumulated in full_out_dict.
df_summary_full = pd.DataFrame(data=full_out_dict)

In [59]:
# Rebuild the frame from the raw column lists first, so re-running this cell
# always starts from the packed strings rather than from already-split lists.
df_summary_full = pd.DataFrame(full_out_dict)

# Unpack each "DELIMITER?XD"-joined cell into a list; missing values stay NaN.
for packed_column in ("REACTION_ID", "REACTION_NAME", "MODULE_ID", "MODULE_NAME"):
    df_summary_full[packed_column] = df_summary_full[packed_column].apply(
        lambda packed: nan if pd.isnull(packed) else packed.split("DELIMITER?XD")
    )



In [67]:
# Long-format tables: one row per (entry, reaction) and per (entry, module).
# The id and name lists of a row are exploded together so pairs stay aligned.
reaction_list_columns = ["REACTION_ID", "REACTION_NAME"]
module_list_columns = ["MODULE_ID", "MODULE_NAME"]

df_summary_rxn = df_summary_full.explode(reaction_list_columns).loc[
    :, ["ENTRY"] + reaction_list_columns
]
df_summary_module = df_summary_full.explode(module_list_columns).loc[
    :, ["ENTRY"] + module_list_columns
]

# Written tab-separated, without the (duplicated) index left by explode.
df_summary_rxn.to_csv('/data/luojaa/kegg/kegg_reactions.tsv', sep="\t", index=False)
df_summary_module.to_csv('/data/luojaa/kegg/kegg_modules.tsv', sep="\t", index=False)
