In [1]:
import pandas as pd
import numpy as np
from numpy import nan
from collections import defaultdict 
import ast
import regex as re
import os

In [2]:
def get_rowdict(infile):
    """Locate the first row of each section of interest in a KEGG flat file.

    The file is scanned from the top and the row index of every recognised
    section header is recorded.  Scanning stops at the first ``REFERENCE``
    header, so headers that appear after it are never recorded.

    Parameters
    ----------
    infile : str or path-like
        Path of the entry file.  It is opened in text mode with the default
        encoding.

    Returns
    -------
    dict
        Maps "pathway", "module", "reaction", "network", "disease", "brite",
        "dblinks", "gene" and "ref" to the 0-based row index of the matching
        header, or to ``False`` when the header was not seen.  When a header
        occurs more than once before ``REFERENCE``, the last occurrence wins.
    """
    # Header keyword -> key used in the returned dictionary (order preserved).
    section_keys = {
        "PATHWAY": "pathway",
        "MODULE": "module",
        "REACTION": "reaction",
        "NETWORK": "network",
        "DISEASE": "disease",
        "BRITE": "brite",
        "DBLINKS": "dblinks",
        "GENES": "gene",
        "REFERENCE": "ref",
    }
    # False (not None) marks a missing section because callers test truthiness.
    rows_dict = {key: False for key in section_keys.values()}
    with open(infile, "r") as f:
        for row_ind, line in enumerate(f):
            # Only a line that starts in the first column can open a section;
            # indented lines and blank lines are skipped.
            if not line or line[0].isspace():
                continue
            # Use the first whitespace-delimited token so the header is found
            # whether it is followed by padding, a single space, a tab, or
            # nothing at all (e.g. a bare "REFERENCE" line).
            keyword = line.split(None, 1)[0]
            key = section_keys.get(keyword)
            if key is None:
                continue
            rows_dict[key] = row_ind
            if key == "ref":
                break
    return rows_dict

In [3]:
def parse_entry(entry_row, lines, out_dict):
    """Record the identifier found on an ENTRY line and return it.

    Parameters
    ----------
    entry_row : int
        Index in ``lines`` of the ENTRY line (the notebook passes row 0).
    lines : list of str
        Lines of the entry file.
    out_dict : mapping of str to list
        The identifier is appended to ``out_dict["ENTRY"]``.

    Returns
    -------
    str
        The text between the "ENTRY" keyword and the next single space.
    """
    header = "ENTRY"
    # Drop the keyword, trim the surrounding padding, keep the first field.
    payload = lines[entry_row][len(header):].strip()
    identifier = payload.partition(" ")[0]
    out_dict[header].append(identifier)
    return identifier
    
def parse_symbol_alias(symbol_row, lines, out_dict):
    """Split a SYMBOL line into its primary symbol and the remaining aliases.

    The text after the "SYMBOL" keyword is split on ", ".  The first item is
    appended to ``out_dict["SYMBOL"]``; the remaining items, re-joined with
    ", ", are appended to ``out_dict["ALIAS"]`` (NaN when there are none).

    Parameters
    ----------
    symbol_row : int
        Index in ``lines`` of the SYMBOL line (the notebook passes row 1).
    lines : list of str
        Lines of the entry file.
    out_dict : mapping of str to list
        Receives one value under "SYMBOL" and one under "ALIAS".
    """
    payload = lines[symbol_row][len("SYMBOL"):].strip()
    symbol, *others = payload.split(", ")
    out_dict["SYMBOL"].append(symbol)
    # NaN keeps the ALIAS column aligned with SYMBOL when no alias is listed.
    out_dict["ALIAS"].append(", ".join(others) if others else nan)
    
def parse_name(name_row, lines, out_dict):
    """Append the descriptive name found on a NAME line to ``out_dict["NAME"]``.

    Parameters
    ----------
    name_row : int
        Index in ``lines`` of the NAME line (the notebook passes row 2).
    lines : list of str
        Lines of the entry file.
    out_dict : mapping of str to list
        Receives the name under the "NAME" key.
    """
    header = "NAME"
    text = lines[name_row][len(header):].strip()
    # Everything from the first " [" onwards is dropped
    # (presumably a trailing "[EC:...]" annotation).
    out_dict[header].append(text.partition(" [")[0])

def parse_pathway_id(lines, row_dict, out_dict):
    """Collect the ids and names listed in the PATHWAY section of one entry.

    Exactly one value is appended to ``out_dict["PATHWAY_ID"]`` and one to
    ``out_dict["PATHWAY_NAME"]``: all ids (respectively names) of the entry
    joined with the marker "DELIMITER?XD", so they can be split into separate
    rows later.  NaN is appended to both when the entry has no PATHWAY section
    or the section lists nothing.

    Parameters
    ----------
    lines : list of str
        Lines of the entry file.  The list is not modified.
    row_dict : dict
        Section name -> header row index (or a falsy value when absent), as
        returned by ``get_rowdict``.  Only the "pathway" key is required.
    out_dict : mapping of str to list
        Receives the joined ids and names.
    """
    key1, key2 = "PATHWAY_ID", "PATHWAY_NAME"
    pathway_row = row_dict["pathway"]
    if not pathway_row:
        out_dict[key1].append(nan)
        out_dict[key2].append(nan)
        return

    # The block ends at the nearest known section that starts after it; when
    # there is none, it may run to the end of the file.
    section_keys = ("module", "reaction", "network", "disease", "brite", "dblinks", "gene", "ref")
    later_rows = [
        row_dict[name]
        for name in section_keys
        if row_dict.get(name) and row_dict[name] > pathway_row
    ]
    next_section_row = min(later_rows) if later_rows else len(lines)

    ids, names = [], []
    for offset, raw in enumerate(lines[pathway_row:next_section_row]):
        if offset == 0:
            # Strip the keyword from a local copy only, so that calling this
            # function again on the same list gives the same result.
            raw = raw[len("PATHWAY"):]
        elif not raw[:1].isspace():
            # An unindented line is another section header or the "///"
            # terminator: the pathway block is over.
            break
        # First field is the id, the rest of the line is the name.
        fields = raw.split(None, 1)
        if not fields:
            continue  # blank line
        ids.append(fields[0])
        names.append(fields[1].strip() if len(fields) > 1 else "")

    if not ids:
        out_dict[key1].append(nan)
        out_dict[key2].append(nan)
        return
    # get two columns, split into multiple rows later
    out_dict[key1].append("DELIMITER?XD".join(ids))
    out_dict[key2].append("DELIMITER?XD".join(names))

In [4]:
# infile = "Data/KO/K00001.txt"
# with open(infile, "r") as f:
#     lines = f.readlines()
#     row_dict = get_rowdict(infile)
#     test_dict = defaultdict(list)
#     entry = parse_entry(0, lines, test_dict)
#     parse_symbol_alias(1, lines, test_dict)
#     parse_name(2, lines, test_dict)
#     parse_pathway_id(lines, row_dict, test_dict)

# df_test = pd.DataFrame(test_dict)

# df_test["PATHWAY_ID"] = df_test["PATHWAY_ID"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)
# df_test["PATHWAY_NAME"] = df_test["PATHWAY_NAME"].apply(lambda pathway: pathway.split("DELIMITER?XD") if not pd.isnull(pathway) else nan)

# df_test_full = df_test.explode(["PATHWAY_ID", "PATHWAY_NAME"])
# df_test_full


In [6]:
# K26474 requires encoding = latin-1 (not the default utf-8), so it is left out.
# Files are selected by name rather than by calling list.remove() on the unwanted
# entries, so the cell also runs on machines where .DS_Store or
# .ipynb_checkpoints do not exist in the directory.
infiles = [
    "Data/KO/" + file
    for file in os.listdir("Data/KO")
    if file.endswith(".txt") and file != "K26474.txt"
]
infiles_1000 = infiles[:1000]

# construct full sized dataframe dictionary
full_out_dict = defaultdict(list)
for infile in infiles_1000:
    # Read the file once and close it before get_rowdict reopens the same path.
    with open(infile, "r") as f:
        lines = f.readlines()
    row_dict = get_rowdict(infile)
    # NOTE(review): assumes ENTRY, SYMBOL and NAME sit on rows 0, 1 and 2 of
    # every file - TODO confirm for entries without a SYMBOL line.
    entry = parse_entry(0, lines, full_out_dict)
    parse_symbol_alias(1, lines, full_out_dict)
    parse_name(2, lines, full_out_dict)
    parse_pathway_id(lines, row_dict, full_out_dict)

In [10]:
# Build the summary table: one column per key of full_out_dict, one row per parsed file.
df_summary_full = pd.DataFrame(full_out_dict)

In [13]:
# NOTE(review): df_summary_full_pathways is only assigned in a later cell of this
# notebook, so on a fresh top-to-bottom run this display raises NameError.
df_summary_full_pathways

Unnamed: 0,ENTRY,SYMBOL,ALIAS,NAME,PATHWAY_ID,PATHWAY_NAME
0,K20411,PRR5,PROTOR,proline-rich protein 5,map04150,mTOR signaling pathway
1,K11814,ebrA,,multidrug resistance protein EbrA,,
2,K26060,gtd2,,glucoside 3-dehydrogenase (cytochrome c) hitch...,,
3,K09083,ATOH1_7,,atonal protein 1/7,,
4,K00919,ispE,,4-diphosphocytidyl-2-C-methyl-D-erythritol kinase,map00900,Terpenoid backbone biosynthesis
...,...,...,...,...,...,...
997,K11785,mqnD,,"1,4-dihydroxy-6-naphthoate synthase",map01100,Metabolic pathways
997,K11785,mqnD,,"1,4-dihydroxy-6-naphthoate synthase",map01110,Biosynthesis of secondary metabolites
997,K11785,mqnD,,"1,4-dihydroxy-6-naphthoate synthase",map01240,Biosynthesis of cofactors
998,K26129,SHLD2,,shieldin complex subunit 2,,


In [12]:
# Rebuild the summary frame from scratch (keeps the cell re-runnable) and turn
# each joined pathway string back into a list; NaN entries are left as NaN.
df_summary_full = pd.DataFrame(full_out_dict).assign(
    PATHWAY_ID=lambda frame: frame["PATHWAY_ID"].apply(
        lambda pathway: nan if pd.isnull(pathway) else pathway.split("DELIMITER?XD")
    ),
    PATHWAY_NAME=lambda frame: frame["PATHWAY_NAME"].apply(
        lambda pathway: nan if pd.isnull(pathway) else pathway.split("DELIMITER?XD")
    ),
)

# Explode both list columns together: one row per pathway of each entry.
df_summary_full_pathways = df_summary_full.explode(["PATHWAY_ID", "PATHWAY_NAME"])
# df_summary_full_pathways.to_csv('kegg_pathways.58345.tsv', sep = "\t", index = False)


In [195]:
# Look up NAME for index label 7 within the first 20 rows.
# NOTE(review): explode() repeats index labels, so this can return several rows
# instead of a single string when entry 7 has more than one pathway.
df_summary_full_pathways.head(20).loc[:,"NAME"][7]

'keratinocyte differentiation-associated protein'

## Start parsing genes

In [14]:
def parse_genes(lines, row_dict, gene_dict, entry):
    """Append one record per gene listed in the GENES section to ``gene_dict``.

    Each organism line has the form ``ORG: id1(alias1) id2 ...``.  For every
    gene id four lists in ``gene_dict`` grow by one element: "ENTRY" (the
    ``entry`` argument), "ALIAS" (the text inside the outermost parentheses,
    NaN when there are none), "KEGG_CDS" (the id before the parentheses) and
    "KEGG_ORG" (the organism code before ": ").

    Parameters
    ----------
    lines : list of str
        Lines of the entry file.  The list is not modified.
    row_dict : dict
        Must provide "gene" and "ref": the row index of the GENES and
        REFERENCE headers, or ``False`` when the header is absent.
    gene_dict : mapping of str to list
        Accumulator that receives the records.
    entry : str
        Identifier stored with every record.

    Raises
    ------
    ValueError
        If the first non-blank line of the section has no "ORG: " prefix.
    """
    gene_row = row_dict["gene"]
    if gene_row is False or gene_row is None:
        # The entry has no GENES section, so there is nothing to record.
        return

    ref_row = row_dict["ref"]
    section = lines[gene_row:ref_row] if ref_row else lines[gene_row:]

    org = None
    for offset, raw in enumerate(section):
        if offset == 0:
            raw = raw[len("GENES"):]  # strip header
        elif not raw[:1].isspace():
            # An unindented line is another section header or the "///"
            # terminator: the GENES block is over.
            break
        text = raw.strip()
        if not text:
            continue  # blank line

        code, sep, rest = text.partition(": ")
        if sep:
            org, values = code, rest
        elif org is not None:
            # No organism prefix: treat the line as a wrapped continuation of
            # the previous organism's gene list.
            values = text
        else:
            raise ValueError("GENES line without an organism prefix: %r" % text)

        for value in values.split():
            # Split at the FIRST "(" only, so aliases that themselves contain
            # parentheses (e.g. "l(2)gl") are kept whole.
            cds, paren, alias = value.partition("(")
            if not paren:
                alias = nan
            elif alias.endswith(")"):
                alias = alias[:-1]  # strip close parenthesis
            gene_dict["ENTRY"].append(entry)
            gene_dict["ALIAS"].append(alias)
            gene_dict["KEGG_CDS"].append(cds)
            gene_dict["KEGG_ORG"].append(org)
    

In [15]:
## test area
# infile = "Data/KO/K00001.txt"
# with open(infile, "r") as f:
#     lines = f.readlines()
#     row_dict = get_rowdict(infile)
#     temp_dict = defaultdict(list)
#     test_gene_dict = defaultdict(list)
#     entry = parse_entry(0, lines, temp_dict)
#     parse_genes(lines, row_dict, test_gene_dict, entry)

In [16]:
# Show the first path of the 1000-file subset (the order is whatever os.listdir returned).
infiles_1000[0]

'Data/KO/K20411.txt'

In [23]:
# Collect the GENES records of every KO file into one column-oriented dictionary.
gene_dict = defaultdict(list)
for infile in infiles:
    row_dict = get_rowdict(infile)
    with open(infile, "r") as f:
        lines = f.readlines()
    # parse_entry needs somewhere to append; only its return value is used here.
    temp_dict = defaultdict(list)
    entry = parse_entry(0, lines, temp_dict)
    parse_genes(lines, row_dict, gene_dict, entry)

In [45]:
# Turn the collected gene records into a table and save it as a
# tab-separated file without the index column.
df_genes = pd.DataFrame(gene_dict)
# df_genes["KEGG_ORG"].value_counts()
df_genes.to_csv('kegg_genes.27M.tsv', sep = "\t", index = False)  

### Get all KEGG organisms to map NCBI IDs (Python script)
#### Complete df_genes in the map_genes notebook

In [25]:
# Write every distinct KEGG organism code, one per line.  The file is opened in
# "w" mode so that re-running the cell replaces the list instead of appending a
# second copy, and the codes are sorted so the output is the same on every run.
with open("Data/kegg_orgs.txt", "w") as outfile:
    for orgid in sorted(set(df_genes["KEGG_ORG"])):
        print(orgid, file=outfile)

In [31]:
# Number of distinct values in the KEGG_CDS column.
# NOTE(review): ids are compared without their KEGG_ORG, so an id shared by two
# organisms is counted once.
len(set(df_genes["KEGG_CDS"]))

27040763