# KEGG data

To get relevant data the following needs to be done:

* Download compound, reaction and glycan objects using `get_KEGG.R`
* Download glycan-to-compound links using `wget http://rest.kegg.jp/link/cpd/gl -O "gl-to-cpd-api.txt"`

## Load modules

In [1]:
import re
import pandas as pd
from numpy import NaN as NaN
import json
from collections import defaultdict
from bs4 import BeautifulSoup

## File paths

In [2]:
!pwd -P

/home/jovyan/work/make_graph


In [3]:
# file paths in
# KEGG reaction, compound and glycan tables (tab-separated, read with pd.read_csv)
reactions_file_name       = '/home/jovyan/data/KEGG/KEGG_reactions.tsv'
compounds_file_name       = '/home/jovyan/data/KEGG/KEGG_compounds.tsv'
glycans_file_name         = '/home/jovyan/data/KEGG/KEGG_glycans.tsv'

# glycan-to-compound links, BRITE compound hierarchy (JSON) and MetaCyc reaction directions
glycans_link_name         = '/home/jovyan/data/KEGG/gl-to-cpd-api.txt'
compounds_brite_file_name = '/home/jovyan/data/KEGG/br08001.json'
metacyc_file_name         = '/home/jovyan/data/KEGG/MetaCyc-Directions.txt'

In [4]:
# file paths out
# vertex tables (compounds, reactions) and edge tables (reaction -> product / substrate)
compound_entities_file      = '/home/jovyan/data/import/KEGG_compound_entities.tsv'
reaction_entities_file      = '/home/jovyan/data/import/KEGG_reaction_entities.tsv'
product_relationship_file   = '/home/jovyan/data/import/KEGG_relationship_PRODUCT.tsv'
substrate_relationship_file = '/home/jovyan/data/import/KEGG_relationship_SUBSTRATE.tsv'

## Nonspecific functions

In [5]:
def list_to_string(x):
    """Return the items of *x*, converted with str(), joined by commas."""
    return ",".join(map(str, x))

## Reaction vertices

In [6]:
def detag(x):
    """Strip markup from *x* and return its distinct " // "-separated entries.

    Each entry has surrounding double quotes removed and is upper-cased.
    Any failure (for example a missing, non-string value) yields an empty list.
    The order of the returned list is not defined because it goes through a set.
    """
    try:
        text = BeautifulSoup(x, "html5lib").text
        entries = {part.strip('\"').upper() for part in text.split(" // ")}
        return list(entries)
    except Exception:
        # best effort: unparsable or missing input counts as "no entries"
        return []

In [7]:
# Load the tab-separated KEGG reaction table; the literal ": NULL" is read as a missing value.
df_reactions = pd.read_csv(reactions_file_name, sep="\t", na_values=[": NULL"])

In [8]:
# Create synonym list
# NAME holds alternative names separated by "; ". Reactions without a name
# fall back to their own ENTRY id. Rows are paired with zip, so the result
# does not depend on the index labels, and missing names are detected with
# pd.isnull instead of an identity comparison against NaN.
df_reactions["SYNONYMS"] = [[entry] if pd.isnull(name) else name.split("; ")
                            for name, entry in zip(df_reactions["NAME"], df_reactions["ENTRY"])]
pd.isnull(df_reactions["SYNONYMS"]).sum()

0

In [9]:
# replace name
# NAME becomes the first synonym wrapped in double quotes; embedded double
# quotes are turned into single quotes so the wrapping stays unambiguous.
df_reactions["NAME"] = df_reactions["SYNONYMS"].apply(lambda x: '"%s"'%x[0].replace('"', "'"))

In [10]:
# split enzymes
# ENZYME is a whitespace-separated string; missing values become an empty
# list. The pattern is a raw string so that "\s" is not treated as an
# (invalid) string escape sequence.
df_reactions['ENZYME'] = df_reactions['ENZYME'].apply(lambda x: [] if pd.isnull(x)
                                                      else re.split(r"\s+", x))

In [11]:
# get the pathway ids
# Require at least one digit after "rn": with "*" a bare "rn" inside a
# pathway name (e.g. "ornithine") was also collected as an id.
df_reactions["PATHWAY"] = [[] if pd.isnull(s) else re.findall(r'rn[0-9]+', s)
                           for s in df_reactions["PATHWAY"]]

In [12]:
# double quotes for the rest
# EQUATION is wrapped in double quotes as-is.
df_reactions['EQUATION'] = df_reactions['EQUATION'].apply(lambda x: '"%s"'%x)
# DEFINITION is trimmed, embedded double quotes become single quotes, then it is wrapped.
df_reactions['DEFINITION'] = df_reactions['DEFINITION'].apply(lambda x: '"%s"'%x.strip().replace('"', "'"))

In [13]:
# all lists to strings
# Flatten every list-valued reaction column into one comma-separated string.
for list_column in ("PATHWAY", "SYNONYMS", "ENZYME"):
    df_reactions[list_column] = df_reactions[list_column].apply(list_to_string)

In [14]:
# Placeholder column; the real values are assigned after the edge files are written.
df_reactions["DIRECTION_FROM_METACYC"] = None

In [15]:
# Keep only these columns, in this order.
df_reactions = df_reactions[["ENTRY", "NAME", "SYNONYMS", "DEFINITION", "EQUATION", "ENZYME", "PATHWAY", "DIRECTION_FROM_METACYC"]]

In [16]:
# Rename positionally: ENTRY -> ID, DEFINITION -> NAME_EQUATION, ENZYME -> EC_NUMBERS;
# the other columns keep their names.
df_reactions.columns = ['ID', 
                        'NAME',
                        'SYNONYMS',
                        'NAME_EQUATION', 
                        'EQUATION', 
                        'EC_NUMBERS', 
                        'PATHWAY', 
                        'DIRECTION_FROM_METACYC']

In [17]:
# Index by reaction ID while keeping ID as a regular column (drop=False).
df_reactions.set_index("ID", drop=False, inplace=True)

### Direction from metacyc

In [18]:
# KEGG reaction IDs whose relation to the MetaCyc entry was fixed by hand.
# IDs listed here bypass the automatic left/right compound comparison in
# get_reaction_orientation_metacyc.

# written the same way round as the MetaCyc reaction
manually_checked_same = ["R00026", "R00028", "R00034", "R00544", "R01501", "R00874", "R01101", 
                         "R01263", "R01369", "R01483", "R01484", "R01498", 
                         "R01645", 
                         "R02172", "R02183", "R02185", "R02341", "R02361", "R02465", "R02782", 
                         "R02816", "R02826", "R02963", "R02568", "R02602", "R02714", "R02716", 
                         "R02852", "R03020", "R03032", "R02287", "R03189", "R03204", "R09125", 
                         "R03246", "R03253", "R03256", "R03386", "R03519", "R03617", "R03677", 
                         "R06392", "R09398", "R09709", "R10209", "R10278", "R10469", "R10867", 
                         "R09967", "R09977", "R10029", "R10033", "R10051", "R09609", "R03789", 
                         "R03898", "R03959", "R04061", "R04081", "R04089", "R04104", "R04527"]

# written the other way round than the MetaCyc reaction
manually_checked_reversed = ["R07790", "R02300"]

# not the same reaction as the linked MetaCyc entry; these rows are dropped from metacyc_df
manually_checked_different = ["R02718", "R10331", "R10667"]

In [19]:
# Load the MetaCyc direction table, keeping only the columns used below.
metacyc_df = pd.read_csv(metacyc_file_name, sep="\t", 
                         usecols=["Reaction", "Reaction-Direction", "KEGG-RIGHT", "KEGG-LEFT", "KEGG"])
# Rows without a KEGG link or without a direction are dropped.
metacyc_df.dropna(subset=["KEGG", "Reaction-Direction"], inplace=True)
# The KEGG cell becomes a list of KEGG reaction ids (markup stripped by detag).
metacyc_df["KEGG"] = metacyc_df["KEGG"].apply(detag)

# Expand the id lists into a Series with one entry per (original row, KEGG id);
# dropping the inner level leaves the original row label as index.
b = pd.DataFrame(metacyc_df["KEGG"].tolist(), index=metacyc_df.index).stack()
b.name = "KEGG-RXN"
b.index = b.index.droplevel(1)

# Joining on the row label repeats a MetaCyc row once per linked KEGG id;
# the table is then indexed (and sorted) by that KEGG id.
metacyc_df = metacyc_df.join(b)
metacyc_df.set_index("KEGG-RXN", inplace=True)
metacyc_df.sort_index(inplace=True)

In [20]:
# remove different reactions
# Drops every row indexed by an ID in manually_checked_different.
metacyc_df.drop(manually_checked_different, inplace=True)

In [21]:
# Sort by KEGG reaction id again after the rows above were removed.
metacyc_df.sort_index(inplace=True)

In [22]:
# compounds mislinked in metacyc or tautomers/less/more specific
# Maps a compound id as linked by MetaCyc to the id that is added alongside it.
D = {
    "C00890": "C00404",
    "C00221": "C00031",
    "C00267": "C00031",
    "C01342": "C00014",
    "C14823": "C04822",
    "C00962": "C00124",
    "C02917": "C00583",
    "C01011": "C00904",
    "C00093": "C03189",
    "D07349": "C00670",
    "C16073": "C02658",
    "C16072": "C00726",
    "C02171": "C01117",
    "C00282": "C00030",
    "C04434": "C04324",
    "C04548": "C01869",
}


def replace(l):
    """Return *l* followed by the D-mapped id of every entry that has one.

    The original ids are kept; mapped ids are appended in encounter order.
    """
    substitutes = []
    for compound in l:
        if compound in D:
            substitutes.append(D[compound])
    return l + substitutes

# Turn the raw KEGG-LEFT / KEGG-RIGHT cells into compound id lists (extended
# with the substitutions from D) and remove the raw columns.
metacyc_df["LEFT"] = metacyc_df.pop("KEGG-LEFT").apply(detag).apply(replace)
metacyc_df["RIGHT"] = metacyc_df.pop("KEGG-RIGHT").apply(detag).apply(replace)

In [23]:
def is_same(df):
    """Return True when *df* contains exactly one distinct value."""
    distinct_values = df.unique()
    return len(distinct_values) == 1

In [24]:
# find duplicates by KEGG ID
# For every KEGG id that occurs on more than one row: True when all of its
# rows carry the same Reaction-Direction, False otherwise.
dups_idx = metacyc_df[metacyc_df.index.duplicated(False)]["Reaction-Direction"].groupby(level=0).aggregate(is_same)
# rows of the duplicated ids that agree on direction (merged in the next step)
dups = metacyc_df.loc[dups_idx[dups_idx==True].index]

# delete all rows that don't agree on direction
metacyc_df.drop(dups_idx[dups_idx == False].index, inplace=True)

In [25]:
# replace aggregated rows that agree on direction
# One merged row per KEGG id: MetaCyc reaction names are comma-joined, the
# first direction is kept (all rows of the group agree), and the list-valued
# columns are combined with sum -- presumably list concatenation; verify
# against the pandas version in use.
dups = dups.groupby(level=0).aggregate({"Reaction":lambda x: ','.join(x), 
                                        "KEGG":sum,
                                        "Reaction-Direction":lambda x:x.values[0], 
                                        "RIGHT":sum, 
                                        "LEFT":sum
                                       })
# NOTE(review): the target still holds several rows per id at this point, so
# each of them receives the merged values -- confirm this is the intended result.
metacyc_df.loc[dups_idx[dups_idx==True].index] = dups

### Some reactions are defined by glycans

In [26]:
# glycan links
def f(x):
    """Return the second ':'-separated field of *x* (the part after the prefix)."""
    return x.split(":")[1]


# Two unnamed tab-separated columns: glycan id (becomes the index) and compound id.
df_gl_to_c = pd.read_csv(
    glycans_link_name,
    sep="\t",
    header=None,
    index_col=0,
    names=["gl", "compound"],
    converters={0: f, 1: f},
)
# remove glycans with multiple compound links (need to add separately)
df_gl_to_c = df_gl_to_c.loc[~df_gl_to_c.index.duplicated(False)]

### Edges

In [27]:
def orient_reaction_same(substrate_list, product_list, direction):
    """Orient a reaction that is written the same way round as its MetaCyc entry.

    Parameters
    ----------
    substrate_list, product_list : list
        Compound ids on the left and right side of the reaction.
    direction : str
        MetaCyc "Reaction-Direction" value.

    Returns
    -------
    tuple
        ``(substrate_list, product_list, directed)``. The two lists are the
        objects passed in, swapped for a right-to-left direction. ``directed``
        is False for "REVERSIBLE" and for an unrecognised direction; the
        latter is also reported on stdout. Previously an unrecognised
        direction raised UnboundLocalError.
    """
    left_to_right = ("PHYSIOL-LEFT-TO-RIGHT", "IRREVERSIBLE-LEFT-TO-RIGHT", "LEFT-TO-RIGHT")
    right_to_left = ("PHYSIOL-RIGHT-TO-LEFT", "IRREVERSIBLE-RIGHT-TO-LEFT", "RIGHT-TO-LEFT")

    if direction in left_to_right:
        # no change to orientation
        return substrate_list, product_list, True

    if direction in right_to_left:
        # swap orientation
        return product_list, substrate_list, True

    if direction != "REVERSIBLE":
        print(direction, "why am I here")

    # reversible or unknown: leave the reaction undirected
    return substrate_list, product_list, False

def orient_reaction_reversed(substrate_list, product_list, direction):
    """Orient a reaction that is written the other way round than its MetaCyc entry.

    Parameters
    ----------
    substrate_list, product_list : list
        Compound ids on the left and right side of the reaction.
    direction : str
        MetaCyc "Reaction-Direction" value, expressed for the MetaCyc writing
        of the reaction.

    Returns
    -------
    tuple
        ``(substrate_list, product_list, directed)``. The two lists are the
        objects passed in, swapped for a left-to-right MetaCyc direction.
        ``directed`` is False for "REVERSIBLE" and for an unrecognised
        direction; the latter is also reported on stdout. Previously an
        unrecognised direction raised UnboundLocalError.
    """
    left_to_right = ("PHYSIOL-LEFT-TO-RIGHT", "IRREVERSIBLE-LEFT-TO-RIGHT", "LEFT-TO-RIGHT")
    right_to_left = ("PHYSIOL-RIGHT-TO-LEFT", "IRREVERSIBLE-RIGHT-TO-LEFT", "RIGHT-TO-LEFT")

    if direction in left_to_right:
        # swap orientation
        return product_list, substrate_list, True

    if direction in right_to_left:
        # no change to orientation
        return substrate_list, product_list, True

    if direction != "REVERSIBLE":
        print(direction, "why am I here")

    # reversible or unknown: leave the reaction undirected
    return substrate_list, product_list, False


def get_reaction_orientation_metacyc(substrate_list, product_list, ID):
    """Orient a KEGG reaction using the MetaCyc direction table.

    The reaction is looked up by ``ID`` in the module-level ``metacyc_df``.
    How it relates to the MetaCyc writing is decided in this order:

    1. ``ID`` in ``manually_checked_same`` / ``manually_checked_reversed``:
       use that relation.
    2. ``ID`` in ``manually_checked_different``: MetaCyc is ignored.
    3. Otherwise compare compounds: the writing is "same" when the MetaCyc
       RIGHT side shares a compound with the products and the LEFT side shares
       one with the substrates; "reversed" when the sides match crosswise.

    Parameters
    ----------
    substrate_list, product_list : list
        Compound ids of the left and right side of the KEGG equation.
    ID : str
        KEGG reaction id.

    Returns
    -------
    tuple
        ``(substrate_list, product_list, directed, from_metacyc)``. The lists
        are the objects passed in, possibly swapped. ``from_metacyc`` is False
        when no MetaCyc row exists or no relation could be established; in
        that case the lists are returned unchanged and ``directed`` is False.
    """
    def shares_compound(metacyc_side, kegg_side):
        # True when at least one compound id occurs on both sides
        return any(i == j for i in metacyc_side for j in kegg_side)

    from_metacyc = False
    directed = False

    try:
        m = metacyc_df.loc[ID]
    except KeyError:
        # no metacyc entry for this reaction; only the missing label is
        # tolerated, any other error is allowed to surface
        return substrate_list, product_list, directed, from_metacyc

    direction = m["Reaction-Direction"]

    if ID in manually_checked_same:
        orient = orient_reaction_same
    elif ID in manually_checked_reversed:
        orient = orient_reaction_reversed
    elif ID in manually_checked_different:
        orient = None
    elif shares_compound(m["RIGHT"], product_list) and shares_compound(m["LEFT"], substrate_list):
        # same orientation
        orient = orient_reaction_same
    elif shares_compound(m["RIGHT"], substrate_list) and shares_compound(m["LEFT"], product_list):
        # different orientation
        orient = orient_reaction_reversed
    else:
        orient = None

    if orient is not None:
        from_metacyc = True
        substrate_list, product_list, directed = orient(substrate_list, product_list, direction)

    return substrate_list, product_list, directed, from_metacyc

In [28]:
# Write the SUBSTRATE and PRODUCT edge files (one row per reaction/compound
# pair) and collect, per reaction, the direction derived from MetaCyc.
directions_from_metacyc = []
glycans_to_add = []

# One equation term: optional text before the id, an id made of one letter
# followed by digits, optional text after the id. The text before or after
# carries the stoichiometry.
pattern = r'^(.*?)\s*([a-zA-Z]{1}[\d]+)(.*?)$'
term_re = re.compile(pattern)

header = '%s\t%s\t%s\n'%('rxnID', 'cpdID', 'STOICHIOMETRY')

with open(product_relationship_file, 'w') as out_product, \
        open(substrate_relationship_file, 'w') as out_substrate:
    out_product.write(header)
    out_substrate.write(header)
    for i, row in df_reactions.iterrows():
        ID = row['ID']
        eqn = row['EQUATION']

        try:
            substrates, products = eqn.strip('"').split(' <=> ')
        except ValueError:
            # Stop with a clear message. Merely leaving the loop here made the
            # final column assignment fail with a length mismatch instead.
            raise ValueError('Failed at reaction %s, eqn is %s'%(ID, eqn)) from None

        substrate_list = []
        product_list = []
        # One stoichiometry table per side: a compound may occur on both sides
        # with different coefficients, and a shared table let the product side
        # overwrite the substrate side.
        substrate_stoichiometry = {}
        product_stoichiometry = {}

        for targets, compound_list, stoichiometry_dict in [
                (substrates, substrate_list, substrate_stoichiometry),
                (products,   product_list,   product_stoichiometry)]:

            for t in targets.split(' + '):
                stoichiometry_a, target, stoichiometry_b = term_re.match(t).groups()
                if target[0] == "G":
                    # glycan: use the linked compound if there is exactly one,
                    # otherwise keep the glycan id and remember it as a vertex
                    try:
                        target = df_gl_to_c.loc[target]["compound"]
                    except KeyError:
                        glycans_to_add.append(target)

                compound_list.append(target)

                if stoichiometry_a:
                    stoichiometry = stoichiometry_a.strip("(").strip(")")
                elif stoichiometry_b:
                    stoichiometry = stoichiometry_b.strip("(").strip(")")
                else:
                    stoichiometry = 1

                stoichiometry_dict[target] = stoichiometry

        oriented_substrates, oriented_products, directed, from_metacyc = \
            get_reaction_orientation_metacyc(substrate_list, product_list, ID)

        # The orientation helper hands back the very list objects it was
        # given, possibly swapped; keep each stoichiometry table with its list.
        if oriented_substrates is product_list:
            substrate_stoichiometry, product_stoichiometry = product_stoichiometry, substrate_stoichiometry

        for compound_list, stoichiometry_dict, file_ in (
                (oriented_substrates, substrate_stoichiometry, out_substrate),
                (oriented_products,   product_stoichiometry,   out_product)):
            for target in compound_list:
                file_.write('%s\t%s\t%s\n'%(ID, target, stoichiometry_dict[target]))

        if directed and from_metacyc:
            d = "LEFTtoRIGHT"
        elif from_metacyc:
            d = "REVERSIBLE"
        else:
            d = "None"

        directions_from_metacyc.append(d)
df_reactions ["DIRECTION_FROM_METACYC"] = directions_from_metacyc
    

In [29]:
# Write the reaction vertex table. quoting=3 is csv.QUOTE_NONE: pandas adds no
# quoting of its own, so the double quotes inserted above are written literally.
df_reactions.to_csv(reaction_entities_file, encoding="utf-8", quoting=3, sep='\t', index=False)

## Compound vertices
Includes all compounds, and any glycans referenced in reactions that were not replaced by a compound.

In [30]:
# Load the tab-separated KEGG compound table; the literal ": NULL" is read as a missing value.
df_compounds = pd.read_csv(compounds_file_name, sep="\t", na_values=[": NULL"])

In [31]:
# Load the glycan table and keep only the glycans collected in glycans_to_add
# (glycans used in reaction equations that had no single linked compound).
df_glycans = pd.read_csv(glycans_file_name, sep="\t", na_values=[": NULL"])
df_glycans.set_index("ENTRY", drop=False, inplace=True)
df_glycans = df_glycans.loc[list(set(glycans_to_add))]
# NOTE(review): assumes the glycan table has the same number and order of
# columns as the compound table -- confirm against the input files.
df_glycans.columns = df_compounds.columns

In [32]:
# Add the glycan rows below the compound rows. pd.concat replaces
# DataFrame.append, which was deprecated and later removed from pandas.
df_compounds = pd.concat([df_compounds, df_glycans])

In [33]:
# Create synonym list
# NAME holds alternative names separated by "; ". Entries without a name fall
# back to their own ENTRY id. Rows are paired with zip because, once the
# glycan rows are added, the index is no longer a plain 0..n-1 range and
# looking values up by position number is unreliable.
df_compounds["SYNONYMS"] = [[entry] if pd.isnull(name) else name.split("; ")
                            for name, entry in zip(df_compounds["NAME"], df_compounds["ENTRY"])]
pd.isnull(df_compounds["SYNONYMS"]).sum()

0

In [34]:
# replace name
# NAME becomes the first synonym (unlike the reaction names, no quotes are added).
df_compounds["NAME"] = df_compounds["SYNONYMS"].apply(lambda x: "%s"%x[0])

In [35]:
# get the pathway ids
# Require at least one digit after "map": with "*" a bare "map" inside a
# pathway name was also collected as an id.
df_compounds["PATHWAY"] = [[] if pd.isnull(s) else re.findall(r'map[0-9]+', s)
                           for s in df_compounds["PATHWAY"]]

In [36]:
# brite hierarchy of compounds
# Load the BRITE JSON document; its nested "name"/"children" nodes are walked by tree().
with open(compounds_brite_file_name, "r") as handle:
    j = json.load(handle)

def tree(k):
    """Record the parent of every node below *k* in the module-level ``parents``.

    *k* is a node of the BRITE JSON tree: a dict with a "name" and a list of
    "children". Bracketed text is removed from names. A child without
    "children" of its own is a leaf and is recorded under the first
    space-separated token of its name; other children are recorded under
    their cleaned name and walked recursively.

    Returns nothing; ``parents[child]`` gains one entry per occurrence.
    """
    # raw strings: "\[" and "\]" are regex escapes, not string escapes
    parent = re.sub(r'\[.*\]', '', k["name"]).strip()
    for d in k["children"]:
        child = re.sub(r'\[.*\]', '', d["name"]).strip()
        is_leaf = "children" not in d
        if is_leaf:
            child = child.split(" ")[0]

        parents[child].append(parent)
        if not is_leaf:
            tree(d)
# child (category name or leaf id) -> list of parent category names,
# filled in as a side effect of walking the whole BRITE document
parents = defaultdict(list)
tree(j)

In [37]:
# "Peptides" is listed as a parent of itself; remove that self-reference,
# which would make the recursive ancestor lookup loop on itself.
# NOTE(review): the hard-coded position 1 depends on the ordering of the
# BRITE file -- re-check whenever that file is refreshed.
parents["Peptides"].pop(1)

'Peptides'

In [38]:
def get_ancestors(c):
    """Return all ancestors of *c* according to the module-level ``parents``.

    Ancestors are listed depth-first: each parent is followed immediately by
    its own ancestors. An id that has no entry in ``parents`` yields [].
    """
    lineage = []

    def collect(node):
        # .get avoids creating an empty entry in the defaultdict for unknown nodes
        for parent in parents.get(node, ()):
            lineage.append(parent)
            collect(parent)

    collect(c)
    return lineage

In [39]:
# List of BRITE ancestors for every compound/glycan id (empty when the id is not in the tree).
df_compounds["BRITE_HIERARCHY"] = df_compounds["ENTRY"].apply(get_ancestors)

In [40]:
# all lists to strings
# Flatten every list-valued compound column into one comma-separated string.
for list_column in ("PATHWAY", "SYNONYMS", "BRITE_HIERARCHY"):
    df_compounds[list_column] = df_compounds[list_column].apply(list_to_string)

In [41]:
# Keep only these columns, in this order.
df_compounds = df_compounds[["ENTRY", "NAME", "SYNONYMS", "FORMULA", "PATHWAY", "BRITE_HIERARCHY"]]

In [42]:
# Rename positionally: only ENTRY changes (to ID); the other names stay the same.
df_compounds.columns = ['ID', 
                        'NAME', 
                        'SYNONYMS', 
                        'FORMULA', 
                        'PATHWAY', 
                        'BRITE_HIERARCHY']

In [43]:
# Write the compound vertex table. quoting=3 is csv.QUOTE_NONE: no quoting is applied.
df_compounds.to_csv(compound_entities_file, encoding="utf-8", quoting=3, sep='\t', index=False)