In [5]:
import requests
import pandas as pd
import xml.etree.ElementTree as et
from itertools import chain
from collections import defaultdict
from pathlib import Path

In [13]:
def parse_kgml(f_path):
    """Derive enzyme-to-enzyme links from a KGML pathway file.

    An enzyme A is linked to an enzyme B when a ``<reaction>`` whose ``id``
    equals the ``id`` of A's ``<entry>`` lists a ``<product>`` whose ``id`` is
    also listed as a ``<substrate>`` of a reaction whose ``id`` equals the
    ``id`` of B's entry.

    Every product of a reaction and every reaction consuming a compound is
    followed, so branching pathways yield all of their links rather than only
    the last one encountered in the file.

    Parameters
    ----------
    f_path : str or path-like
        Location of the KGML (XML) file to read.

    Returns
    -------
    pandas.DataFrame
        One row per link with columns ``source`` and ``target`` (names taken
        from the space-separated ``name`` attribute of the enzyme entries) and
        ``pathway_number`` (the ``number`` attribute of the root element).
        The frame is empty, but still has all three columns, when no links
        are found.

    Raises
    ------
    KeyError
        If a required XML attribute (``type``/``name``/``id`` on an entry,
        ``id`` on a reaction, product or substrate, ``number`` on the root)
        is missing.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    root = et.parse(f_path).getroot()

    er_list = []                      # (enzyme name, entry id) in file order
    re_map = defaultdict(list)        # entry/reaction id -> enzyme names
    rp_map = defaultdict(list)        # reaction id -> product ids
    sr_map = defaultdict(list)        # substrate id -> consuming reaction ids

    for c in root:
        if c.tag == 'entry':
            if c.attrib['type'] == 'enzyme':
                rcn = c.attrib['id']
                # A single entry may carry several space-separated enzyme names.
                for ec in c.attrib['name'].split(' '):
                    er_list.append((ec, rcn))
                    re_map[rcn].append(ec)
        elif c.tag == 'reaction':
            for c2 in c:
                if c2.tag == 'product':
                    rp_map[c.attrib['id']].append(c2.attrib['id'])
                elif c2.tag == 'substrate':
                    sr_map[c2.attrib['id']].append(c.attrib['id'])

    links = []
    for ec, rcn in er_list:
        # Two reactions can share more than one compound; emit each
        # downstream reaction only once per source entry.
        seen_targets = set()
        # .get() keeps the defaultdicts from growing empty entries on misses.
        for prod in rp_map.get(rcn, ()):
            for target_rcn in sr_map.get(prod, ()):
                if target_rcn in seen_targets:
                    continue
                seen_targets.add(target_rcn)
                for x in re_map.get(target_rcn, ()):
                    links.append((ec, x))

    link_df = pd.DataFrame.from_records(links, columns=['source', 'target'])
    link_df['pathway_number'] = root.attrib['number']
    return link_df

In [7]:
# Root folder for the pathway manifest and the files derived from it.
target_dir = Path('../pathways')
manifest_path = target_dir / 'pathway_manifest.csv'
# Every column is read as text -- presumably so zero-padded pathway numbers
# (e.g. 01200) are not converted to integers; confirm against the manifest.
all_pathways = pd.read_csv(manifest_path, dtype='str')

In [None]:
# For every pathway in the manifest: fetch its KGML file from the KEGG REST
# API unless a cached copy exists, then write the parsed links to CSV unless
# that CSV already exists. Re-running the cell only does the missing work.
raw_dir = target_dir / 'raw'
parsed_dir = target_dir / 'parsed'
# Make sure the output folders exist so the first write cannot fail.
raw_dir.mkdir(parents=True, exist_ok=True)
parsed_dir.mkdir(parents=True, exist_ok=True)

for row in all_pathways.to_dict('records'):
    n = row['number']
    # download xml if needed
    xml_path = raw_dir / f'{n}.kgml'
    xml_ok = xml_path.is_file()
    if not xml_ok:
        try:
            # A timeout keeps one stalled connection from blocking the run.
            resp = requests.get(f'https://rest.kegg.jp/get/ec{n}/kgml', timeout=30)
        except requests.RequestException:
            # Network-level failures are reported like HTTP failures so the
            # remaining pathways are still processed.
            resp = None
        if resp is not None and resp.status_code == 200:
            # Store the raw bytes: the XML parser reads the encoding from the
            # document itself, so no locale-dependent re-encoding happens.
            xml_path.write_bytes(resp.content)
            xml_ok = True
        else:
            print(f'{n} request failed')

    # parse if needed
    target_path = parsed_dir / f'{n}.csv'
    if xml_ok and not target_path.is_file():
        links_df = parse_kgml(xml_path)
        links_df.to_csv(target_path, index=False)

01200 request failed
01210 request failed
01212 request failed
01230 request failed
01232 request failed
01250 request failed
01240 request failed
01220 request failed
00196 request failed
00542 request failed
00543 request failed
01052 request failed
01054 request failed
00403 request failed
01010 request failed
01060 request failed
01061 request failed
01062 request failed
01063 request failed
01064 request failed
01065 request failed
01066 request failed
01070 request failed
