In [10]:
from typing import Callable
import numpy as np

################################################################
# This notebook parses the MetaCyc PGDB flat files into plain
# Python dictionaries and saves them with np.save for later use.
#
# Obtain and download MetaCyc (a tier 1 BioCyc database) from
# https://biocyc.org/download.shtml
#
# METACYC is the directory the flat files are read from; replace the
# `<path to metacyc>` placeholder with the location of your download.
METACYC = '<path to metacyc>/eco_meta_cyc_25-5/meta/25.1/data'
################################################################

def parse(fpath: str, key: str, fields: dict[str, Callable], root: 'str | None' = None) -> dict:
    """Parse a MetaCyc attribute-value flat file into a dict of records.

    The file is read line by line as ``NAME - value`` entries. Lines starting
    with ``#`` or ``//`` are skipped, and a line starting with a single ``/``
    is appended (leading ``/`` included) to the entry before it. An entry whose
    name equals ``key`` starts a new record.

    Args:
        fpath: File name relative to ``root``, e.g. ``'proteins.dat'``.
        key: Attribute name that opens a record; its value becomes the
            record's key in the returned dict.
        fields: Maps each attribute name to keep onto a parser called with the
            raw string value. If the parser returns a falsy value (``False``,
            ``''``, ``[]`` ...) that occurrence is skipped.
        root: Directory containing ``fpath``. Defaults to the module-level
            ``METACYC`` constant.

    Returns:
        ``{record_id: {attribute: [parsed values, ...]}}``. The ``key``
        attribute itself is removed from each record. A file with no entries
        yields an empty dict.

    Raises:
        KeyError: If attributes were collected for a record that has no
            (non-empty) ``key`` attribute.
        OSError: If the file cannot be opened.
    """
    if root is None:
        root = METACYC
    # The original built the path from the literal text "fpath"; interpolate
    # the argument so the requested file is actually the one opened.
    path = f'{root}/{fpath}'.replace('//', '/')

    parsed = {}
    item = {}

    def push_item():
        """Move the record being built into ``parsed`` and start a new one."""
        nonlocal item
        if not item:  # nothing collected (e.g. empty file) -> nothing to store
            return
        record_id = item.pop(key)[0]
        parsed[record_id] = item
        item = {}

    def add_field(name, value):
        """Append ``value`` to the current record unless it is falsy."""
        if not value:
            return
        item.setdefault(name, []).append(value)

    def parse_field(entry):
        """Split one complete entry into name/value and store it if wanted."""
        name, _, raw = entry.partition(' - ')
        if name != key and name not in fields:
            return
        # Drop only a real trailing newline, so the last entry of a file that
        # lacks a final newline does not lose its last character.
        value = raw[:-1] if raw.endswith('\n') else raw
        if name == key:
            push_item()  # a new key closes the previous record
            add_field(name, value)
        else:
            add_field(name, fields[name](value))

    # `with` guarantees the handle is closed even if a parser raises.
    with open(path, encoding='latin-1') as file:
        entry = ''
        for line in file:
            if line.startswith(('#', '//')):
                continue
            if not line.startswith('/'):
                # A non-continuation line means the previous entry is complete.
                parse_field(entry)
                entry = ''
            entry += line

    # Flush whatever was still pending when the file ended.
    parse_field(entry)
    push_item()
    return parsed

def passfn(x):
    """Identity parser: return the raw field value unchanged."""
    return x

In [11]:
fname = 'proteins.dat'


def _uniprot_dblink(raw):
    """Return the first two space-separated tokens of a UniProt DBLINKS value.

    Values that do not start with ``(UNIPROT`` yield ``False``. Otherwise
    every ``(`` and ``"`` is removed before the value is split on single
    spaces.
    """
    if not raw.startswith('(UNIPROT'):
        return False
    return raw.replace('(', '').replace('"', '').split(' ')[:2]


# Keep the UniProt DBLINKS entries and the COMMON-NAME of every record.
x = parse(fname, 'UNIQUE-ID', {
    'DBLINKS': _uniprot_dblink,
    'COMMON-NAME': passfn,
})
np.save('proteins_uniprot', x)
# Display the first record as a quick sanity check.
list(x)[0], x[list(x)[0]]

('FERREDOXIN-MONOMER',
 {'COMMON-NAME': ['reduced ferredoxin'], 'DBLINKS': [['UNIPROT', 'P0A9R4']]})

In [41]:
fname = 'enzrxns.dat'
# Keep the ENZYME, COMMON-NAME and REACTION attributes of every record as-is.
x = parse(
    fname,
    'UNIQUE-ID',
    dict.fromkeys(('ENZYME', 'COMMON-NAME', 'REACTION'), passfn),
)
# Save under the file name without its extension.
np.save(fname.rsplit('.', 1)[0], x)
# Display the first record as a quick sanity check.
list(x)[0], x[list(x)[0]]

('TRANS-ENZRXN-325',
 {'COMMON-NAME': ['methionine transport'],
  'ENZYME': ['METNIQ-METHIONINE-ABC-CPLX'],
  'REACTION': ['TRANS-RXN0-202']})

In [33]:
fname = 'reactions.dat'
x = parse(fname, 'UNIQUE-ID', {
    # These attributes are stored exactly as they appear in the file.
    **dict.fromkeys(
        (
            'IN-PATHWAY',
            'COMMON-NAME',
            'EC-NUMBER',
            'LEFT',
            'RIGHT',
            'REACTION-DIRECTION',
            'ENZYMATIC-REACTION',
        ),
        passfn,
    ),
    # GIBBS-0 values have surrounding whitespace removed.
    'GIBBS-0': str.strip,
})
# Save under the file name without its extension.
np.save(fname.rsplit('.', 1)[0], x)
# Pick the first record that has exactly seven captured attributes as the
# example to display; if none qualifies, k and v are left at the last record.
for k, v in x.items():
    if len(v) == 7:
        break
k, v

('RXN-13088',
 {'EC-NUMBER': ['EC-1.4.3.24'],
  'ENZYMATIC-REACTION': ['ENZRXN-23736', 'ENZRXN-20031'],
  'GIBBS-0': ['-4.6129684'],
  'IN-PATHWAY': ['PWY66-201', 'PWY-6993'],
  'LEFT': ['CPD-14092', 'OXYGEN-MOLECULE', 'WATER'],
  'REACTION-DIRECTION': ['PHYSIOL-LEFT-TO-RIGHT'],
  'RIGHT': ['CPD-14100', 'METHYLAMINE', 'HYDROGEN-PEROXIDE']})