In [3]:
from typing import Callable

def passfn(x):
    return x

# if parser (Callable) returns false, skip
def parse(fpath: str, key: str, fields: dict[str, Callable]) -> dict:
    file = open(fpath, encoding='latin-1')

    parsed = {}
    def pushItem():
        nonlocal item
        ik = item[key][0]
        del item[key]
        parsed[ik] = item
        item = {}

    def addField(f, v):
        if not v: return
        nonlocal item
        if f in item:
            data = item[f]
            data.append(v)
        else:
            item[f] = [v]

    item = {}
    def parseField(entry: str):
        tok = entry.split(' - ')
        field, value = tok[0], ' - '.join(tok[1:])
        
        if len(item)>0 and field == key:
            pushItem()

        if field == key:
            addField(field, value[:-1])
        elif field in fields:
            parser = fields[field]
            addField(field, parser(value[:-1]))

    entry = ''
    while 1:
        line = file.readline()
        if not line: # end
            parseField(entry)
            pushItem()
            break
        if line.startswith('#'): continue
        if line.startswith('//'): continue
        if not line.startswith('/'):
            parseField(entry)
            entry = ''

        entry += line
        
    file.close()
    return parsed

In [4]:
import numpy as np

In [5]:
x = parse('./proteins.dat', 'UNIQUE-ID', {
    'DBLINKS': lambda x: x.replace('(', '').replace('"', '').split(' ')[:2] if x.startswith('(UNIPROT') else False,
    'SPECIES': passfn,
    'GO-TERMS': passfn,
})
np.save('proteins', x)
list(x)[0], x[list(x)[0]]

('FERREDOXIN-MONOMER',
 {'DBLINKS': [['UNIPROT', 'P0A9R4']],
  'GO-TERMS': ['GO:0022900',
   'GO:0051536',
   'GO:0046872',
   'GO:0005829',
   'GO:0016226',
   'GO:0009055',
   'GO:0051537',
   'GO:0005515'],
  'SPECIES': ['TAX-511145']})

In [41]:
fname = './enzrxns.dat'
x = parse(fname, 'UNIQUE-ID', {
    'ENZYME': passfn,
    'COMMON-NAME': passfn,
    'REACTION': passfn,
})
np.save('.'.join(fname.split('.')[:-1]), x)
list(x)[0], x[list(x)[0]]

('TRANS-ENZRXN-325',
 {'COMMON-NAME': ['methionine transport'],
  'ENZYME': ['METNIQ-METHIONINE-ABC-CPLX'],
  'REACTION': ['TRANS-RXN0-202']})

In [10]:
fname = './reactions.dat'
x = parse(fname, 'UNIQUE-ID', {
    'IN-PATHWAY': passfn,
    'GIBBS-0': lambda x: x.strip(),
    'EC-NUMBER': passfn,
    'LEFT': passfn,
    'RIGHT': passfn,
    'REACTION-DIRECTION': passfn,
    'ENZYMATIC-REACTION': passfn,
    'COMMON-NAME': passfn,
})
np.save('.'.join(fname.split('.')[:-1]), x)
for k, v in x.items():
    if len(v)==8:
        break
k, v

('METHYLACYLYLCOA-HYDROXY-RXN',
 {'COMMON-NAME': ['methylacrylyl-CoA hydratase'],
  'EC-NUMBER': ['EC-4.2.1.150'],
  'ENZYMATIC-REACTION': ['ENZRXN-12079', 'ENZRXN-12077'],
  'GIBBS-0': ['-0.8670044'],
  'IN-PATHWAY': ['VALDEG-PWY'],
  'LEFT': ['CPD-12173'],
  'REACTION-DIRECTION': ['REVERSIBLE'],
  'RIGHT': ['METHACRYLYL-COA', 'WATER']})

In [6]:
fname = './pathways.dat'
x = parse(fname, 'UNIQUE-ID', {
    'REACTION-LIST': passfn,
    'REACTION-LAYOUT': passfn,
    'SPECIES': passfn,
    'SUPER-PATHWAYS': passfn,
    'COMMON-NAME': passfn,
    'SPECIES': passfn,
})
np.save('.'.join(fname.split('.')[:-1]), x)
for k, v in x.items():
    if len(v)==7:
        break
k, v

('PWY-7914',
 {'COMMON-NAME': ['coral bioluminescence'],
  'REACTION-LAYOUT': ['(RXN-18875 (:LEFT-PRIMARIES Coelenterazines) (:DIRECTION :L2R) (:RIGHT-PRIMARIES Coelenterazin-dioxetanone))',
   '(RXN-18867 (:LEFT-PRIMARIES CPD-20235) (:DIRECTION :L2R) (:RIGHT-PRIMARIES Excited-GFP))',
   '(RXN-18865 (:LEFT-PRIMARIES Excited-GFP) (:DIRECTION :L2R) (:RIGHT-PRIMARIES Light))',
   '(RXN-18866 (:LEFT-PRIMARIES Coelenterazin-dioxetanone) (:DIRECTION :L2R) (:RIGHT-PRIMARIES CPD-20235))',
   '(RXN-18859 (:LEFT-PRIMARIES Coelenterazine-CBP) (:DIRECTION :L2R) (:RIGHT-PRIMARIES Coelenterazines))',
   '(RXN-18858 (:LEFT-PRIMARIES Coelenterazines) (:DIRECTION :L2R) (:RIGHT-PRIMARIES Coelenterazine-CBP))',
   '(RENILLA-LUCIFERIN-SULFOTRANSFERASE-RXN (:LEFT-PRIMARIES RENILLA-LUCIFERIN) (:DIRECTION :R2L) (:RIGHT-PRIMARIES LUCIFERYL-SULFATE))'],
  'REACTION-LIST': ['RXN-18875',
   'RXN-18867',
   'RXN-18865',
   'RXN-18866',
   'RXN-18859',
   'RXN-18858',
   'RENILLA-LUCIFERIN-SULFOTRANSFERASE-RXN'],


In [20]:
import numpy as np

rxn = np.load('reactions.npy', allow_pickle=True).item()

In [22]:
rxn['RXN-9752']

{'COMMON-NAME': ['2-hydroxylamino-4,6-dinitrotoluene 3-<i>C</i>-glucosyltransferase'],
 'EC-NUMBER': ['EC-2.4.1'],
 'ENZYMATIC-REACTION': ['ENZRXN-15437',
  'ENZRXN-15436',
  'ENZRXN-15435',
  'ENZRXN-15434',
  'ENZRXN-15433',
  'ENZRXN-15432'],
 'GIBBS-0': ['-9.382385'],
 'IN-PATHWAY': ['PWY-6051'],
 'LEFT': ['CPD-12575', 'CPD-10447'],
 'REACTION-DIRECTION': ['LEFT-TO-RIGHT'],
 'RIGHT': ['CPD-10449', 'UDP', 'PROTON']}