In [3]:
import re
import numpy as np
WorkDir = 'C:/Users/tinta/OneDrive/Documents/Projects/BRENDA'
DataFile = WorkDir + '/brenda_download.txt'

In [4]:
with open(DataFile, encoding="iso-8859-1") as file:
    data = file.read()


In [5]:
ECidx = [( m.start(), m.end() )
         for m in re.finditer('(?<=ID\\t)(.*)(?=\\n)', data)]

In [17]:
i=4000
data[ECidx[i][0]:ECidx[i][1]]

'2.7.1.35'

In [97]:
import re
import numpy as np

class BRENDA:
    """
    Provides methods to parse the BRENDA database (https://www.brenda-enzymes.org/)
    """
    def __init__(self, path_to_database):
        def getECnumberIndices() -> dict:
            EC_pattern = '(?<=ID\\t)(.*)(?=\\n)'
            return {self.data[m.start():m.end()]: (m.start(), m.end()) 
                    for m in re.finditer(EC_pattern, self.data)}
        with open(path_to_database, encoding="iso-8859-1") as file:
            self.data = file.read()
        self.ECIndices = getECnumberIndices()
        
    def getNextEnzymeIdx(self, ec_number: str) -> int:
        try: 
            key_idx = list(self.ECIndices).index(ec_number)
            if key_idx == len(self.ECIndices):
                return len(self.data)
            else:
                return self.ECIndices[list(self.ECIndices.keys())[key_idx + 1]][0]     
        except:
            raise ValueError("Ec number not in database")
    
    def getKMvalues(self, ec_number: str, substrate: str=None) -> dict:
        """
        Returns a dictionary with all KM values of the enzyme with
        given EC number. If a substrate is given, then results are 
        restricted to that substrate.
        """
        search_indices = (self.ECIndices[ec_number][1], self.getNextEnzymeIdx(ec_number))
        
        def getEnzymeSubstrates(KM_lines):
            substrates = []
            for line in KM_lines:
                sub = extractKMInfo(line)['substrate']
                if sub not in substrates:
                    substrates.append(sub)
            return substrates
        
        def extractKMInfo(KM_line):
            res = {}
            try: 
                species = re.search('#(.+?)#', KM_line).group(1).split(',')
                res['species'] = species
            except:
                res['species'] = ''
            try:
                KM_value = re.search('# (.+?) {', KM_line).group(1)
                res['KM'] = KM_value
            except:
                res['KM'] = np.nan
            try:
                substrate = re.search('{(.+?)}', KM_line).group(1)
                res['substrate'] = substrate
            except:
                res['substrate'] = ''
            try:    
                meta = re.search('\((.+?)\)', KM_line).group(1)
                res['meta'] = meta
            except:
                res['meta'] = ''
            try:
                references = re.search('<(.+?)>', KM_line).group(1)
                res['references'] = references
            except:
                res['references'] = ''
            return res
        
        def getKMlines(search_indices):
            return [p.group(1) for p in re.finditer("KM\t(.+?)\nKM",
                                                    self.data[search_indices[0]:search_indices[1]])]
        
        KM_lines = getKMlines(search_indices)
        enzyme_substrates = getEnzymeSubstrates(KM_lines)
        if substrate is None:
            res = {s: [] for s in enzyme_substrates}
            for line in KM_lines:
                KM_info = extractKMInfo(line)
                res[KM_info['substrate']].append(float(KM_info['KM']))
            return res
        else:
            res = {substrate: []}
            for line in KM_lines:
                KM_info = extractKMInfo(line)
                if KM_info['substrate'] == substrate:
                    res[substrate].append(float(KM_info['KM']))
            return res

In [98]:
brenda = BRENDA(DataFile)

In [96]:
#brenda.getKMvalues("2.7.1.35")
'\nKM\t#15# 0.022 {pyridoxal}  (#15# pH 7, 37Â°C <24>) <24>\nKM\t'
def getKMlines(data, search_indices):
    return [p.group(1) for p in re.finditer("KM\t(.+?)\nKM",
                                            data[search_indices[0]:search_indices[1]])]

#brenda.data[97288258:97360800]
getKMlines(brenda.data, (97288258, 97360800))

['#13# -999 {more}  (#13# effect of KCl on Km-values <17>) <17>',
 '#13# 0.06 {ZnATP2-}  (#13# pH 5.8, 37Â°C <17>) <17>',
 '#13# 0.006 {pyridoxamine}  (#13# pH 6.5 <15>) <15>',
 '#13# 0.101 {MgATP2-}  (#13# pH 5.8, 37Â°C <17>) <17>',
 '#15# 0.022 {pyridoxal}  (#15# pH 7, 37Â°C <24>) <24>',
 '#15# 0.0091 {ATP}  (#15# pH 7, 37Â°C <24>) <24>',
 '#15# 0.127 {ATP}  (#15# pH 7, 37Â°C, recombinant enzyme <24>) <24>',
 '#21# 0.07 {pyridoxal}  (#21# pH 6.5, 30Â°C <32>) <32>',
 '#21# 0.212 {pyridoxine}  (#21# pH 6.5, 30Â°C <32>) <32>',
 '#3# 0.047 {pyridoxal}  (#3# pH 8, 37Â°C <28>) <28>',
 '#32# 0.102 {ATP}  (#32# wild type enzyme, at pH 7.4 and 37Â°C <56>) <56>',
 '#6# 0.17 {pyridoxal}  (#6# D235A mutant protein <48>) <48>',
 '#6# 0.18 {MgATP}  (#6# D235N mutant protein <48>) <48>',
 '#6# 0.49 {pyridoxal}  (#6# at pH 6.5 and 37Â°C <58>) <58>',
 '#6# 0.033 {pyridoxal}  <5>',
 '#6# 0.024 {pyridoxal}  (#6# wild-type protein <48>) <48>',
 '#6# 0.016 {pyridoxine}  <5>',
 '#6# 0.058 {pyridoxal}  (#6

In [99]:
brenda.getKMvalues("2.7.1.35")

{'more': [-999.0],
 'ZnATP2-': [0.06],
 'pyridoxamine': [0.006],
 'MgATP2-': [0.101],
 'pyridoxal': [0.022, 0.07, 0.047, 0.17, 0.49, 0.033, 0.024, 0.058, 0.025],
 'ATP': [0.0091, 0.127, 0.102, 0.012],
 'pyridoxine': [0.212, 0.016, 0.00172],
 'MgATP': [0.18],
 "4'-O-methylpyridoxine": [0.00495]}

In [101]:
brenda.getKMvalues("2.7.1.35", "ATP")

{'ATP': [0.0091, 0.127, 0.102, 0.012]}

In [None]:
"""
REFERENCE
RF	<1> Green, P.N.; Gibson, D.M.: Carbohydrate metabolism in some
	methylotrophic bacteria. FEMS Microbiol. Lett. (1984) 23, 31-34.
	{Pubmed:} (c)
RF	<2> Cline, A.L.; Hu, A.S.L.: The isolation of three sugar
	dehydrogenases from a pseudomonad. J. Biol. Chem. (1965) 240,
	4488-4492. {Pubmed:5845847} (c)
"""

"""
PROTEIN
PR	#1# Sus scrofa   (#1# protein comprises a small N-terminal LPMO10
	module named LPMO10A followed by a family 5/12 CBM and a C-terminal
	GH18 module <9>) <9>
PR	#2# Oryctolagus cuniculus   (#2# fragment of dihydropteroate synthase
	<8>) <8>
PR	#3# Pseudomonas sp.   <2,10>
PR	#4# Acinetobacter sp.   <13>
PR	#5# Agrobacterium tumefaciens   <1,3>
PR	#6# Aureobasidium pullulans   <4>
PR	#7# Ovis aries aries   (#7# adh, fragment <6,7>) <6,7>
PR	#8# Trinickia caryophylli   <5>
PR	#9# Saccharolobus solfataricus Q97YM2 SwissProt <11,12>

RECOMMENDED_NAME
"""