In [2]:
# General
import os
from os.path import join
CURRENT_DIR = os.getcwd()
import time
import requests
import random
import re

# Data wrangling
import pandas as pd
import numpy as np
import itertools

# Data visualisation
from matplotlib import pyplot as plt

# Datasets
from brendapyrser import BRENDA
from rdkit.Chem import Descriptors, MolFromSmiles, rdFingerprintGenerator
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.DataStructs import FingerprintSimilarity
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
import pubchempy as pcp
from Bio import Entrez, SeqIO
import esm

# ML
import torch

### Load BRENDA

In [18]:
dataFile = join(CURRENT_DIR, '..' , 'Data', 'BRENDA', 'brenda_2023_1.txt')
brenda_client = BRENDA(dataFile)
brenda_client

0,1
Number of Enzymes,7754
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


# Enzymes

### Get EC numbers for all Kinases

In [61]:
# Initialize a list to store enzyme information dictionaries
enzyme_info_list = []
for reaction in brenda_client.reactions:
    if bool(re.match(r'^\d+\.\d+\.\d+\.\d+$', reaction.ec_number)):
        enzyme_dict = {
            'ec_number': reaction.ec_number,
            'enzyme': reaction.name,
            'systematic_name': reaction.systematic_name,
            'reaction_type': reaction.reaction_type,
            'substrates': [entry['substrates'] for entry in reaction.substratesAndProducts],
            'products': [entry['products'] for entry in reaction.substratesAndProducts],
        }
        enzyme_info_list.append(enzyme_dict)
        print('Added', reaction.ec_number)

enzyme_df = pd.DataFrame(enzyme_info_list)
print('We have', enzyme_df.shape[0], 'unique enzymes')

Added 1.1.1.1
Added 1.1.1.10
Added 1.1.1.100
Added 1.1.1.101
Added 1.1.1.102
Added 1.1.1.103
Added 1.1.1.104
Added 1.1.1.105
Added 1.1.1.106
Added 1.1.1.107
Added 1.1.1.108
Added 1.1.1.11
Added 1.1.1.110
Added 1.1.1.111
Added 1.1.1.112
Added 1.1.1.113
Added 1.1.1.114
Added 1.1.1.115
Added 1.1.1.116
Added 1.1.1.117
Added 1.1.1.118
Added 1.1.1.119
Added 1.1.1.12
Added 1.1.1.120
Added 1.1.1.121
Added 1.1.1.122
Added 1.1.1.123
Added 1.1.1.124
Added 1.1.1.125
Added 1.1.1.126
Added 1.1.1.127
Added 1.1.1.129
Added 1.1.1.13
Added 1.1.1.130
Added 1.1.1.131
Added 1.1.1.132
Added 1.1.1.133
Added 1.1.1.134
Added 1.1.1.135
Added 1.1.1.136
Added 1.1.1.137
Added 1.1.1.138
Added 1.1.1.14
Added 1.1.1.140
Added 1.1.1.141
Added 1.1.1.142
Added 1.1.1.143
Added 1.1.1.144
Added 1.1.1.145
Added 1.1.1.146
Added 1.1.1.147
Added 1.1.1.148
Added 1.1.1.149
Added 1.1.1.15
Added 1.1.1.150
Added 1.1.1.151
Added 1.1.1.152
Added 1.1.1.153
Added 1.1.1.154
Added 1.1.1.156
Added 1.1.1.157
Added 1.1.1.159
Added 1.1.1.16
A

In [80]:
enzyme_df

Unnamed: 0,ec_number,enzyme,systematic_name,reaction_type,substrates,products
0,1.1.1.1,Alcohol dehydrogenase,alcohol:NAD+ oxidoreductase,Redox reaction,"[[H, NADH, acetaldehyde], [H, NADH, methylglyo...","[[NAD, ethanol], [NAD, acetol], [4-deoxy-L-ery..."
1,1.1.1.10,L-xylulose reductase,xylitol:NADP+ 4-oxidoreductase (L-xylulose-for...,Redox reaction,"[[NADP, xylitol], [L-xylulose, NADH], [H, L-xy...","[[H, L-xylulose, NADPH], [L-xylitol, NAD], [L-..."
2,1.1.1.100,3-oxoacyl-[acyl-carrier-protein] reductase,(3R)-3-hydroxyacyl-[acyl-carrier protein]:NADP...,Redox reaction,"[[NADPH, beta-ketoacyl-[acyl-carrier protein]]...","[[NADP, beta-hydroxyacyl-[acyl-carrier protein..."
3,1.1.1.101,Acylglycerone-phosphate reductase,1-palmitoylglycerol-3-phosphate:NADP+ oxidored...,Oxidation,"[[NADPH, acyldihydroxyacetone phosphate], [H, ...","[[1-acyl-sn-glycerol3-phosphate, NADP], [1-alk..."
4,1.1.1.102,3-dehydrosphinganine reductase,D-erythro-dihydrosphingosine:NADP+ 3-oxidoredu...,Redox reaction,"[[3-oxosphinganine, H, NADPH], [2-amino-1-hydr...","[[NADP, dihydrosphingosine], [2-amino-1,3-dihy..."
...,...,...,...,...,...,...
6609,7.6.2.5,Abc-type heme transporter,"ATP phosphohydrolase (ABC-type, heme-exporting)",Hydrolysis of phosphoric ester,"[[ATP, H2O, heme[side 1]], [ATP, H2O, heme/in]]","[[ADP, heme[side 2], phosphate], [ADP, heme/ou..."
6610,7.6.2.6,Abc-type guanine transporter,"ATP phosphohydrolase (ABC-type, guanine-import...",Hydrolysis of phosphoric ester,"[[ATP, H2O, guanine/out], [ATP, H2O, tryptopha...","[[ADP, guanine/in, phosphate], [ADP, phosphate..."
6611,7.6.2.7,Abc-type taurine transporter,"ATP phosphohydrolase (ABC-type, taurine-import...",Transmembrane transport,"[[ATP, H2O, taurine/out]]","[[ADP, phosphate, taurine/in]]"
6612,7.6.2.8,Abc-type vitamin b12 transporter,"ATP phosphohydrolase (ABC-type, vitamin B12-im...",Hydrolysis of phosphoric ester,"[[ATP, H2O, vitamin B12/out], [ATP, H2O, cobin...","[[ADP, phosphate, vitaminB12/in], [ADP, [cobal..."


### Get Enzyme Sequences

In [63]:
# Function to fetch sequence from UniProt using the enzyme name
def fetch_uniprot_sequence(enzyme_name):
    url = f"https://www.uniprot.org/uniprot/?query={enzyme_name}&format=fasta&limit=1"
    response = requests.get(url)
    if response.ok:
        fasta_data = response.text
        if fasta_data:
            seq_record = SeqIO.read(fasta_data.splitlines(), "fasta")
            return str(seq_record.seq)
    return None

# Ensure you provide your email to NCBI Entrez
Entrez.email = "pcanocarciofa@gmail.com"

# Function to fetch sequence using Entrez and enzyme name
def fetch_ncbi_sequence(enzyme_name):
    try:
        search_handle = Entrez.esearch(db="protein", term=enzyme_name, retmax=1)
        search_results = Entrez.read(search_handle)
        search_handle.close()
        if search_results["IdList"]:
            protein_id = search_results["IdList"][0]
            fetch_handle = Entrez.efetch(db="protein", id=protein_id, rettype="fasta", retmode="text")
            seq_record = SeqIO.read(fetch_handle, "fasta")
            fetch_handle.close()
            return str(seq_record.seq)
    except Exception as e:
        print(f"Error fetching sequence for {enzyme_name}: {e}")
    return None

# List to store enzyme sequences
enzyme_sequences = []

# Iterate over each row in enzymes_df to fetch and add sequences
for index, row in enzyme_df.iterrows():
    enzyme_name = row['enzyme']
    sequence = fetch_uniprot_sequence(enzyme_name)
    if not sequence:
        sequence = fetch_ncbi_sequence(enzyme_name)
    print('Got sequence for', row['ec_number'], row['enzyme'], index + 1, '/', enzyme_df.shape[0])
    enzyme_sequences.append(sequence)

# Add the sequences to the dataframe
enzyme_df['sequence'] = enzyme_sequences
print('We have', enzyme_df['sequence'].notna().sum(), 'enzymes with sequences out of', enzyme_df.shape[0], 'enzymes')

Got sequence for 1.1.1.1 Alcohol dehydrogenase 1 / 6614
Got sequence for 1.1.1.10 L-xylulose reductase 2 / 6614
Got sequence for 1.1.1.100 3-oxoacyl-[acyl-carrier-protein] reductase 3 / 6614
Got sequence for 1.1.1.101 Acylglycerone-phosphate reductase 4 / 6614
Got sequence for 1.1.1.102 3-dehydrosphinganine reductase 5 / 6614
Got sequence for 1.1.1.103 L-threonine 3-dehydrogenase 6 / 6614
Got sequence for 1.1.1.104 4-oxoproline reductase 7 / 6614
Got sequence for 1.1.1.105 All-trans-retinol dehydrogenase (nad+) 8 / 6614
Got sequence for 1.1.1.106 Pantoate 4-dehydrogenase 9 / 6614
Got sequence for 1.1.1.107 Pyridoxal 4-dehydrogenase 10 / 6614
Got sequence for 1.1.1.108 Carnitine 3-dehydrogenase 11 / 6614
Got sequence for 1.1.1.11 D-arabinitol 4-dehydrogenase 12 / 6614
Got sequence for 1.1.1.110 Aromatic 2-oxoacid reductase 13 / 6614
Got sequence for 1.1.1.111 3-(imidazol-5-yl)lactate dehydrogenase 14 / 6614
Got sequence for 1.1.1.112 Indanol dehydrogenase 15 / 6614
Got sequence for 1.1.

ConnectionError: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprot/query=Myosin%20atpase&format=fasta&limit=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x34d0faa30>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [8]:
enzyme_df

Unnamed: 0,ec_number,enzyme,systematic_name,reaction_type,substrates,products,sequence,ESM1b
0,1.1.1.1,Alcohol dehydrogenase,alcohol:NAD+ oxidoreductase,Redox reaction,"[[H, NADH, acetaldehyde], [H, NADH, methylglyo...","[[NAD, ethanol], [NAD, acetol], [4-deoxy-L-ery...",MRALCWNGVNDLRVETVPDPVLVNPRDVILKVGLTTTCGSDLHFID...,"[-0.09155001491308212, 0.178570955991745, -0.0..."
1,1.1.1.10,L-xylulose reductase,xylitol:NADP+ 4-oxidoreductase (L-xylulose-for...,Redox reaction,"[[NADP, xylitol], [L-xylulose, NADH], [H, L-xy...","[[H, L-xylulose, NADPH], [L-xylitol, NAD], [L-...",MDLRLAGRRALVTGAGKGIGCSIVKALHAAGARVVAVSRTQADLDS...,"[-0.01232220884412527, 0.13275030255317688, 0...."
2,1.1.1.100,3-oxoacyl-[acyl-carrier-protein] reductase,(3R)-3-hydroxyacyl-[acyl-carrier protein]:NADP...,Redox reaction,"[[NADPH, beta-ketoacyl-[acyl-carrier protein]]...","[[NADP, beta-hydroxyacyl-[acyl-carrier protein...",MNVLNKIALVTGGGTGIGKAASMELAKRGAIVAVNYSRSQSEAEET...,"[0.06207834184169769, 0.10712146013975143, -0...."
3,1.1.1.101,Acylglycerone-phosphate reductase,1-palmitoylglycerol-3-phosphate:NADP+ oxidored...,Oxidation,"[[NADPH, acyldihydroxyacetone phosphate], [H, ...","[[1-acyl-sn-glycerol3-phosphate, NADP], [1-alk...",MSQTQRRKVAVVTGASSGIGYEVTKELARKGFKVFACARRTAPIEP...,"[0.07941767573356628, 0.23767024278640747, -0...."
4,1.1.1.102,3-dehydrosphinganine reductase,D-erythro-dihydrosphingosine:NADP+ 3-oxidoredu...,Redox reaction,"[[3-oxosphinganine, H, NADPH], [2-amino-1-hydr...","[[NADP, dihydrosphingosine], [2-amino-1,3-dihy...",MELSWEIVLCVGIAVLVHVLIYLFVMGKRPSSIVGRHVVVTGGSKG...,"[0.13308744132518768, 0.24956104159355164, 0.0..."
...,...,...,...,...,...,...,...,...
6609,7.6.2.5,Abc-type heme transporter,"ATP phosphohydrolase (ABC-type, heme-exporting)",Hydrolysis of phosphoric ester,"[[ATP, H2O, heme[side 1]], [ATP, H2O, heme/in]]","[[ADP, heme[side 2], phosphate], [ADP, heme/ou...",MVTVGNYCEAEGPLGPAWAQNGLSPCFFFTLVPSTLMALGALALVL...,"[-0.08002226054668427, 0.2539653182029724, 0.1..."
6610,7.6.2.6,Abc-type guanine transporter,"ATP phosphohydrolase (ABC-type, guanine-import...",Hydrolysis of phosphoric ester,"[[ATP, H2O, guanine/out], [ATP, H2O, tryptopha...","[[ADP, guanine/in, phosphate], [ADP, phosphate...",MSLVNIAGLHKHFGSNHVLKGIDLDVQQGDVVALIGRSGSGKSTLL...,"[0.11096367985010147, 0.10357015579938889, -0...."
6611,7.6.2.7,Abc-type taurine transporter,"ATP phosphohydrolase (ABC-type, taurine-import...",Transmembrane transport,"[[ATP, H2O, taurine/out]]","[[ADP, phosphate, taurine/in]]",MRKLINYQPLPLTRGMMGFLPLLALLLVYLMASDARLAANAADKLL...,"[0.11686123162508011, 0.08438248187303543, -0...."
6612,7.6.2.8,Abc-type vitamin b12 transporter,"ATP phosphohydrolase (ABC-type, vitamin B12-im...",Hydrolysis of phosphoric ester,"[[ATP, H2O, vitamin B12/out], [ATP, H2O, cobin...","[[ADP, phosphate, vitaminB12/in], [ADP, [cobal...",QEVKVKDYFGEQTIKLPVSKIIYLGSFAEVPAMFHTWDRVVGISDY...,"[0.04568294808268547, 0.1668546497821808, -0.0..."


In [1]:
enzyme_df.to_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'enzyme_df_sequences.pkl'))

NameError: name 'enzyme_df' is not defined

### Get ESM-1b Representations for Enzymes

In [None]:
enzyme_df = pd.read_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'enzyme_df_sequences.pkl'))

In [117]:
output_file_path = join(CURRENT_DIR, ".." ,"data", "enzyme_data", "all_sequences.fasta")

# Open the file for writing
with open(join(CURRENT_DIR, ".." ,"Data", "all_sequences.fasta"), "w") as ofile:
    # Iterate over each row in the dataframe
    for index, row in enzyme_df.iterrows():
        seq = row["sequence"]
        if not pd.isnull(seq):
            # Write the sequence in FASTA format
            ofile.write(">" + str(index) + "\n" + seq[:1018]  + "\n")

print(f"FASTA file created at {output_file_path}")

FASTA file created at /Users/pablocanocarciofa/Library/Mobile Documents/com~apple~CloudDocs/Masters/Project/Github/Enzyme-Substrate-Activity-Prediction/Code/../data/enzyme_data/all_sequences.fasta


##### Command Line Code
python extract.py esm1b_t33_650M_UR50S "/Users/pablocanocarciofa/Library/Mobile Documents/com~apple~CloudDocs/Masters/Project/Github/Enzyme-Substrate-Activity-Prediction/Data/all_sequences.fasta" "/Users/pablocanocarciofa/Library/Mobile Documents/com~apple~CloudDocs/Masters/Project/Github/Enzyme-Substrate-Activity-Prediction/Data/ESM_1b" --repr_layers 33 --include mean

In [19]:
def load_embedding(enzyme):
    try:
        embedding_path = join(CURRENT_DIR, "..", "Data", "ESM_1b", f"{enzyme}.pt")
        embedding = torch.load(embedding_path)
        embedding = embedding['mean_representations'][33].numpy().tolist()
        return embedding
    except FileNotFoundError:
        pass

esm1b_series = pd.Series([load_embedding(int(idx)) for idx in enzyme_df.index], index=enzyme_df.index)

# Assign the Series to a new column 'ESM1b' in the DataFrame
enzyme_df['ESM1b'] = esm1b_series
print('We have', enzyme_df['ESM1b'].notna().sum(), 'enzymes with an ESM1b representation, out of', enzyme_df['sequence'].notna().sum(), 'enzymes with a sequence')

We have 5866 enzymes with an ESM1b representation, out of 5824 enzymes with a sequence


### Get ESM-2 Representations for Enzymes

##### Command Line Code
python extract.py esm2_t33_650M_UR50D "/Users/pablocanocarciofa/Library/Mobile Documents/com~apple~CloudDocs/Masters/Project/Github/Enzyme-Substrate-Activity-Prediction/Data/all_sequences.fasta" "/Users/pablocanocarciofa/Library/Mobile Documents/com~apple~CloudDocs/Masters/Project/Github/Enzyme-Substrate-Activity-Prediction/Data/ESM_2" --repr_layers 33 --include mean

In [20]:
def load_embedding(enzyme):
    try:
        embedding_path = join(CURRENT_DIR, "..", "Data", "ESM_2", f"{enzyme}.pt")
        embedding = torch.load(embedding_path)
        embedding = embedding['mean_representations'][33].numpy().tolist()
        return embedding
    except FileNotFoundError:
        pass

esm2_series = pd.Series([load_embedding(int(idx)) for idx in enzyme_df.index], index=enzyme_df.index)

# Assign the Series to a new column 'ESM1b' in the DataFrame
enzyme_df['ESM2'] = esm2_series
print('We have', enzyme_df['ESM2'].notna().sum(), 'enzymes with an ESM2 representation, out of', enzyme_df['sequence'].notna().sum(), 'enzymes with a sequence')

We have 6031 enzymes with an ESM2 representation, out of 5824 enzymes with a sequence


In [21]:
# drop enzymes without a sequence or ESM1b representation
enzyme_df = enzyme_df[~enzyme_df['sequence'].isna()].reset_index(drop=True)

In [22]:
enzyme_df

Unnamed: 0,ec_number,enzyme,systematic_name,reaction_type,substrates,products,sequence,ESM1b,ESM2
0,1.1.1.1,Alcohol dehydrogenase,alcohol:NAD+ oxidoreductase,Redox reaction,"[[H, NADH, acetaldehyde], [H, NADH, methylglyo...","[[NAD, ethanol], [NAD, acetol], [4-deoxy-L-ery...",MRALCWNGVNDLRVETVPDPVLVNPRDVILKVGLTTTCGSDLHFID...,"[-0.09155001491308212, 0.178570955991745, -0.0...","[-0.03778128698468208, 0.019074678421020508, -..."
1,1.1.1.10,L-xylulose reductase,xylitol:NADP+ 4-oxidoreductase (L-xylulose-for...,Redox reaction,"[[NADP, xylitol], [L-xylulose, NADH], [H, L-xy...","[[H, L-xylulose, NADPH], [L-xylitol, NAD], [L-...",MDLRLAGRRALVTGAGKGIGCSIVKALHAAGARVVAVSRTQADLDS...,"[-0.01232220884412527, 0.13275030255317688, 0....","[-0.0575292706489563, -0.06821764260530472, 0...."
2,1.1.1.100,3-oxoacyl-[acyl-carrier-protein] reductase,(3R)-3-hydroxyacyl-[acyl-carrier protein]:NADP...,Redox reaction,"[[NADPH, beta-ketoacyl-[acyl-carrier protein]]...","[[NADP, beta-hydroxyacyl-[acyl-carrier protein...",MNVLNKIALVTGGGTGIGKAASMELAKRGAIVAVNYSRSQSEAEET...,"[0.06207834184169769, 0.10712146013975143, -0....","[0.04713781177997589, -0.06532732397317886, 0...."
3,1.1.1.101,Acylglycerone-phosphate reductase,1-palmitoylglycerol-3-phosphate:NADP+ oxidored...,Oxidation,"[[NADPH, acyldihydroxyacetone phosphate], [H, ...","[[1-acyl-sn-glycerol3-phosphate, NADP], [1-alk...",MSQTQRRKVAVVTGASSGIGYEVTKELARKGFKVFACARRTAPIEP...,"[0.07941767573356628, 0.23767024278640747, -0....","[0.023076308891177177, -0.05737697705626488, -..."
4,1.1.1.102,3-dehydrosphinganine reductase,D-erythro-dihydrosphingosine:NADP+ 3-oxidoredu...,Redox reaction,"[[3-oxosphinganine, H, NADPH], [2-amino-1-hydr...","[[NADP, dihydrosphingosine], [2-amino-1,3-dihy...",MELSWEIVLCVGIAVLVHVLIYLFVMGKRPSSIVGRHVVVTGGSKG...,"[0.13308744132518768, 0.24956104159355164, 0.0...","[0.031040791422128677, -0.06097322329878807, 0..."
...,...,...,...,...,...,...,...,...,...
5819,7.6.2.5,Abc-type heme transporter,"ATP phosphohydrolase (ABC-type, heme-exporting)",Hydrolysis of phosphoric ester,"[[ATP, H2O, heme[side 1]], [ATP, H2O, heme/in]]","[[ADP, heme[side 2], phosphate], [ADP, heme/ou...",MVTVGNYCEAEGPLGPAWAQNGLSPCFFFTLVPSTLMALGALALVL...,"[-0.08002226054668427, 0.2539653182029724, 0.1...","[0.002651719143614173, -0.08383992314338684, -..."
5820,7.6.2.6,Abc-type guanine transporter,"ATP phosphohydrolase (ABC-type, guanine-import...",Hydrolysis of phosphoric ester,"[[ATP, H2O, guanine/out], [ATP, H2O, tryptopha...","[[ADP, guanine/in, phosphate], [ADP, phosphate...",MSLVNIAGLHKHFGSNHVLKGIDLDVQQGDVVALIGRSGSGKSTLL...,"[0.11096367985010147, 0.10357015579938889, -0....","[-0.01749236322939396, -0.051670290529727936, ..."
5821,7.6.2.7,Abc-type taurine transporter,"ATP phosphohydrolase (ABC-type, taurine-import...",Transmembrane transport,"[[ATP, H2O, taurine/out]]","[[ADP, phosphate, taurine/in]]",MRKLINYQPLPLTRGMMGFLPLLALLLVYLMASDARLAANAADKLL...,"[0.11686123162508011, 0.08438248187303543, -0....","[-0.015663068741559982, -0.08384385704994202, ..."
5822,7.6.2.8,Abc-type vitamin b12 transporter,"ATP phosphohydrolase (ABC-type, vitamin B12-im...",Hydrolysis of phosphoric ester,"[[ATP, H2O, vitamin B12/out], [ATP, H2O, cobin...","[[ADP, phosphate, vitaminB12/in], [ADP, [cobal...",QEVKVKDYFGEQTIKLPVSKIIYLGSFAEVPAMFHTWDRVVGISDY...,"[0.04568294808268547, 0.1668546497821808, -0.0...","[0.02499481663107872, -0.0179133377969265, -0...."


In [23]:
enzyme_df.to_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'enzyme_df.pkl'))

# Substrates

### Clean Substrates

### Get SMILES for all substrates of all Kinases

In [3]:
enzyme_df = pd.read_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'enzyme_df.pkl'))

In [26]:
# Initialize a list to store substrates and their Mol objects
substrates_data = []

# Initialize a cache to store previously fetched SMILES and Mol objects
substrate_cache = {}

# Function to get SMILES string from substrate name using PubChem
def get_smiles_from_name(name, retry_attempts=5, retry_delay=5):
    if name in substrate_cache:
        return substrate_cache[name]
    
    attempt = 0
    while attempt < retry_attempts:
        try:
            compounds = pcp.get_compounds(name, 'name')
            if compounds:
                smiles = compounds[0].canonical_smiles
                substrate_cache[name] = smiles
                return smiles
            else:
                return None
        except pcp.PubChemHTTPError as e:
            print(f"PubChem HTTP Error: {e}")
            if e.response.status_code == 503:  # Server busy error
                print(f"Server busy, retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff
                attempt += 1
            else:
                print(f"Error retrieving SMILES for {name}: {e}")
                return None
        except Exception as e:
            print(f"Error retrieving SMILES for {name}: {e}")
            return None
    
    print(f"Failed to retrieve SMILES for {name} after {retry_attempts} attempts")
    return None

# Function to flatten nested lists of substrates
def flatten_substrates(substrates):
    flattened = []
    for substrate in substrates:
        if isinstance(substrate, list):
            flattened.extend(flatten_substrates(substrate))
        else:
            flattened.append(substrate)
    return flattened

# Iterate over each row in enzyme_df
for index, row in enzyme_df.iterrows():
    # Flatten the substrates list
    flattened_substrates = flatten_substrates(row['substrates'])
    
    processed_substrates = set()  # Set to track processed substrates for the current enzyme
    
    for substrate in flattened_substrates:       
        try:
            if substrate in substrate_cache:
                smiles = substrate_cache[substrate]
            else:
                # Convert substrate name to SMILES string using PubChem
                smiles = get_smiles_from_name(substrate)
            if smiles not in processed_substrates:
                substrates_data.append({
                        'enzyme': row['enzyme'],
                        'ec_number': row['ec_number'],
                        'ESM1b': row['ESM1b'],
                        'ESM2': row['ESM2'],
                        'substrate': substrate,
                        'smile': smiles,
                })
                processed_substrates.add(smiles)  # Add substrate to the set
                if smiles:
                    print(row['enzyme'], str(index + 1) + '/' + str(enzyme_df.shape[0]) + ':', substrate, 'added')
                else:
                    print(row['enzyme'], str(index + 1) + '/' + str(enzyme_df.shape[0]) +  ':', substrate, 'NOT FOUND')
        except Exception as e:
            print(f"Error converting substrate {substrate}: {e}")

# Create a DataFrame to store substrates and their Mol objects
substrates_df = pd.DataFrame(substrates_data)

Alcohol dehydrogenase 1 / 5824 : H added
Alcohol dehydrogenase 1 / 5824 : NADH added
Alcohol dehydrogenase 1 / 5824 : acetaldehyde added
Alcohol dehydrogenase 1 / 5824 : methylglyoxal added
Alcohol dehydrogenase 1 / 5824 : 2-dehydro-3-deoxy-D-gluconate added
Alcohol dehydrogenase 1 / 5824 : 4-deoxy-L-erythro-5-hexoseulose NOT FOUND
Alcohol dehydrogenase 1 / 5824 : NAD added
Alcohol dehydrogenase 1 / 5824 : phenylethanol added
Alcohol dehydrogenase 1 / 5824 : cinnamaldehyde added
Alcohol dehydrogenase 1 / 5824 : sinapaldehyde added
Alcohol dehydrogenase 1 / 5824 : ethanol added
Alcohol dehydrogenase 1 / 5824 : allyl alcohol added
Alcohol dehydrogenase 1 / 5824 : octanol added
Alcohol dehydrogenase 1 / 5824 : a primary alcohol added
Alcohol dehydrogenase 1 / 5824 : crotyl alcohol added
Alcohol dehydrogenase 1 / 5824 : all-trans-retinol added
Alcohol dehydrogenase 1 / 5824 : 1-butanol added
L-xylulose reductase 2 / 5824 : NADP added
L-xylulose reductase 2 / 5824 : xylitol added
L-xylulose

In [27]:
substrates_df

Unnamed: 0,enzyme,ec_number,ESM1b,substrate,smiles
0,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",H,[HH]
1,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",NADH,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...
2,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",acetaldehyde,CC=O
3,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",methylglyoxal,CC(=O)C=O
4,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",2-dehydro-3-deoxy-D-gluconate,C(C(C(CO)O)O)C(=O)C(=O)[O-]
...,...,...,...,...,...
13249,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",H2O,O
13250,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",vitamin B12/out,
13251,Abc-type quaternary amine transporter,7.6.2.9,"[0.16964754462242126, 0.11195020377635956, -0....",ATP,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...
13252,Abc-type quaternary amine transporter,7.6.2.9,"[0.16964754462242126, 0.11195020377635956, -0....",H2O,O


In [29]:
substrates_df = substrates_df[substrates_df['smile'].notna()].reset_index(drop=True)

In [30]:
def string_to_float(list):
    return [float(element) for element in list]

In [31]:
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Function to convert a SMILES string to a Morgan fingerprint bit string
def smiles_to_fingerprint(smiles):
    if smiles:
        mol = MolFromSmiles(smiles)
        if mol:
            return string_to_float(list(mfpgen.GetFingerprint(mol).ToBitString()))
    return None

substrates_df['ECFP'] = substrates_df['smile'].apply(smiles_to_fingerprint)

In [32]:
substrates_df

Unnamed: 0,enzyme,ec_number,ESM1b,substrate,smiles,fingerprint
0,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",H,[HH],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",NADH,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",acetaldehyde,CC=O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",methylglyoxal,CC(=O)C=O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",2-dehydro-3-deoxy-D-gluconate,C(C(C(CO)O)O)C(=O)C(=O)[O-],"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
10404,Abc-type taurine transporter,7.6.2.7,"[0.11686123162508011, 0.08438248187303543, -0....",H2O,O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10405,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",ATP,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10406,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",H2O,O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10407,Abc-type quaternary amine transporter,7.6.2.9,"[0.16964754462242126, 0.11195020377635956, -0....",ATP,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [33]:
"""# List of all available descriptors in RDKit
descriptor_names = [desc[0] for desc in Descriptors._descList]

# Create a MolecularDescriptorCalculator
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Function to compute descriptors for a given SMILES string
def compute_descriptors(smiles):
    try:
        mol = MolFromSmiles(smiles)
        return calculator.CalcDescriptors(mol)
    except:
        return [None] * len(descriptor_names)
        
# Compute descriptors for each substrate
descriptors = substrates_df['smiles'].apply(compute_descriptors)

# Create a DataFrame for descriptors
descriptors_df = pd.DataFrame(descriptors.tolist(), columns=descriptor_names)

# Combine the original DataFrame with the descriptors DataFrame
substrates_df = pd.concat([substrates_df, descriptors_df], axis=1)"""

# Remove duplicates
substrates_df = substrates_df.drop_duplicates(subset = ['ec_number', 'smile']).reset_index(drop=True)

In [14]:
substrates_df

Unnamed: 0,enzyme,ec_number,ESM1b,substrate,smiles,fingerprint,ESM2
0,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",H,[HH],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03778128698468208, 0.019074678421020508, -..."
1,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",NADH,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03778128698468208, 0.019074678421020508, -..."
2,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",acetaldehyde,CC=O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03778128698468208, 0.019074678421020508, -..."
3,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",methylglyoxal,CC(=O)C=O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03778128698468208, 0.019074678421020508, -..."
4,Alcohol dehydrogenase,1.1.1.1,"[-0.09155001491308212, 0.178570955991745, -0.0...",2-dehydro-3-deoxy-D-gluconate,C(C(C(CO)O)O)C(=O)C(=O)[O-],"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03778128698468208, 0.019074678421020508, -..."
...,...,...,...,...,...,...,...
10404,Abc-type taurine transporter,7.6.2.7,"[0.11686123162508011, 0.08438248187303543, -0....",H2O,O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.015663068741559982, -0.08384385704994202, ..."
10405,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",ATP,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02499481663107872, -0.0179133377969265, -0...."
10406,Abc-type vitamin b12 transporter,7.6.2.8,"[0.04568294808268547, 0.1668546497821808, -0.0...",H2O,O,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02499481663107872, -0.0179133377969265, -0...."
10407,Abc-type quaternary amine transporter,7.6.2.9,"[0.16964754462242126, 0.11195020377635956, -0....",ATP,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.004809575621038675, -0.039740920066833496, ..."


In [15]:
substrates_df.to_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'substrates_df.pkl'))

# Negative Points

In [16]:
substrates_df = pd.read_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'substrates_df.pkl'))

In [19]:
def create_negatives(df, lower_bound = 0.0, upper_bound = 1.0, num_of_negs = 3):
    unique_smiles = substrates_df['smile'].unique()
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    new_rows = []

    for index, row in df.iterrows():
        original_smile = row['smile']
        original_enzyme = row['enzyme']
        similar_substrates = []
        for unique_smile in unique_smiles:
            if original_smile != unique_smile:
                similarity = FingerprintSimilarity(mfpgen.GetFingerprint(MolFromSmiles(original_smile)), mfpgen.GetFingerprint(MolFromSmiles(unique_smile)))
                if (similarity <= upper_bound) and (similarity >= lower_bound):
                    similar_substrates.append(unique_smile)
        for _ in range(num_of_negs):
            try:
                random_negative = random.choice(similar_substrates)
                print('Negative', str(index + 1) + '/' + str(df.shape[0]), df[df['smile'] == random_negative]['substrate'].iloc[0], 'added for', row['substrate'], 'substrate of', row['enzyme'], 'enzyme')
                if not ((df['enzyme'] == original_enzyme) & (df['substrate'] == random_negative)).any():
                    new_row = df[df['smile'] == random_negative].copy().iloc[0]
                    new_row['enzyme'] = row['enzyme']
                    new_row['ec_number'] = row['ec_number']
                    new_row['ESM1b'] = row['ESM1b']
                    new_row['ESM2'] = row['ESM2']
                    new_row['active'] = 0
                    new_rows.append(new_row)
            except IndexError:
                print('No negatives for', row['substrate'], 'substrate of', row['enzyme'], 'enzyme')
                
    df['active'] = 1
    new_df = pd.DataFrame(new_rows)
    new_df = new_df.drop_duplicates(subset=['smile', 'enzyme']).reset_index(drop=True)
    print('Added', new_df.shape[0], 'negative values to', df.shape[0], 'positive values for total dataset size', new_df.shape[0] + df.shape[0])
    return pd.concat([df, new_df], ignore_index=True)

In [20]:
negatives = create_negatives(substrates_df)

Negative 1/10409 GDP-D-mannose added for H substrate of Alcohol dehydrogenase enzyme
Negative 1/10409 7beta-hydroxycholesterol added for H substrate of Alcohol dehydrogenase enzyme
Negative 1/10409 beta-apo-4'-carotenal added for H substrate of Alcohol dehydrogenase enzyme
Negative 2/10409 2-monooleoylglycerol added for NADH substrate of Alcohol dehydrogenase enzyme
Negative 2/10409 fenbendazole added for NADH substrate of Alcohol dehydrogenase enzyme
Negative 2/10409 protocatechuic acid added for NADH substrate of Alcohol dehydrogenase enzyme
Negative 3/10409 desacetylmycothiol added for acetaldehyde substrate of Alcohol dehydrogenase enzyme
Negative 3/10409 cis-4-hydroxytamoxifen added for acetaldehyde substrate of Alcohol dehydrogenase enzyme
Negative 3/10409 4-methyl-5-nitrocatechol added for acetaldehyde substrate of Alcohol dehydrogenase enzyme
Negative 4/10409 13-alpha-hydroxymultiflorine added for methylglyoxal substrate of Alcohol dehydrogenase enzyme
Negative 4/10409 D-glucos

In [21]:
negatives.to_pickle(join(CURRENT_DIR, '..' , 'Data', 'Dataframes', 'balanced_df.pkl'))