<a href="https://colab.research.google.com/github/Swayamprakashpatel/Sol_ME/blob/main/GNN_Solubility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SMILES to Graph Neural Network for Solubility Prediction

In [1]:
!pip install rdkit-pypi tensorflow tensorflow-addons


Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, rdkit-pypi, tensorflow-addons
  Attempting u

In [2]:
!pip install rdkit-pypi tensorflow tensorflow-addons pubchempy


Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=16f1605876606fbf4b31f66f739f04f1a62be4dbefc37a219c687590a7eb5241
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [3]:
!pip install pubchempy



In [11]:
import pandas as pd
import pubchempy as pcp
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Input, Add
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# Load the dataset from the fixed absolute path /content/GNN.csv.
# The rest of this cell reads the columns 'drug_cid', 'solvent_cid' and
# 'solubility' from this frame.
# NOTE(review): presumably the file is uploaded to the Colab runtime first - confirm.
df = pd.read_csv('/content/GNN.csv')

# Function to convert PubChem CID to SMILES
# Successful lookups are remembered here so a CID that appears in many rows
# only hits the PubChem server once.
_SMILES_CACHE = {}


def cid_to_smiles(cid, retries=3, backoff=1.0):
    """Look up the canonical SMILES string of a PubChem compound.

    Transient failures (network errors, 'ServerBusy' or timeout responses) are
    retried with exponential backoff; other errors are reported immediately.

    Args:
        cid: PubChem compound identifier.
        retries: Maximum number of attempts (at least one is always made).
        backoff: Seconds to wait before the second attempt; doubled after
            every further failed attempt.

    Returns:
        The SMILES string, or None when the lookup fails. Failures are
        printed, never raised, so the function is safe to use with
        ``Series.apply``.
    """
    import time  # local import keeps this block self-contained

    try:
        return _SMILES_CACHE[cid]
    except (KeyError, TypeError):
        # Not cached yet, or an unhashable value that the lookup below will
        # report as a normal conversion error.
        pass

    attempts = max(1, int(retries))
    last_error = None
    for attempt in range(attempts):
        try:
            smiles = pcp.Compound.from_cid(cid).canonical_smiles
        except Exception as e:
            last_error = e
            message = str(e)
            transient = (isinstance(e, OSError)
                         or 'ServerBusy' in message
                         or 'Timeout' in message)
            if not transient or attempt + 1 == attempts:
                break
            time.sleep(backoff * (2 ** attempt))
            continue

        if smiles:
            try:
                _SMILES_CACHE[cid] = smiles
            except TypeError:
                pass
        return smiles

    print(f"Error converting CID {cid}: {last_error}")
    return None

# Convert CIDs to SMILES in the dataset.
# cid_to_smiles is called once per row (not once per distinct CID) and
# returns None when a lookup fails, so both new columns can contain None.
# The frame is modified in place.
df['drug_smiles'] = df['drug_cid'].apply(cid_to_smiles)
df['solvent_smiles'] = df['solvent_cid'].apply(cid_to_smiles)

# Function to convert SMILES to a graph
def smiles_to_graph(smiles):
    """Convert a SMILES string into node-feature and edge-index arrays.

    Args:
        smiles: SMILES string. Anything that is not a non-empty string
            (None or NaN left behind by a failed lookup, for example) is
            treated as unparseable instead of being passed on to RDKit.

    Returns:
        None when the input is missing or RDKit cannot parse it, otherwise a
        tuple ``(atom_features, edge_index)``:
            atom_features: float32 array of shape (num_atoms, 1) holding the
                atomic number of every atom.
            edge_index: int32 array of shape (2 * num_bonds, 2); each bond
                contributes the pairs (i, j) and (j, i).
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # 2D coordinates are intentionally not computed: nothing below reads them.
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    edges = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edges.append((i, j))
        edges.append((j, i))
    atom_features = np.array(atoms, dtype=np.float32).reshape(-1, 1)
    # reshape keeps the (n, 2) layout even for molecules without bonds
    edge_index = np.array(edges, dtype=np.int32).reshape(-1, 2)
    return atom_features, edge_index

# Process dataset into graph objects.
# A row is kept only when BOTH its drug and its solvent SMILES convert to a graph.
# NOTE(review): the solvent graph is used only for this filter - just the drug
# graph is stored in data_list, so nothing downstream sees the solvent
# structure. Confirm this is intended.
data_list = []
solubilities = []
for index, row in df.iterrows():
    drug_graph = smiles_to_graph(row['drug_smiles'])
    solvent_graph = smiles_to_graph(row['solvent_smiles'])
    if drug_graph is not None and solvent_graph is not None:
        data_list.append(drug_graph)
        solubilities.append(row['solubility'])

# Split data into training and testing sets: the first 80% of the kept rows
# train the model, the remainder is held out. The split follows file order;
# no shuffling is done here.
split_index = int(0.8 * len(data_list))
train_data = data_list[:split_index]
train_labels = solubilities[:split_index]
test_data = data_list[split_index:]
test_labels = solubilities[split_index:]

# Custom GCN Layer
class GraphConvolution(tf.keras.layers.Layer):
    """Linear graph convolution: ``adj @ (features @ kernel)``.

    The layer is called with a two-element list ``[features, adj]``. Its only
    weight is a ``(feature_dim, units)`` kernel; there is no bias and no
    activation, and the adjacency matrix is used exactly as supplied.

    Args:
        units: Number of output features per node.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
            Accepting them is required for Keras to rebuild the layer from
            its saved configuration.
    """

    def __init__(self, units, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # input_shape is [features_shape, adj_shape]; the kernel maps the last
        # feature axis onto `units` outputs.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[0][-1], self.units),
                                      initializer='glorot_uniform',
                                      trainable=True)

    def call(self, inputs):
        features, adj = inputs
        return tf.matmul(adj, tf.matmul(features, self.kernel))

    def get_config(self):
        # Serialize `units` so a saved model can recreate this layer.
        config = super(GraphConvolution, self).get_config()
        config.update({'units': self.units})
        return config

# Define the GNN model using custom GCN layers
def create_gnn_model(input_shape):
    """Build and compile the graph-convolution regression model.

    Args:
        input_shape: Shape of the padded node-feature array,
            ``(num_graphs, num_nodes, feature_dim)``. Only ``num_nodes`` and
            ``feature_dim`` are used; ``feature_dim`` falls back to 1 when the
            shape has fewer than three entries.

    Returns:
        A compiled Keras model (Adam optimizer, MSE loss) taking
        ``[features, adjacency]`` and producing one value per graph.
    """
    num_nodes = input_shape[1]
    feature_dim = input_shape[2] if len(input_shape) > 2 else 1

    features = Input(shape=(num_nodes, feature_dim))
    adj = Input(shape=(num_nodes, num_nodes))

    # Activations are Keras layers rather than raw tf.nn ops: Keras 3 refuses
    # to pass symbolic functional-API tensors to plain TensorFlow functions.
    x = GraphConvolution(16)([features, adj])
    x = tf.keras.layers.ReLU()(x)
    x = GraphConvolution(32)([x, adj])
    x = tf.keras.layers.ReLU()(x)
    x = GraphConvolution(1)([x, adj])
    x = Flatten()(x)
    output = Dense(1)(x)

    model = Model(inputs=[features, adj], outputs=output)
    model.compile(optimizer='adam', loss='mse')
    return model

# Prepare data for training
def prepare_data(data_list, labels, num_nodes=None):
    """Pad a list of graphs into dense, fixed-size batch arrays.

    Args:
        data_list: Sequence of ``(atom_features, edge_index)`` tuples, where
            atom_features has shape (n_atoms, 1) and edge_index holds one
            ``(i, j)`` node-index pair per row.
        labels: Sequence of numeric targets, one per graph.
        num_nodes: Node count to pad every graph to. Defaults to the size of
            the largest graph in ``data_list``. Pass the same value for the
            training and the test split so both match the model input shape.

    Returns:
        Tuple ``(atom_features_padded, adj_matrices, labels)`` of float32
        arrays with shapes (n_graphs, num_nodes, 1),
        (n_graphs, num_nodes, num_nodes) and (n_graphs,).

    Raises:
        ValueError: If a graph has more atoms than ``num_nodes``.
    """
    atom_features = [data[0] for data in data_list]
    edge_indices = [data[1] for data in data_list]

    # default=0 keeps an empty split from crashing in max()
    largest = max((features.shape[0] for features in atom_features), default=0)
    if num_nodes is None:
        num_nodes = largest
    elif largest > num_nodes:
        raise ValueError(
            f"num_nodes={num_nodes} is smaller than the largest graph ({largest} atoms)")

    n_graphs = len(atom_features)
    atom_features_padded = np.zeros((n_graphs, num_nodes, 1), dtype=np.float32)
    adj_matrices = np.zeros((n_graphs, num_nodes, num_nodes), dtype=np.float32)

    for k, (features, edges) in enumerate(zip(atom_features, edge_indices)):
        atom_features_padded[k, :features.shape[0], :] = features
        # reshape also accepts the flat empty array of a bond-less molecule
        edges = np.asarray(edges, dtype=np.int64).reshape(-1, 2)
        if edges.size:
            adj_matrices[k, edges[:, 0], edges[:, 1]] = 1.0

    labels = np.array(labels, dtype=np.float32)

    return atom_features_padded, adj_matrices, labels

# Pad the training and test graphs together so both splits share one node
# count. Padding them separately gives each split its own maximum size, and
# the test tensors then no longer match the input shape the model was built for.
num_train = len(train_data)
all_atom_features, all_adj_matrices, all_labels = prepare_data(
    list(train_data) + list(test_data),
    list(train_labels) + list(test_labels),
)
train_atom_features = all_atom_features[:num_train]
train_adj_matrices = all_adj_matrices[:num_train]
train_labels = all_labels[:num_train]
test_atom_features = all_atom_features[num_train:]
test_adj_matrices = all_adj_matrices[num_train:]
test_labels = all_labels[num_train:]

# Create and train the model
model = create_gnn_model(train_atom_features.shape)
batch_size = 32
epochs = 100

model.fit([train_atom_features, train_adj_matrices], train_labels, batch_size=batch_size, epochs=epochs, verbose=1)
test_loss = model.evaluate([test_atom_features, test_adj_matrices], test_labels, verbose=0)
print(f'Test Loss: {test_loss:.4f}')

# Save the trained model
model.save('gnn_model_tf.h5')


Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 932: 'PUGREST.ServerBusy'
Error converting CID 2162: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2202: 'PUGREST.ServerBusy'
Error converting CID 2519: 'PUGREST.ServerBusy'
Error converting CID 2519: 'PUGREST.ServerBusy'
Erro

KeyboardInterrupt: 

In [10]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow_addons.layers import GCNConv
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# Load the dataset from the fixed absolute path /content/GNN.csv.
# This cell reads 'drug_smiles', 'solvent_smiles' and 'solubility' from the
# frame but never creates the SMILES columns itself, so after this reload
# they must already exist in the CSV file.
df = pd.read_csv('/content/GNN.csv')

# Function to convert SMILES to a graph
def smiles_to_graph(smiles):
    """Convert a SMILES string into node-feature and edge-index arrays.

    Args:
        smiles: SMILES string. Anything that is not a non-empty string
            (None or NaN from a missing value, for example) is treated as
            unparseable instead of being passed on to RDKit.

    Returns:
        None when the input is missing or RDKit cannot parse it, otherwise a
        tuple ``(atom_features, edge_index)``:
            atom_features: float32 array of shape (num_atoms, 1) holding the
                atomic number of every atom.
            edge_index: int32 array of shape (2 * num_bonds, 2); each bond
                contributes the pairs (i, j) and (j, i).
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # 2D coordinates are intentionally not computed: nothing below reads them.
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    edges = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edges.append((i, j))
        edges.append((j, i))
    atom_features = np.array(atoms, dtype=np.float32).reshape(-1, 1)
    # reshape keeps the (n, 2) layout even for molecules without bonds
    edge_index = np.array(edges, dtype=np.int32).reshape(-1, 2)
    return atom_features, edge_index

# Process dataset into graph objects.
# A row is kept only when BOTH its drug and its solvent SMILES convert to a graph.
# NOTE(review): the solvent graph is used only for this filter - just the drug
# graph is stored in data_list. Confirm this is intended.
data_list = []
solubilities = []
for index, row in df.iterrows():
    drug_graph = smiles_to_graph(row['drug_smiles'])
    solvent_graph = smiles_to_graph(row['solvent_smiles'])
    if drug_graph is not None and solvent_graph is not None:
        data_list.append(drug_graph)
        solubilities.append(row['solubility'])

# Split data into training and testing sets: first 80% of the kept rows for
# training, the rest for testing, in file order (no shuffling).
split_index = int(0.8 * len(data_list))
train_data = data_list[:split_index]
train_labels = solubilities[:split_index]
test_data = data_list[split_index:]
test_labels = solubilities[split_index:]

# Define the GNN model using TensorFlow Sequential API
# NOTE(review): the recorded output of this cell ends with
# "ModuleNotFoundError: No module named 'keras.src.engine'" right after the
# TensorFlow Addons compatibility warning, so this code did not run in the
# saved session (presumably the tensorflow_addons import failed - confirm).
# NOTE(review): verify that tensorflow_addons.layers really provides GCNConv.
def create_gnn_model():
    """Build and compile a Sequential stack: three GCNConv layers (16, 32 and
    1 units), Flatten, then Dense(1), compiled with Adam and MSE loss.

    NOTE(review): layers in a Sequential model receive a single input, while
    the training loop in this cell feeds a two-element list (atom features
    and edge indices) - confirm how the edges are meant to reach GCNConv.
    """
    model = Sequential()
    model.add(GCNConv(16, activation='relu'))
    model.add(GCNConv(32, activation='relu'))
    model.add(GCNConv(1))
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model

model = create_gnn_model()

# Prepare data for training
def prepare_data(data_list, labels):
    """Turn a list of (atom_features, edge_index) tuples into ragged tensors.

    Returns a tuple (atom_features, edge_indices, labels): the first two are
    built with tf.ragged.constant (float32 and int32), labels is a float32
    NumPy array.

    NOTE(review): the list elements handed to tf.ragged.constant are NumPy
    arrays here - confirm it accepts them and yields the intended ragged rank.
    """
    atom_features = [data[0] for data in data_list]
    edge_indices = [data[1] for data in data_list]
    atom_features = tf.ragged.constant(atom_features, dtype=tf.float32)
    edge_indices = tf.ragged.constant(edge_indices, dtype=tf.int32)
    labels = np.array(labels, dtype=np.float32)
    return atom_features, edge_indices, labels

# train_labels / test_labels are rebound from Python lists to NumPy arrays here.
train_atom_features, train_edge_indices, train_labels = prepare_data(train_data, train_labels)
test_atom_features, test_edge_indices, test_labels = prepare_data(test_data, test_labels)

# Training loop
batch_size = 32
epochs = 100

# Each iteration trains for exactly one epoch and then evaluates on the test
# split. The test loss is only printed; it does not influence training.
for epoch in range(epochs):
    model.fit([train_atom_features, train_edge_indices], train_labels, batch_size=batch_size, epochs=1, verbose=1)
    test_loss = model.evaluate([test_atom_features, test_edge_indices], test_labels, verbose=0)
    print(f'Epoch {epoch+1}, Test Loss: {test_loss:.4f}')

# Save the trained model (same file name as the other training cells in this
# notebook, so each run overwrites the previous file).
model.save('gnn_model_tf.h5')



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.17.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


ModuleNotFoundError: No module named 'keras.src.engine'

In [None]:
import pandas as pd
import requests
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Input
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# Load the dataset from the fixed absolute path /content/GNN.csv.
# The rest of this cell reads the columns 'drug_cid', 'solvent_cid' and
# 'solubility' from this frame.
# NOTE(review): presumably the file is uploaded to the Colab runtime first - confirm.
df = pd.read_csv('/content/GNN.csv')

# Function to convert PubChem CID to SMILES using PubChem REST API
# Successful lookups are remembered here so a CID that appears in many rows
# only hits the PubChem server once.
_SMILES_CACHE = {}


def cid_to_smiles(cid, retries=3, backoff=1.0, timeout=10.0):
    """Fetch the canonical SMILES string of a PubChem compound over HTTP.

    Connection problems and throttling/server-side status codes (429, 500,
    502, 503, 504) are retried with exponential backoff; any other failure is
    reported immediately.

    Args:
        cid: PubChem compound identifier.
        retries: Maximum number of attempts (at least one is always made).
        backoff: Seconds to wait before the second attempt; doubled after
            every further failed attempt.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        The SMILES string, or None when the lookup fails. Failures are
        printed, never raised, so the function is safe to use with
        ``Series.apply``.
    """
    import time  # local import keeps this block self-contained

    try:
        return _SMILES_CACHE[cid]
    except (KeyError, TypeError):
        # Not cached yet, or an unhashable value that the request below will
        # report as a normal conversion error.
        pass

    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES/TXT'
    attempts = max(1, int(retries))
    last_error = None
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
            smiles = response.text.strip()
        except Exception as e:
            last_error = e
            # HTTP errors carry the response; connection errors and timeouts do not.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            transient = isinstance(e, OSError) and (
                status is None or status in (429, 500, 502, 503, 504))
            if not transient or attempt + 1 == attempts:
                break
            time.sleep(backoff * (2 ** attempt))
            continue

        if smiles:
            try:
                _SMILES_CACHE[cid] = smiles
            except TypeError:
                pass
        return smiles

    print(f"Error converting CID {cid}: {last_error}")
    return None

# Convert CIDs to SMILES in the dataset.
# cid_to_smiles is called once per row (not once per distinct CID) and
# returns None when a lookup fails, so both new columns can contain None.
# The frame is modified in place.
df['drug_smiles'] = df['drug_cid'].apply(cid_to_smiles)
df['solvent_smiles'] = df['solvent_cid'].apply(cid_to_smiles)

# Function to convert SMILES to a graph
def smiles_to_graph(smiles):
    """Convert a SMILES string into node-feature and edge-index arrays.

    Args:
        smiles: SMILES string. Anything that is not a non-empty string
            (None or NaN left behind by a failed lookup, for example) is
            treated as unparseable instead of being passed on to RDKit.

    Returns:
        None when the input is missing or RDKit cannot parse it, otherwise a
        tuple ``(atom_features, edge_index)``:
            atom_features: float32 array of shape (num_atoms, 1) holding the
                atomic number of every atom.
            edge_index: int32 array of shape (2 * num_bonds, 2); each bond
                contributes the pairs (i, j) and (j, i).
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # 2D coordinates are intentionally not computed: nothing below reads them.
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    edges = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edges.append((i, j))
        edges.append((j, i))
    atom_features = np.array(atoms, dtype=np.float32).reshape(-1, 1)
    # reshape keeps the (n, 2) layout even for molecules without bonds
    edge_index = np.array(edges, dtype=np.int32).reshape(-1, 2)
    return atom_features, edge_index

# Process dataset into graph objects.
# A row is kept only when BOTH its drug and its solvent SMILES convert to a graph.
# NOTE(review): the solvent graph is used only for this filter - just the drug
# graph is stored in data_list, so nothing downstream sees the solvent
# structure. Confirm this is intended.
data_list = []
solubilities = []
for index, row in df.iterrows():
    drug_graph = smiles_to_graph(row['drug_smiles'])
    solvent_graph = smiles_to_graph(row['solvent_smiles'])
    if drug_graph is not None and solvent_graph is not None:
        data_list.append(drug_graph)
        solubilities.append(row['solubility'])

# Split data into training and testing sets: the first 80% of the kept rows
# train the model, the remainder is held out. The split follows file order;
# no shuffling is done here.
split_index = int(0.8 * len(data_list))
train_data = data_list[:split_index]
train_labels = solubilities[:split_index]
test_data = data_list[split_index:]
test_labels = solubilities[split_index:]

# Custom GCN Layer
class GraphConvolution(tf.keras.layers.Layer):
    """Linear graph convolution: ``adj @ (features @ kernel)``.

    The layer is called with a two-element list ``[features, adj]``. Its only
    weight is a ``(feature_dim, units)`` kernel; there is no bias and no
    activation, and the adjacency matrix is used exactly as supplied.

    Args:
        units: Number of output features per node.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
            Accepting them is required for Keras to rebuild the layer from
            its saved configuration.
    """

    def __init__(self, units, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # input_shape is [features_shape, adj_shape]; the kernel maps the last
        # feature axis onto `units` outputs.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[0][-1], self.units),
                                      initializer='glorot_uniform',
                                      trainable=True)

    def call(self, inputs):
        features, adj = inputs
        return tf.matmul(adj, tf.matmul(features, self.kernel))

    def get_config(self):
        # Serialize `units` so a saved model can recreate this layer.
        config = super(GraphConvolution, self).get_config()
        config.update({'units': self.units})
        return config

# Define the GNN model using custom GCN layers
def create_gnn_model(input_shape):
    """Build and compile the graph-convolution regression model.

    Args:
        input_shape: Shape of the padded node-feature array,
            ``(num_graphs, num_nodes, feature_dim)``. Only ``num_nodes`` and
            ``feature_dim`` are used; ``feature_dim`` falls back to 1 when the
            shape has fewer than three entries.

    Returns:
        A compiled Keras model (Adam optimizer, MSE loss) taking
        ``[features, adjacency]`` and producing one value per graph.
    """
    num_nodes = input_shape[1]
    feature_dim = input_shape[2] if len(input_shape) > 2 else 1

    features = Input(shape=(num_nodes, feature_dim))
    adj = Input(shape=(num_nodes, num_nodes))

    # Activations are Keras layers rather than raw tf.nn ops: Keras 3 refuses
    # to pass symbolic functional-API tensors to plain TensorFlow functions.
    x = GraphConvolution(16)([features, adj])
    x = tf.keras.layers.ReLU()(x)
    x = GraphConvolution(32)([x, adj])
    x = tf.keras.layers.ReLU()(x)
    x = GraphConvolution(1)([x, adj])
    x = Flatten()(x)
    output = Dense(1)(x)

    model = Model(inputs=[features, adj], outputs=output)
    model.compile(optimizer='adam', loss='mse')
    return model

# Prepare data for training
def prepare_data(data_list, labels, num_nodes=None):
    """Pad a list of graphs into dense, fixed-size batch arrays.

    Args:
        data_list: Sequence of ``(atom_features, edge_index)`` tuples, where
            atom_features has shape (n_atoms, 1) and edge_index holds one
            ``(i, j)`` node-index pair per row.
        labels: Sequence of numeric targets, one per graph.
        num_nodes: Node count to pad every graph to. Defaults to the size of
            the largest graph in ``data_list``. Pass the same value for the
            training and the test split so both match the model input shape.

    Returns:
        Tuple ``(atom_features_padded, adj_matrices, labels)`` of float32
        arrays with shapes (n_graphs, num_nodes, 1),
        (n_graphs, num_nodes, num_nodes) and (n_graphs,).

    Raises:
        ValueError: If a graph has more atoms than ``num_nodes``.
    """
    atom_features = [data[0] for data in data_list]
    edge_indices = [data[1] for data in data_list]

    # default=0 keeps an empty split from crashing in max()
    largest = max((features.shape[0] for features in atom_features), default=0)
    if num_nodes is None:
        num_nodes = largest
    elif largest > num_nodes:
        raise ValueError(
            f"num_nodes={num_nodes} is smaller than the largest graph ({largest} atoms)")

    n_graphs = len(atom_features)
    atom_features_padded = np.zeros((n_graphs, num_nodes, 1), dtype=np.float32)
    adj_matrices = np.zeros((n_graphs, num_nodes, num_nodes), dtype=np.float32)

    for k, (features, edges) in enumerate(zip(atom_features, edge_indices)):
        atom_features_padded[k, :features.shape[0], :] = features
        # reshape also accepts the flat empty array of a bond-less molecule
        edges = np.asarray(edges, dtype=np.int64).reshape(-1, 2)
        if edges.size:
            adj_matrices[k, edges[:, 0], edges[:, 1]] = 1.0

    labels = np.array(labels, dtype=np.float32)

    return atom_features_padded, adj_matrices, labels

# Pad the training and test graphs together so both splits share one node
# count. Padding them separately gives each split its own maximum size, and
# the test tensors then no longer match the input shape the model was built for.
num_train = len(train_data)
all_atom_features, all_adj_matrices, all_labels = prepare_data(
    list(train_data) + list(test_data),
    list(train_labels) + list(test_labels),
)
train_atom_features = all_atom_features[:num_train]
train_adj_matrices = all_adj_matrices[:num_train]
train_labels = all_labels[:num_train]
test_atom_features = all_atom_features[num_train:]
test_adj_matrices = all_adj_matrices[num_train:]
test_labels = all_labels[num_train:]

# Create and train the model
model = create_gnn_model(train_atom_features.shape)
batch_size = 32
epochs = 100

model.fit([train_atom_features, train_adj_matrices], train_labels, batch_size=batch_size, epochs=epochs, verbose=1)
test_loss = model.evaluate([test_atom_features, test_adj_matrices], test_labels, verbose=0)
print(f'Test Loss: {test_loss:.4f}')

# Save the trained model
model.save('gnn_model_tf.h5')
