Article from Philippe:
https://practicalcheminformatics.blogspot.com/2023/06/getting-real-with-molecular-property.html


In [5]:
# for running in colab:

# !pip install rdkit
# !pip install deepchem


# !wget https://raw.githubusercontent.com/NinaB99/AI-for-Chemistry/main/Data/ADME_public_set_3521.csv
# !wget https://raw.githubusercontent.com/NinaB99/AI-for-Chemistry/main/Data/11095_2013_1222_MOESM2_ESM.csv

# load biogen data
# import pandas as pd

# biogen_data=pd.read_csv("ADME_public_set_3521.csv")
# print(biogen_data.columns)

# #load bioavailability data
# bio_avail_data = pd.read_csv("11095_2013_1222_MOESM2_ESM.csv",sep=";")
# print(bio_avail_data.columns)

In [6]:
# Load the two source tables used in this notebook
import pandas as pd

# Biogen ADME set (read with pandas' default "," separator)
biogen_data = pd.read_csv("../Data/ADME_public_set_3521.csv")

# Bioavailability set; this file uses ";" as its column separator
bio_avail_data = pd.read_csv("../Data/11095_2013_1222_MOESM2_ESM.csv", sep=";")

# Show the column labels of each table, one Index per line
print(biogen_data.columns, bio_avail_data.columns, sep="\n")



Index(['Internal ID', 'Vendor ID', 'SMILES', 'CollectionName',
       'LOG HLM_CLint (mL/min/kg)', 'LOG MDR1-MDCK ER (B-A/A-B)',
       'LOG SOLUBILITY PH 6.8 (ug/mL)',
       'LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound)',
       'LOG PLASMA PROTEIN BINDING (RAT) (% unbound)',
       'LOG RLM_CLint (mL/min/kg)'],
      dtype='object')
Index(['No', 'Name', 'Updated SMILES', '%F', 'logK(%F)', 'Category', 'Source'], dtype='object')


Use a feed-forward neural network built with PyTorch.
Train it on solubility from dataset 1 (the Biogen ADME set) first,
then optimize it for bioavailability.

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from deepchem.feat import RDKitDescriptors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler


#### DATA PREPARATION ####

# Function to generate features from SMILES strings using RDKit descriptors
def generate_features(smiles_list):
    """Featurize SMILES strings with DeepChem's ``RDKitDescriptors``.

    Parameters
    ----------
    smiles_list : sequence of str
        SMILES strings, one per molecule (the notebook passes a pandas Series).

    Returns
    -------
    numpy.ndarray
        The array returned by ``featurizer.featurize`` with every descriptor
        column removed that holds a non-finite entry (NaN or +/-inf) for at
        least one molecule.

    Notes
    -----
    Which columns survive depends on the molecules passed in, so calls with
    different inputs can return different numbers of columns.
    """
    featurizer = RDKitDescriptors()
    features = featurizer.featurize(smiles_list)
    # Drop features containing invalid values. np.isfinite rejects +/-inf as
    # well as NaN, so infinite descriptor values no longer slip through to the
    # scaler and the network.
    usable_columns = np.isfinite(features).all(axis=0)
    return features[:, usable_columns]

# Keep only the compounds that have a measured solubility value
biogen_data = biogen_data.dropna(subset=['LOG SOLUBILITY PH 6.8 (ug/mL)'])

# Regression target y: the solubility column
y_data = biogen_data["LOG SOLUBILITY PH 6.8 (ug/mL)"]

print(len(y_data))

# Model input X: molecular descriptors computed from each compound's SMILES
smiles = biogen_data["SMILES"]
X_data = generate_features(smiles)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Targets: pandas Series -> float32 NumPy arrays
y_train = y_train.to_numpy(dtype=np.float32)
y_test = y_test.to_numpy(dtype=np.float32)

# Standardize the descriptors; the scaler statistics come from the training
# rows only and are then reused for the test rows
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build float32 PyTorch tensors; targets are reshaped to a single column so
# they line up with the network's (n_samples, 1) output
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)


2173


In [10]:
#### CREATING MODEL ####

#haven't tried it with the newest settings

# Define the neural network architecture
class SolubilityPredictor(nn.Module):
    """Feed-forward regressor with a single hidden layer.

    The input goes through ``fc1`` (linear, input_dim -> hidden_dim), a ReLU,
    and ``fc2`` (linear, hidden_dim -> output_dim). No activation is applied
    to the output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers and the activation between them."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))`` for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Hyperparameters
input_dim = X_data.shape[1]  # one network input per molecular descriptor column
hidden_dim, output_dim = 128, 1  # hidden width changed from 64 to 128, try it out
learning_rate = 0.002  # changed from 0.001, try it out
num_epochs, batch_size = 400, 32

# Mini-batch iterator over the training tensors (shuffled)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Network, mean-squared-error loss, and Adam optimizer over the network weights
model = SolubilityPredictor(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

##### Grid-search for best hyperparameters #####
# Tunes a scikit-learn MLPRegressor with 5-fold cross-validation on the
# training split. This is a separate estimator from the PyTorch `model`
# defined above.

# Define your hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (256,)],  # Number of neurons in the hidden layer(s)
    'activation': ['relu', 'tanh'],  # Activation function
    'solver': ['adam', 'sgd'],  # Optimization algorithm
    'learning_rate': ['constant', 'adaptive'],  # Learning rate schedule (scikit-learn only uses it with solver='sgd')
}

# Create an MLPRegressor object. random_state pins weight initialisation and
# batch sampling so repeated runs of the search are comparable.
mlp = MLPRegressor(max_iter=num_epochs, batch_size=batch_size, random_state=42)

# Perform grid search (3 * 2 * 2 * 2 = 24 candidates, 5 folds each)
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Keep the tuned scikit-learn estimator under its own name. Binding it to
# `model` would replace the PyTorch SolubilityPredictor that `optimizer` was
# built from and that the training/evaluation cells invoke as `model(...)`;
# an MLPRegressor cannot be called that way.
best_mlp = grid_search.best_estimator_

# Evaluate the best estimator on the test set (score() returns R^2)
test_score = best_mlp.score(X_test, y_test)
print("Test Score (R^2):", test_score)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




: 

: 

In [None]:

# Training loop: one optimizer step per mini-batch, num_epochs passes over the data
def _train_one_epoch(net, loader, loss_fn, opt):
    """Run one pass over ``loader`` and return the sum of the batch losses."""
    total = 0.0
    for batch_inputs, batch_labels in loader:
        opt.zero_grad()  # clear gradients left over from the previous step
        batch_loss = loss_fn(net(batch_inputs), batch_labels)
        batch_loss.backward()
        opt.step()
        total += batch_loss.item()
    return total


for epoch in range(num_epochs):
    running_loss = _train_one_epoch(model, train_loader, criterion, optimizer)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")


In [17]:

# Evaluation on the held-out tensors; gradient tracking is switched off for the forward pass
with torch.no_grad():
    val_outputs = model(X_test_tensor)
    val_loss = criterion(val_outputs, y_test_tensor)

# Report the loss as a plain Python float
print(f"Validation Loss: {val_loss.item()}")

Validation Loss: 0.42376625537872314
