Pytorch implementation of SMILES2VEC code

Define some tools that will be used 

In [1]:
import pandas as pd
import numpy as np
import torch
'''
This code defines and trains the SMILES2Vec on a dataset that has 
1) SMILES 2) molecule property 3) train/test split value in columns of a csv file

First we will define some important tools that are used by the MAIN code

Variables:

            vectorize           - A function that converts SMILES character set to a one-hot embedding
            one_hot             - this is a tensor that is #data x #embed_size x #characters,
                                  each SMILES string will have an associated embedding of size (max(smiles) + 50)
                                  each embedding will have a vector of size #characters, where one of them will be
                                  "1" and the others "0" depending on which character is in the SMILES. The first
                                  character is always "!" which does have an integer associated which is the "1" in 
                                  first embedding vector. All the final characters of the string are "E" which also
                                  have an integer associated with them and will have embeddings with an element as "1".
                                  The rest of the characters are all in between, each with their own embedding, and 
                                  and an element of "1" lighting up for the integer associated with that character
            R2_calc             - a tool that calculates the R^2 between predicted and true values

'''
#Vectorize the SMILES strings in one-hot embeddings of size of the
#embedding (len largest SMILES string + 5) and charset (number of unique characters in the SMILES data). 
#One-hot encoding will just light up the character that each part of the SMILES corresponds to (56x27),
#for example the 3rd character might be a '[' which means that the third row will have a "1" in the 27th column
#... etc 
def vectorize(smiles,max_stringsize,charset,char_to_int):
        
        #initialize the vectorized data as the #data, embed_size, len(charset)
        one_hot =  np.zeros((smiles.shape[0], max_stringsize , len(charset)),dtype=np.int8)

        #for each molecule and associated smiles string in data 
        for each_molecule, smile in enumerate(smiles):
            #encode the startchar, the first character in all smiles is "!" and 
            #this will take one position in the embedding vector for this character
            one_hot[each_molecule,0,char_to_int["!"]] = 1
            #encode the rest of the chars, depending on what their integer associated with the character
            for j,c in enumerate(smile):
                one_hot[each_molecule,j+1,char_to_int[c]] = 1
            #Encode endchar
            one_hot[each_molecule,len(smile)+1:,char_to_int["E"]] = 1
        return one_hot

def r2_calc(y_true, y_pred):
    SS_res = torch.sum((y_true - y_pred)**2)
    SS_tot = torch.sum((y_true - torch.mean(y_true))**2)
    epsilon = torch.finfo(y_true.dtype).eps  # Small constant to avoid division by zero
    return 1 - SS_res / (SS_tot + epsilon)

Loading and preprocessing data

In [4]:

'''
This code defines and trains the SMILES2Vec on a dataset that has 
1) SMILES 2) molecule property 3) train/test split value in columns of a csv file

Variables: 

        data             - loaded from a csv file that contains the properties mentioned above
        X_train_smiles   - the SMILES train set extracted using the split column
        X_test_smiles    - the SMILES test set extracted using the split column
        property         - the target property for training
        Y_train          - the train targets extracted using the split column
        Y_test           - the test targets extracted using the split column
        charset          - the set of unique SMILES characters in the data, extracted by using "set" on 
                           list joining all SMILES, "".join(list(data.smiles))
        char_to_int      - dictionary that maps char to integers, dict((c,i) for i,c in enumerate(charset))
        int_to_char      - dictionary that maps integers to chars, dict((i,c) for i,c in enumerate(charset))
        embed _size      - the size of the embedding which is taken to be the size of the LARGEST SMILES + 50
        X_train_embed    - the embedded training SMILES retrieved using the vectorize tool  
        X_test_embed     - the embedded testing SMILES retrieved using the vectorize tool
        X_train_tensor   - the tensor verion of X_train_embed
        X_test_tensor    - the tensor version of X_test_embed
        mol_str_train    - the SMILES train set which is now embedded and has a starting "!" and
                           "E" for all the ending charachters
        mol_str_test     - the SMILES test set which is now embedded and has a starting "!" and
                           "E" for all the ending charachters
'''

#LOAD the SMILESvsProperties data
data = pd.read_csv('data/IGC50.csv')

#Split the data into training and testing
X_train_smiles = np.array(list(data["smiles"][data["split"]==1]))
X_test_smiles = np.array(list(data["smiles"][data["split"]==0]))


property = "Activity"  
Y_train = data[property][data["split"]==1].values.reshape(-1,1)
Y_test = data[property][data["split"]==0].values.reshape(-1,1)

#Get the smiles vocabulary from the SMILES dataset
# charset --> the set of characters in the SMILES
# char_to_int --> dictionary that maps characters to integers
# int_to_char --> dictionary that maps integers back to characters
# embed --> the length of the longest SMILES string in the dataset + 5
charset = set("".join(list(data.smiles))+"!E")
char_to_int = dict((c,i) for i,c in enumerate(charset))
int_to_char = dict((i,c) for i,c in enumerate(charset))
max_stringsize = max([len(smile) for smile in data.smiles]) + 50

X_train_embed = vectorize(X_train_smiles,max_stringsize,charset,char_to_int)
X_test_embed = vectorize(X_test_smiles,max_stringsize,charset,char_to_int)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train_embed).long()  # Assuming X_train is integer-based data
Y_train_tensor = torch.from_numpy(Y_train).float()  # Assuming Y_train is float-based data

X_test_tensor = torch.from_numpy(X_test_embed).long()  # Assuming X_test is integer-based data
Y_test_tensor = torch.from_numpy(Y_test).float()  # Assuming Y_test is float-based data

#CONVERT embeddings (hot encodings) back to SMILES
mol_str_train=[]
mol_str_test=[]
for x in range(X_train_embed.shape[0]):
    mol_str_train.append("".join([int_to_char[idx] for idx in np.argmax(X_train_embed[x,:,:], axis=1)]))
    
for x in range(len(charset)):
    mol_str_test.append("".join([int_to_char[idx] for idx in np.argmax(X_test_embed[x,:,:], axis=1)]))
vocab_size=len(charset)


print(X_train_tensor.shape)
print(Y_train_tensor.shape)
print(X_test_tensor.shape)
print(Y_test_tensor.shape)

torch.Size([1434, 102, 27])
torch.Size([1434, 1])
torch.Size([358, 102, 27])
torch.Size([358, 1])


PyTorch Model

In [5]:
import torch.nn as nn

class smiles2vec(nn.Module):
    def __init__(self,vocab_size,embed_size, hidden_size1, hidden_size2):
        super(smiles2vec, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru1 = nn.GRU(input_size=embed_size, hidden_size=hidden_size1//2, num_layers=2, batch_first=True, dtype=torch.float32, bidirectional=True)
        self.gru2 = nn.GRU(input_size=hidden_size1, hidden_size=hidden_size2//2, num_layers=2, batch_first=True, dtype=torch.float32, bidirectional=True)
        self.fc = nn.Linear(hidden_size2, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        h1 = torch.zeros(4, x.size(0), hidden_size1//2).to(x.device)
        x, _ = self.gru1(x, h1)
        h2 = torch.zeros(4, x.size(0), hidden_size2//2).to(x.device)
        x, _ = self.gru2(x, h2)
        x = self.fc(x[:, -1, :])
        return x
    
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

#Define hyperparameters
vocab_size = vocab_size
embed_size = 50
hidden_size1 = 112*2
hidden_size2 = 192*2

# Define your PyTorch model
model = smiles2vec(vocab_size,embed_size,hidden_size1,hidden_size2)  # Assuming 'vocab_size' and 'embed' are defined

# Define your optimizer and learning rate
optimizer = Adam(model.parameters(), lr=0.0001)

# Define your learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-15, verbose=True)

# Define your loss function
loss_function = nn.MSELoss()

# Assuming X_train, Y_train, X_test, Y_test are PyTorch tensors or converted to PyTorch tensors

# Create DataLoader for training and validation data
train_dataset = TensorDataset(torch.argmax(X_train_tensor, dim=2), Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=False)

val_dataset = TensorDataset(torch.argmax(X_test_tensor, dim=2), Y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=20)

# Training loop
trainloss_profile = []  # For logging
valloss_profile = []
num_epochs = 100

for epoch in range(num_epochs):

    model.train()
    train_loss = 0.0
    
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
#        train_loss += loss.item()
        
    
    print(len(train_loader.dataset))
    train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
#            val_loss += loss.item()
    
    val_loss /= len(val_loader.dataset)
    
    print('epoch',epoch)
    print('train_loss',train_loss)
    print('val_loss',val_loss)

    # Learning rate scheduler step
    scheduler.step(val_loss)
    
    # Logging
    trainloss_profile.append([epoch,train_loss])
    valloss_profile.append([epoch,val_loss])
    
    # Checkpointing
#    if val_loss < best_val_loss:  # Save best model based on validation loss
#        best_val_loss = val_loss
#        torch.save(model.state_dict(), 'weights_best.pth')


1434
epoch 0
train_loss 7.086065298343802
val_loss 2.8373842492449883
1434
epoch 1
train_loss 1.353079978392214
val_loss 1.3359108397414565
1434
epoch 2
train_loss 1.4200400503110686
val_loss 0.9782402222382955
1434
epoch 3
train_loss 1.174914565711507
val_loss 0.9735858853302854
1434
epoch 4
train_loss 1.1347138918260817
val_loss 0.9670308195678882
1434
epoch 5
train_loss 1.1658954773321644
val_loss 0.9661069499047775
1434
epoch 6
train_loss 1.1492688690768127
val_loss 0.9646739310392455
1434
epoch 7
train_loss 1.155073072454753
val_loss 0.964707285665267
1434
epoch 8
train_loss 1.1556867786697431
val_loss 0.9647281246478331
1434
epoch 9
train_loss 1.1555895173566277
val_loss 0.9646690351337028
1434
epoch 10
train_loss 1.1570163090691241
val_loss 0.9646746702700354
1434
epoch 11
train_loss 1.1572257172279943
val_loss 0.9646586486081171
1434
epoch 12
train_loss 1.1579608336984695
val_loss 0.9646552235054571
Epoch 00013: reducing learning rate of group 0 to 5.0000e-05.
1434
epoch 13
tra