Pytorch implementation of SMILES2VEC code

Read in the data of SMILES strings with their properties

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/IGC50.csv')

Split the data into training and testing

In [2]:
X_train_smiles = np.array(list(data["smiles"][data["split"]==1]))
X_test_smiles = np.array(list(data["smiles"][data["split"]==0]))


assay = "Activity"  
Y_train = data[assay][data["split"]==1].values.reshape(-1,1)
Y_test = data[assay][data["split"]==0].values.reshape(-1,1)

print(X_train_smiles.shape,Y_train.shape)
print(X_test_smiles.shape,Y_test.shape)

(1434,) (1434, 1)
(358,) (358, 1)


Get the smiles vocabulary from the SMILES dataset
    charset --> the set of characters in the SMILES
    char_to_int --> dictionary that maps characters to integers
    int_to_char --> dictionary that maps integers back to characters
    embed --> the length of the longest SMILES string in the dataset + 5

In [3]:
charset = set("".join(list(data.smiles))+"!E")

char_to_int = dict((c,i) for i,c in enumerate(charset))
int_to_char = dict((i,c) for i,c in enumerate(charset))
embed = max([len(smile) for smile in data.smiles]) + 50
print (str(charset))
print(len(charset), embed)
char_to_int 


{'+', '=', '/', '\\', '2', '1', 'l', '4', 'N', 'S', 'r', 'B', 'E', '[', 'C', ')', 'I', '#', 'F', '!', 'P', '3', 'O', '-', ']', 'H', '('}
27 102


{'+': 0,
 '=': 1,
 '/': 2,
 '\\': 3,
 '2': 4,
 '1': 5,
 'l': 6,
 '4': 7,
 'N': 8,
 'S': 9,
 'r': 10,
 'B': 11,
 'E': 12,
 '[': 13,
 'C': 14,
 ')': 15,
 'I': 16,
 '#': 17,
 'F': 18,
 '!': 19,
 'P': 20,
 '3': 21,
 'O': 22,
 '-': 23,
 ']': 24,
 'H': 25,
 '(': 26}

Vectorize the SMILES strings in one-hot embeddings of size of the
embedding (len largest SMILES string + 5) and charset (number of unique characters in the SMILES data). One-hot encoding will just light up the character that each part of the SMILES corresponds to (56x27), for example the 3rd character might be a '[' which means that the third row will have a "1" in the 27th column... etc 

In [4]:
def vectorize(smiles):
        one_hot =  np.zeros((smiles.shape[0], embed , len(charset)),dtype=np.int8)
        for i,smile in enumerate(smiles):
            #encode the startchar
            one_hot[i,0,char_to_int["!"]] = 1
            #encode the rest of the chars
            for j,c in enumerate(smile):
                one_hot[i,j+1,char_to_int[c]] = 1
            #Encode endchar
            one_hot[i,len(smile)+1:,char_to_int["E"]] = 1
        #Return two, one for input and the other for output
        return one_hot[:,0:-1,:], one_hot[:,1:,:]

X_train, _ = vectorize(X_train_smiles)
X_test, _ = vectorize(X_test_smiles)



print(X_train.shape)

import torch
# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train).long()  # Assuming X_train is integer-based data
Y_train_tensor = torch.from_numpy(Y_train).float()  # Assuming Y_train is float-based data

print(Y_train.shape)

X_test_tensor = torch.from_numpy(X_test).long()  # Assuming X_test is integer-based data
Y_test_tensor = torch.from_numpy(Y_test).float()  # Assuming Y_test is float-based data


(1434, 101, 27)
(1434, 1)


Conversion of one-hot embeddings back to SMILES

In [5]:
mol_str_train=[]
mol_str_test=[]
for x in range(1434):
 
    mol_str_train.append("".join([int_to_char[idx] for idx in np.argmax(X_train[x,:,:], axis=1)]))
    
    
for x in range(358):
    mol_str_test.append("".join([int_to_char[idx] for idx in np.argmax(X_test[x,:,:], axis=1)]))
vocab_size=len(charset)

print(vocab_size)

27


Pytorch model

In [6]:
import torch.nn as nn

class smiles2vec(nn.Module):
    def __init__(self, vocab_size, embed):
        super(smiles2vec, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, 50, padding_idx=0)
        self.gru = nn.GRU(input_size=50, hidden_size=50, num_layers=1, batch_first=True)
        
        self.conv1 = nn.Conv1d(50, 192, kernel_size=10)
        self.bn1 = nn.BatchNorm1d(192)
        self.conv2 = nn.Conv1d(192, 192, kernel_size=5)
        self.conv3 = nn.Conv1d(192, 192, kernel_size=3)
        
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16512, 100)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.fc2 = nn.Linear(100, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x_emb = x
        x = x.permute(0, 2, 1)  # Conv1D in PyTorch expects channels as the second dimension
        x = self.conv1(x)
        x_conv1 = x
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.relu(x)
        x_final = x
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        
        return x, x_emb, x_conv1, x_final


The definition of R^2, coefficient of determination

In [7]:
import torch

def coeff_determination(y_true, y_pred):
    SS_res = torch.sum((y_true - y_pred)**2)
    SS_tot = torch.sum((y_true - torch.mean(y_true))**2)
    epsilon = torch.finfo(y_true.dtype).eps  # Small constant to avoid division by zero
    return 1 - SS_res / (SS_tot + epsilon)


Fit this model

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Assuming you've defined your PyTorch model class 'CustomModel' previously

# Define your PyTorch model
model = smiles2vec(vocab_size, embed)  # Assuming 'vocab_size' and 'embed' are defined

# Define your optimizer and learning rate
optimizer = Adam(model.parameters(), lr=0.01)

# Define your learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-15, verbose=True)

# Define your loss function
loss_function = nn.MSELoss()

# Assuming X_train, Y_train, X_test, Y_test are PyTorch tensors or converted to PyTorch tensors

# Create DataLoader for training and validation data
train_dataset = TensorDataset(torch.argmax(X_train_tensor, dim=2), Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=1434, shuffle=False)

val_dataset = TensorDataset(torch.argmax(X_test_tensor, dim=2), Y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=358)

# Training loop
trainloss_profile = []  # For logging
valloss_profile = []
num_epochs = 150

for epoch in range(num_epochs):

    model.train()
    train_loss = 0.0
    
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs, x_emb_train, x_conv1_train, x_final_train = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
#        train_loss += loss.item()
        
    
    train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs, x_emb_val, x_conv1_val, x_final_val = model(inputs)
            loss = loss_function(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
#            val_loss += loss.item()
    
    val_loss /= len(val_loader.dataset)
    
    print('epoch',epoch)
    print('train_loss',train_loss)
    print('val_loss',val_loss)

    # Learning rate scheduler step
    scheduler.step(val_loss)
    
    # Logging
    trainloss_profile.append([epoch,train_loss])
    valloss_profile.append([epoch,val_loss])
    
    # Checkpointing
#    if val_loss < best_val_loss:  # Save best model based on validation loss
#        best_val_loss = val_loss
#        torch.save(model.state_dict(), 'weights_best.pth')


epoch 0
train_loss 12.203672409057617
val_loss 8687454.0
epoch 1
train_loss 1277201.375
val_loss 122.4376449584961
epoch 2
train_loss 23.079914093017578
val_loss 2.278876781463623
epoch 3
train_loss 2.4289116859436035
val_loss 1.055906057357788
epoch 4
train_loss 1.4542559385299683
val_loss 1.6048904657363892
epoch 5
train_loss 2.3328697681427
val_loss 1.166975975036621
epoch 6
train_loss 1.529219627380371
val_loss 15.166770935058594
epoch 7
train_loss 15.002808570861816
val_loss 1.945725917816162
epoch 8
train_loss 3.1420912742614746
val_loss 0.9938417673110962
epoch 9
train_loss 1.5830273628234863
val_loss 0.965554416179657
epoch 10
train_loss 1.6207494735717773
val_loss 0.9661445617675781
epoch 11
train_loss 1.744321584701538
val_loss 0.9688542485237122
epoch 12
train_loss 1.7924036979675293
val_loss 1.0060853958129883
epoch 13
train_loss 1.7762119770050049
val_loss 1.0736949443817139
epoch 14
train_loss 1.7937206029891968
val_loss 1.1149448156356812
epoch 15
train_loss 1.8930420875

KeyboardInterrupt: 

In [None]:
len(train_loader.dataset)

Stack training and validation "embeddings" and save them

In [9]:
print(x_conv1_val[0].shape)

torch.Size([192, 92])


In [10]:
x_conv1_all = np.vstack((x_conv1_train.detach().numpy(),x_conv1_val.detach().numpy()))

#np.savetxt('x_finalembs_trainval.csv',delimiter=',')

In [12]:

X_inp_all = np.vstack((torch.argmax(X_train_tensor, dim=2),torch.argmax(X_test_tensor, dim=2)))

print(X_inp_all.shape)
print(x_conv1_all.shape)

(1792, 101)
(1792, 192, 92)


Extract "embedding"

In [38]:
#run through x_inp, find target vocabulary elements, find their xemb using integer,
#and the x_conv1, x_conv2... etc
#X_inp = torch.argmax(X_test_tensor, dim=2)

#14 corresponds to "O"
target_vocab = 19

#print(x_emb.shape)

save_emb = []
for each_molecule in range(1792):
    for each_character in range(X_inp_all.shape[1]): 

        if X_inp_all[each_molecule][each_character] == target_vocab:
            
            #get the embedding of the vocabulary element,
            #Check that this is a constant vector
#            embedding = x_emb_train[each_molecule][each_character]

            #get the convolutional operation on the character, 
            #for every molecule the vocab element is encountered
#            conv1 = x_conv1[each_molecule,:,each_character]
#            print(conv1)

            #get the convolutional operation on the character, 
            #for every molecule the vocab element is encountered
            final = x_conv1_all[each_molecule,:,each_character]
            print(final.shape)

            finalindexed = np.concatenate((final,[each_molecule]))

            save_emb.append(finalindexed)

            

(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)
(192,)

In [40]:
np.shape(save_emb)

(2589, 193)

In [39]:
np.savetxt('conv1embs.csv',save_emb,delimiter=',')