In [64]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.nn as nn
from pathlib import Path
import pandas as pd
import numpy as np
import random
import torch

from torch.utils.data import DataLoader,SubsetRandomSampler
from notebooks.library.GCN import ConvolutionLayer, PoolingLayer, GraphData, collate_graph_dataset, Standardizer, Graph

In [6]:
# Load QM9 SMILES
df_qm9 = pd.read_pickle('data/RDKit/rdkit_only_valid_smiles_qm9.pkl')
smiles_list = df_qm9["SMILES"].to_list()

In [7]:

# Collect all unique characters
charset = set()
for smi in smiles_list:
    for ch in smi:
        charset.add(ch)

# Sort for consistency
charset = sorted(list(charset))

# Add special tokens
special_tokens = ['<PAD>', '<END>', '<STR>']
vocab_list = special_tokens + charset

# Create token -> index mapping
token2idx = {tok: idx for idx, tok in enumerate(vocab_list)}
idx2token = {idx: tok for tok, idx in token2idx.items()}

print("Vocabulary size:", len(vocab_list))
print("Example tokens:", vocab_list)

Vocabulary size: 24
Example tokens: ['<PAD>', '<END>', '<STR>', '#', '(', ')', '+', '-', '/', '1', '2', '3', '4', '5', '=', '@', 'C', 'F', 'H', 'N', 'O', '[', '\\', ']']


In [8]:
# Settings
batch_size = 4
seq_len = 15                     # sequence length for SMILES
vocab_size = len(vocab_list)     # small example vocab
latent_dim = 16
gru_dim = 32
embedding_dim = 8
n_layers = 1

device = 'cpu'      # or 'cuda' if GPU is available

# Dummy input tokens (batch of sequences)
x = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)

# Dummy target property (HOMO-LUMO gap, for example)
y = torch.rand(batch_size, 1).to(device)

In [9]:
print(x)
print(y)

tensor([[23, 15, 23, 12,  0, 20,  9, 22, 16, 13, 19, 18,  6,  6,  0],
        [15, 14, 19, 15, 22,  1, 23,  1, 22, 18,  3,  9,  3, 18,  6],
        [ 8, 18, 19, 10, 20,  3,  8,  5,  9,  1,  4, 17, 22,  0,  1],
        [ 2, 10, 16, 23,  8, 15, 23,  0, 11, 15,  7, 22, 13,  1, 10]])
tensor([[0.2274],
        [0.2330],
        [0.8268],
        [0.0418]])


In [10]:
torch.ones(5, dtype=torch.long).to(device) * 2

tensor([2, 2, 2, 2, 2])

In [12]:
decoder = GRU_Decoder(
    vocab_size=vocab_size,
    latent_dim=latent_dim,
    gru_size=gru_dim,
    n_layers=n_layers,
    embedding_dim=embedding_dim
).to(device)

encoder = DummyEncoder(gru_dim=gru_dim).to(device)

model = cVAE(
    encoder=encoder,
    decoder=decoder,
    device=device,
    latent_dim=latent_dim,
    gru_dim=gru_dim,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    teacher_forcing_ratio=0.5
).to(device)

In [13]:
def loss_function(model, logits, targets, batch_size, beta=1):
    recon_loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    loss_recon = recon_loss_fn(logits, targets)
    
    kl_loss = -0.5 * torch.sum(1 + model.z_logvar - model.z_mean.pow(2) - model.z_logvar.exp()) / batch_size
    loss = loss_recon + beta * kl_loss

    return loss

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
outputs = model(x, y)  # [batch, seq_len, vocab_size]

# for x, y in ...:

optimizer.zero_grad()

logits = outputs.view(-1, vocab_size)
targets = x.view(-1)

loss = loss_function(model, logits, targets, x.size(0))

loss.backward()
optimizer.step()

print(outputs.argmax(-1))

AttributeError: module 'torch' has no attribute '_utils'

In [None]:
idx = 2

print("".join([idx2token[i] for i in x[idx].numpy()][1:]))
print("".join([idx2token[i] for i in outputs[idx].argmax(1).numpy()][1:]))

////<STR>//<STR>/////<STR>
4<STR>@C<STR>\OH<END>(F+H[


In [None]:
with torch.no_grad():
    outputs = model(x, y)  # [batch, seq_len, vocab_size]
    
print("Output shape:", outputs.shape)

Output shape: torch.Size([4, 15, 24])


In [None]:
# pick the most probable token at each step
pred_tokens = outputs.argmax(-1)
print("Predicted token indices:\n", pred_tokens)

Predicted token indices:
 tensor([[ 2,  3,  3,  3,  3,  3, 13,  3,  3, 13,  3,  3,  3,  3, 13],
        [ 2,  9,  9,  9,  9, 11, 11,  9,  9, 11,  9,  9,  9,  9, 11],
        [ 2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4],
        [ 2, 12,  3,  6,  6,  3, 18,  6,  3, 18,  3,  3,  3, 18,  3]])


In [None]:
"".join([idx2token[i] for i in pred_tokens[3].numpy()][1:])

'4#++#H+#H###H#'

In [None]:
"".join([idx2token[i] for i in x[3].numpy()][1:])

')\\=/12/=<END>][2NO'

How to handle the data:

- Convert data to graph repr
- Take the SMILES as a vector
- Take the target property as a vector

- Combine the target variable with teh graph reprs
- Run through GCN
- Run throught GRU

In [65]:
from torch.utils.data import Dataset

class GraphData(Dataset):
    """
    Class which creates a custom dataset where each datapoint is a molecule/graph with a node matrix, edge matrix, HOMO-LUMO gap and the smiles representation
    """
    def __init__(self, dataset_path: str, node_vec_len: int, max_atoms: int):
        # Save attributes
        self.node_vec_len = node_vec_len
        self.max_atoms = max_atoms

        # Open dataset file
        df = pd.read_pickle(dataset_path)

        # Create lists
        self.indices = df.index.to_list()[:1]
        self.smiles = df["SMILES"].to_list()[:1]
        self.outputs = df["gaps"].to_list()[:1]

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i: int):
        # Get smile
        smile = self.smiles[i]

        # Create MolGraph object using the Graph abstraction
        mol = Graph(smile, self.node_vec_len, self.max_atoms)

        # Get node and adjacency matrices
        node_mat = torch.Tensor(mol.node_mat)
        adj_mat = torch.Tensor(mol.adj_mat)

        # Get output
        output = torch.Tensor([self.outputs[i]])

        return (node_mat, adj_mat), output, smile

In [66]:
#### Fix seeds
np.random.seed(0)
torch.manual_seed(0)
use_GPU = torch.cuda.is_available()

#### Inputs
max_atoms = 50
node_vec_len = 30
train_size = 0.7
batch_size = 1
hidden_nodes = 30
n_conv_layers = 2
n_hidden_layers = 2
learning_rate = 0.01
n_epochs = 10

In [82]:
#### Start by creating dataset
# main_path = Path.cwd().parents[0]
data_path = "data/RDKit/rdkit_only_valid_smiles_qm9.pkl"
dataset = GraphData(dataset_path=data_path, max_atoms=max_atoms, 
                        node_vec_len=node_vec_len)[0]

In [83]:
(node_tensor, adj_tensor), gap, smiles = dataset

In [84]:
model = ChemGCN(node_vec_len=node_vec_len, node_fea_len=hidden_nodes,
                hidden_fea_len=hidden_nodes, n_conv=n_conv_layers, 
                n_hidden=n_hidden_layers, n_outputs=1, p_dropout=0.1)

In [92]:
# Standardizer
outputs = [dataset[1] for i in range(len(dataset))]
standardizer = Standardizer(torch.Tensor(outputs))

# Optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Loss function
loss_fn = torch.nn.MSELoss()

# Unpack data
node_mat = dataset[0][0]
adj_mat = dataset[0][1]
output = dataset[1]
smiles = dataset[-1]

print(output)

# Reshape inputs
first_dim = int((torch.numel(node_mat)) / (max_atoms * node_vec_len))
node_mat = node_mat.reshape(first_dim, max_atoms, node_vec_len)
adj_mat = adj_mat.reshape(first_dim, max_atoms, max_atoms)

# Standardize output
output_std = standardizer.standardize(output)

nn_input = (node_mat, adj_mat, output)
nn_output = output_std

nn_prediction = model(*nn_input)

nn_prediction.shape

tensor([13.7363])


TypeError: ChemGCN.forward() takes 3 positional arguments but 4 were given

In [None]:
def train_model(
    epoch,
    model,
    training_dataloader,
    optimizer,
    loss_fn,
    standardizer,
    use_GPU,
    max_atoms,
    node_vec_len,
):
    """
    Custom function which defines how a model will be trained (per epoch), here the mean-squared loss between prediction and actual value is used as evaluation metric. This function will perform backpropagation which updates the weights of the networks based in this evaluation.
    """
    # Create variables to store losses and error
    avg_loss = 0
    avg_mae = 0
    count = 0

    # Switch model to train mode
    model.train()

    # Go over each batch in the dataloader
    for i, dataset in enumerate(training_dataloader):
        # Unpack data
        node_mat = dataset[0][0]
        adj_mat = dataset[0][1]
        output = dataset[1]

        # Reshape inputs
        first_dim = int((torch.numel(node_mat)) / (max_atoms * node_vec_len))
        node_mat = node_mat.reshape(first_dim, max_atoms, node_vec_len)
        adj_mat = adj_mat.reshape(first_dim, max_atoms, max_atoms)

        # Standardize output
        output_std = standardizer.standardize(output)

        # Package inputs and outputs; check if GPU is enabled
        if use_GPU:
            nn_input = (node_mat.cuda(), adj_mat.cuda())
            nn_output = output_std.cuda()
        else:
            nn_input = (node_mat, adj_mat)
            nn_output = output_std

        # Compute output from network
        nn_prediction = model(*nn_input)

        # Calculate loss
        loss = loss_fn(nn_output, nn_prediction)
        avg_loss += loss

        # Calculate MAE
        prediction = standardizer.restore(nn_prediction.detach().cpu())
        mae = mean_absolute_error(output, prediction)
        avg_mae += mae

        # Set zero gradients for all tensors
        optimizer.zero_grad()

        # Do backward prop
        loss.backward()

        # Update optimizer parameters
        optimizer.step()

        # Increase count
        count += 1

    # Calculate avg loss and MAE
    avg_loss = avg_loss / count
    avg_mae = avg_mae / count

    # Print stats
    print(
        "Epoch: [{0}]\tTraining Loss: [{1:.2f}]\tTraining MAE: [{2:.2f}]"\
           .format(
                    epoch, avg_loss, avg_mae
           )
    )

    # Return loss and MAE
    return avg_loss, avg_mae