In [2]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
from pathlib import Path
import pandas as pd
import numpy as np
import random
import torch
from sklearn.model_selection import StratifiedKFold
from rdkit import Chem

from library.cVAE_helper import (
    get_vocab,
    train_model,
    test_model,
    loss_function,
    make_stratified_bins,
)

from library.GCN import ConvolutionLayer, PoolingLayer, GraphData, collate_graph_dataset, Standardizer, Graph
from library.cVAE import GCN_Encoder, GRU_Decoder, cVAE
from torch.utils.data import DataLoader,SubsetRandomSampler, Subset

In [3]:
# Load QM9 SMILES
df_qm9 = pd.read_pickle('../data/RDKit/rdkit_only_valid_smiles_qm9.pkl')
smiles_list = df_qm9["SMILES"].to_list()

vocab_list, vocab_size = get_vocab(smiles_list)

# Create token2index mapping and its inverse
token2idx = {tok: idx for idx, tok in enumerate(vocab_list)}
idx2token = {idx: tok for tok, idx in token2idx.items()}

print("Vocabulary size:", vocab_size)
print("Example tokens:", vocab_list)

Vocabulary size: 24
Example tokens: ['<PAD>', '<END>', '<STR>', '#', '(', ')', '+', '-', '/', '1', '2', '3', '4', '5', '=', '@', 'C', 'F', 'H', 'N', 'O', '[', '\\', ']']


In [4]:
#### Inputs
n_epochs = 15
batch_size = 1000
train_size = 0.7
learning_rate = 0.01
device = "cpu"

# GCN
max_atoms = 30 # fixed value
node_vec_len = 16 # fixed value
n_hidden = 32
n_conv_layers = 2
n_hidden_layers = 2

# GRU
latent_dim = 16
# latent_dim = 24
gru_dim = 16
embedding_dim = 12
n_gru_layers = 2 # stacked GRUs
n_fc_layers = 3 # 3 dense layers after GRU

# cVAE
gcn_hidden_nodes = n_hidden + 1
teacher_forcing_ratio = 0.5

In [5]:
#### Start by creating dataset
main_path = Path.cwd().parents[0]
data_path = main_path / "data" / "RDKit" / "rdkit_only_valid_smiles_qm9.pkl"
dataset = GraphData(dataset_path=data_path, max_atoms=max_atoms, node_vec_len=node_vec_len)

gaps = dataset.outputs

#### Split data into training and test sets
# Get train and test sizes
dataset_indices = np.arange(0, len(dataset), 1)
train_size = int(np.round(train_size * len(dataset)))
test_size = len(dataset) - train_size

# Randomly sample train and test indices
train_indices = np.random.choice(dataset_indices, size=train_size, 
                                                            replace=False)
test_indices = np.array(list(set(dataset_indices) - set(train_indices)))

# Create dataoaders
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = DataLoader(dataset, batch_size=batch_size, 
                          sampler=train_sampler, 
                          collate_fn=collate_graph_dataset)
test_loader = DataLoader(dataset, batch_size=batch_size, 
                         sampler=test_sampler,
                         collate_fn=collate_graph_dataset)

In [6]:
# for dataset in train_loader:
#     print(*dataset[1])
#     print(dir(dataset))
#     break

In [7]:
# dir(train_loader)

In [8]:
encoder = GCN_Encoder(
    node_vec_len=node_vec_len,
    node_fea_len=n_hidden,
    hidden_fea_len=n_hidden,
    n_conv=n_conv_layers,
    n_hidden=n_hidden_layers,
    n_outputs=1,
    p_dropout=0.1
)

decoder = GRU_Decoder(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    latent_dim=latent_dim,
    hidden_size=gru_dim,
    n_gru_layers=n_gru_layers,
    n_fc_layers=n_fc_layers
).to(device)

model = cVAE(
    encoder=encoder,
    decoder=decoder,
    device=device,
    n_gcn_hidden_dim=gcn_hidden_nodes,
    n_gru_hidden_dim=gru_dim,
    latent_dim=latent_dim,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    teacher_forcing_ratio=teacher_forcing_ratio
).to(device)

In [9]:
# Standardizer
# smiles = [dataset[i][2] for i in range(len(dataset))]

# standardizer = Standardizer(torch.Tensor(outputs))

In [10]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
use_GPU = False

#### Train the model
train_loss = []
train_accuracy = []
epoch = []
# for i in range(n_epochs):
#     epoch_loss, epoch_accuarcy = train_model(
#         i,
#         model,
#         train_loader,
#         optimizer,
#         loss_function,
#         use_GPU,
#         'cpu',
#         max_atoms,
#         node_vec_len,
#         token2idx
#     )
#     train_loss.append(epoch_loss)
#     epoch.append(i)
#     train_accuracy.append(epoch_accuarcy)

In [12]:
# test_loss, test_accuracy = test_model(model, test_loader, loss_function, use_GPU, 'cpu', max_atoms, node_vec_len, token2idx)

In [13]:
#### Print final results
# print(f"Training Loss: {train_loss[-1]:.2f}")
# print(f"Training accuracy: {train_accuracy[-1]:.2f}")
# print(f"Test Loss: {test_loss:.2f}")
# print(f"Test accuracy: {test_accuracy:.2f}")

Need to add:
- a form of accuracy
- CV for hyperparameter tuning

In [14]:
outer_binned_gaps = make_stratified_bins(gaps)

n_outer_folds = 2
outer_cv = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=42)

for fold, (outer_train_idx, outer_test_idx) in enumerate(outer_cv.split(dataset_indices, outer_binned_gaps)):
    print(f"\n=== OUTER FOLD {fold+1}/{n_outer_folds} ===")

    outer_train_indices = np.array(dataset_indices)[outer_train_idx].tolist()
    outer_test_indices = np.array(dataset_indices)[outer_test_idx].tolist()
    outer_train_dataset = Subset(dataset, outer_train_indices)

    # Final train on full outer_train
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(n_epochs):
        train_model(
        epoch,
        model,
        train_loader,
        optimizer,
        loss_function,
        use_GPU,
        'cpu',
        max_atoms,
        node_vec_len,
        token2idx
        )
    
    # Outer eval
    print("  Evaluating on Outer Test...")
    test_loss, test_acc = test_model(model, test_loader, loss_function, use_GPU, 'cpu', max_atoms, node_vec_len, token2idx)
    
    print(f"  Outer Test Loss: {test_loss:.4f}, Acc: {test_acc:.4f}")
    # print(test_loss, test_acc)


=== OUTER FOLD 1/2 ===
Epoch: [0]	Training Loss: [2.57]	Reconstruction accuracy: [0.17745903335914623]
Epoch: [1]	Training Loss: [1.75]	Reconstruction accuracy: [0.4373643728714878]
Epoch: [2]	Training Loss: [1.33]	Reconstruction accuracy: [0.618074150235827]
Epoch: [3]	Training Loss: [1.14]	Reconstruction accuracy: [0.6691896464196659]
Epoch: [4]	Training Loss: [1.07]	Reconstruction accuracy: [0.6893828753991657]
Epoch: [5]	Training Loss: [1.04]	Reconstruction accuracy: [0.6939915320256226]
Epoch: [6]	Training Loss: [1.04]	Reconstruction accuracy: [0.6947332921957228]
Epoch: [7]	Training Loss: [1.02]	Reconstruction accuracy: [0.6990567183808207]
Epoch: [8]	Training Loss: [1.01]	Reconstruction accuracy: [0.699027218978969]
Epoch: [9]	Training Loss: [0.99]	Reconstruction accuracy: [0.7066509924202322]
Epoch: [10]	Training Loss: [0.99]	Reconstruction accuracy: [0.7048997028717651]
Epoch: [11]	Training Loss: [0.96]	Reconstruction accuracy: [0.7123092992336275]
Epoch: [12]	Training Loss: 

In [15]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
encoder.init_transform.weight 	 torch.Size([32, 16])
encoder.init_transform.bias 	 torch.Size([32])
encoder.conv_layers.0.conv_linear.weight 	 torch.Size([32, 32])
encoder.conv_layers.0.conv_linear.bias 	 torch.Size([32])
encoder.conv_layers.1.conv_linear.weight 	 torch.Size([32, 32])
encoder.conv_layers.1.conv_linear.bias 	 torch.Size([32])
encoder.pooled_to_hidden.weight 	 torch.Size([32, 32])
encoder.pooled_to_hidden.bias 	 torch.Size([32])
encoder.hidden_layer.weight 	 torch.Size([32, 32])
encoder.hidden_layer.bias 	 torch.Size([32])
encoder.hidden_layers.0.weight 	 torch.Size([32, 32])
encoder.hidden_layers.0.bias 	 torch.Size([32])
encoder.hidden_to_output.weight 	 torch.Size([1, 32])
encoder.hidden_to_output.bias 	 torch.Size([1])
decoder.embedding.weight 	 torch.Size([24, 12])
decoder.gru.weight_ih_l0 	 torch.Size([48, 29])
decoder.gru.weight_hh_l0 	 torch.Size([48, 16])
decoder.gru.bias_ih_l0 	 torch.Size([48])
decoder.gru.bias_hh_l0 	 torch.Size([48])
deco

In [16]:
PATH = Path.cwd().parents[0] / 'data' / 'cvae_model' / 'model_10_12.pt'
torch.save(model.state_dict(), PATH)

In [None]:
# model.load_state_dict(torch.load(PATH, weights_only=True))