In [1]:
import numpy as np
import torch
from library.GCN import *
from pathlib import Path
from torch.utils.data import DataLoader,SubsetRandomSampler

### Chemical accuracy is 0.043 eV, so MAE should be lower than this for the model to be chemically accurate

In [2]:
#### Fix seeds
np.random.seed(0)
torch.manual_seed(0)
use_GPU = torch.cuda.is_available()

#### Inputs
max_atoms = 30 # fixed value
node_vec_len = 16 # fixed value
train_size = 0.7
batch_size = 1000
hidden_nodes = 30
n_conv_layers = 2
n_hidden_layers = 2
learning_rate = 0.01
n_epochs = 10

#### Start by creating dataset
main_path = Path.cwd().parents[0]
data_path = main_path / "data" / "RDKit" / "rdkit_only_valid_smiles_qm9.pkl"
dataset = GraphData(dataset_path=data_path, max_atoms=max_atoms, 
                        node_vec_len=node_vec_len)

#### Split data into training and test sets
# Get train and test sizes
dataset_indices = np.arange(0, len(dataset), 1)
train_size = int(np.round(train_size * len(dataset)))
test_size = len(dataset) - train_size

# Randomly sample train and test indices
train_indices = np.random.choice(dataset_indices, size=train_size, 
                                                            replace=False)
test_indices = np.array(list(set(dataset_indices) - set(train_indices)))

# Create dataoaders
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
train_loader = DataLoader(dataset, batch_size=batch_size, 
                          sampler=train_sampler, 
                          collate_fn=collate_graph_dataset)
test_loader = DataLoader(dataset, batch_size=batch_size, 
                         sampler=test_sampler,
                         collate_fn=collate_graph_dataset)

#### Initialize model, standardizer, optimizer, and loss function
# Model
model = ChemGCN(node_vec_len=node_vec_len, node_fea_len=hidden_nodes,
                hidden_fea_len=hidden_nodes, n_conv=n_conv_layers, 
                n_hidden=n_hidden_layers, n_outputs=1, p_dropout=0.1)
# Transfer to GPU if needed
if use_GPU:
    model.cuda()

# Standardizer
outputs = [dataset[i][1] for i in range(len(dataset))]
standardizer = Standardizer(torch.Tensor(outputs))

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Loss function
loss_fn = torch.nn.MSELoss()

#### Train the model
loss = []
mae = []
epoch = []
for i in range(n_epochs):
    epoch_loss, epoch_mae = train_model(
        i,
        model,
        train_loader,
        optimizer,
        loss_fn,
        standardizer,
        use_GPU,
        max_atoms,
        node_vec_len,
    )
    loss.append(epoch_loss)
    mae.append(epoch_mae)
    epoch.append(i)

#### Test the model
# Call test model function
test_loss, test_mae = test_model(model, test_loader, loss_fn, standardizer,
                                 use_GPU, max_atoms, node_vec_len)

#### Print final results
print(f"Training Loss: {loss[-1]:.2f}")
print(f"Training MAE: {mae[-1]:.2f}")
print(f"Test Loss: {test_loss:.2f}")
print(f"Test MAE: {test_mae:.2f}")

Epoch: [0]	Training Loss: [0.48]	Training MAE: [0.69]
Epoch: [1]	Training Loss: [0.33]	Training MAE: [0.56]
Epoch: [2]	Training Loss: [0.28]	Training MAE: [0.51]
Epoch: [3]	Training Loss: [0.21]	Training MAE: [0.44]
Epoch: [4]	Training Loss: [0.17]	Training MAE: [0.40]
Epoch: [5]	Training Loss: [0.15]	Training MAE: [0.37]
Epoch: [6]	Training Loss: [0.13]	Training MAE: [0.35]
Epoch: [7]	Training Loss: [0.13]	Training MAE: [0.34]
Epoch: [8]	Training Loss: [0.12]	Training MAE: [0.33]
Epoch: [9]	Training Loss: [0.12]	Training MAE: [0.32]
Training Loss: 0.12
Training MAE: 0.32
Test Loss: 0.10
Test MAE: 0.30


In [3]:
# # Defining the model
# import torch
# from torch_geometric.nn import GCNConv, global_mean_pool

# class GCN(torch.nn.Module):
#     def __init__(self, num_features, hidden_channels):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(num_features, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.lin = torch.nn.Linear(hidden_channels, 1)

#     def forward(self, x, edge_index, batch):
#         x = self.conv1(x, edge_index).relu()
#         x = self.conv2(x, edge_index).relu()
#         x = global_mean_pool(x, batch)
#         return self.lin(x)


In [4]:
# # Training
# from torch_geometric.loader import DataLoader
# from sklearn.model_selection import train_test_split
# import pandas as pd

# # Load data
# main_path = Path.cwd().parents[0]
# data_path = main_path / "data" / "RDKit" / "valid_smiles_graphs.pkl"
# df = pd.read_pickle(data_path)
# dataset = np.array(df['graphs'])

# train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
# train_loader = DataLoader(train_data, batch_size=1000, shuffle=True)
# test_loader = DataLoader(test_data, batch_size=1000)

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = GCN(num_features=1, hidden_channels=64).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# loss_fn = torch.nn.MSELoss()

# for epoch in range(10):
#     model.train()
#     total_loss = 0
#     for batch in train_loader:
#         batch = batch.to(device)
#         optimizer.zero_grad()
#         out = model(batch.x.float(), batch.edge_index, batch.batch)
#         loss = loss_fn(out, batch.y)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


In [5]:
# # Evaluation
# model.eval()
# preds, trues = [], []
# with torch.no_grad():
#     for batch in test_loader:
#         batch = batch.to(device)
#         out = model(batch.x.float(), batch.edge_index, batch.batch)
#         preds.append(out.cpu())
#         trues.append(batch.y.cpu())

# import torch
# preds = torch.cat(preds)
# trues = torch.cat(trues)

# from sklearn.metrics import mean_absolute_error, r2_score
# print("MAE:", mean_absolute_error(trues, preds))
# print("RÂ²:", r2_score(trues, preds))
