Step 1: Install Dependencies
Before starting, you need to install the required dependencies, including PyTorch, PyTorch Geometric, and RDKit for molecular processing.

In [1]:
!pip install torch==2.0.0+cu118 torchvision torchaudio -f https://download.pytorch.org/whl/cu118/torch_stable.html
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!pip install torch-geometric
!pip install rdkit tqdm


Looking in links: https://download.pytorch.org/whl/cu118/torch_stable.html
Collecting torch==2.0.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.0.0 (from torch==2.0.0+cu118)
  Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.0 kB)
Collecting lit (from triton==2.0.0->torch==2.0.0+cu118)
  Downloading lit-18.1.8-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Downloading torchvision-0.19.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
  Downloading torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
  Downloading https://download.pytorch.org/whl/cu

Chapter 1: Exploring the QM9 Dataset and Basic GNN Implementation
In this chapter, we'll load and preprocess the QM9 dataset, explore chemical structures, and implement a basic GNN model to predict properties of molecules.

Code for Loading and Exploring the QM9 Dataset

In [2]:
from torch_geometric.datasets import QM9
from torch_geometric.data import DataLoader
from rdkit import Chem
from rdkit.Chem import Draw

# Load the QM9 dataset
def load_data(batch_size=32, num_samples=10000, root='data/QM9'):
    """Build a shuffled mini-batch loader over a leading slice of QM9.

    Args:
        batch_size: Number of molecular graphs per mini-batch.
        num_samples: Number of leading dataset entries to keep; a subset is
            used so the notebook runs quickly.
        root: Directory the QM9 dataset is stored in (and downloaded to when
            it is not there yet).

    Returns:
        A PyTorch Geometric ``DataLoader`` that yields shuffled batches of
        the selected subset. The subset itself is reachable through the
        loader's ``dataset`` attribute.
    """
    # ``torch_geometric.data.DataLoader`` is deprecated in PyG 2.x in favour
    # of ``torch_geometric.loader.DataLoader``; import the supported class
    # locally so this function does not depend on the deprecated alias.
    from torch_geometric.loader import DataLoader

    dataset = QM9(root=root)
    # Selecting a smaller subset for faster execution
    small_dataset = dataset[:num_samples]
    return DataLoader(small_dataset, batch_size=batch_size, shuffle=True)

# Visualize a molecule
def visualize_molecule(mol_data):
    """Draw the molecule whose SMILES string is stored in ``mol_data``.

    Args:
        mol_data: Mapping-like object exposing the SMILES string under the
            ``'smiles'`` key.

    Returns:
        The image produced by RDKit's ``Draw.MolToImage`` for that molecule.
    """
    molecule = Chem.MolFromSmiles(mol_data['smiles'])
    picture = Draw.MolToImage(molecule)
    return picture

dataloader = load_data()
# Visualize a sample molecule
sample = dataloader.dataset[0]
img = visualize_molecule(sample)
# End the cell with the image object so Jupyter renders it inline.
# ``img.show()`` hands the picture to an external viewer instead, which
# shows nothing in the notebook output (and is usually unavailable on
# hosted or headless kernels).
img


Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting data/QM9/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404
Processing...
100%|██████████| 133885/133885 [02:08<00:00, 1039.49it/s]
Done!


Basic GNN Model

In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    """Two GCN layers followed by a linear head.

    No pooling is applied, so the output has one row per row of ``data.x``.

    Args:
        in_feats: Width of the input node features.
        hidden_dim: Width of both graph-convolution layers.
        out_feats: Width of the output produced by the linear head.
    """

    def __init__(self, in_feats, hidden_dim, out_feats):
        super().__init__()
        self.conv1 = GCNConv(in_feats, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, out_feats)

    def forward(self, data):
        """Run both convolutions (each followed by ReLU) and the linear head.

        Args:
            data: Graph object providing ``x`` and ``edge_index``.

        Returns:
            Tensor of shape ``(data.x.shape[0], out_feats)``.
        """
        edges = data.edge_index
        hidden = self.conv1(data.x, edges)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden, edges)
        hidden = F.relu(hidden)
        return self.fc(hidden)


Chapter 2: GNN-VAE Implementation
Now we’ll extend the GNN to a Variational Autoencoder (VAE) to generate molecules.

GNN-VAE Model

In [4]:
class GNNVAE(torch.nn.Module):
    """Variational autoencoder over node features with a GNN encoder.

    The encoder maps a graph to ``latent_dim``-wide features; two linear
    heads turn those into the mean and log-variance of the approximate
    posterior, and a linear decoder maps a latent sample back to
    ``in_feats``-wide features.

    Args:
        in_feats: Width of the input (and reconstructed) node features.
        hidden_dim: Hidden width of the GNN encoder.
        latent_dim: Width of the latent code.
    """

    def __init__(self, in_feats, hidden_dim, latent_dim):
        super(GNNVAE, self).__init__()
        self.encoder = GNN(in_feats, hidden_dim, latent_dim)
        self.fc_mu = torch.nn.Linear(latent_dim, latent_dim)
        self.fc_logvar = torch.nn.Linear(latent_dim, latent_dim)
        self.decoder = torch.nn.Linear(latent_dim, in_feats)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``.

        Writing the sample this way keeps it differentiable with respect to
        ``mu`` and ``logvar``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def forward(self, data):
        """Encode the graph, sample a latent code and decode it.

        Args:
            data: Graph object accepted by the encoder.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        # The graph has to go through the encoder first: the linear heads
        # expect a tensor of latent_dim-wide features, not the graph object.
        hidden = self.encoder(data)
        mu, logvar = self.fc_mu(hidden), self.fc_logvar(hidden)
        z = self.reparameterize(mu, logvar)
        return self.decoder(z), mu, logvar

# Loss function for VAE
def vae_loss(recon_x, x, mu, logvar):
    """Negative ELBO: summed reconstruction error plus KL divergence.

    Args:
        recon_x: Reconstructed features.
        x: Target features, same shape as ``recon_x``.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.

    Returns:
        Scalar tensor ``recon_loss + kld_loss``.
    """
    # Both terms are summed over all elements. A mean-reduced reconstruction
    # term next to a summed KL term puts them on different scales, letting
    # the KL term dominate the objective.
    recon_loss = F.mse_loss(recon_x, x, reduction='sum')
    # KL divergence between N(mu, exp(logvar)) and the standard normal prior.
    kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kld_loss


Training Loop for GNN-VAE

In [5]:
import torch.optim as optim

def train_gnn_vae(vae_model, dataloader, epochs=5):
    """Train a GNN-VAE with Adam (lr=1e-3) and print the summed loss per epoch.

    Args:
        vae_model: Model returning ``(reconstruction, mu, logvar)`` for a batch.
        dataloader: Iterable of batches exposing ``.to(device)`` and ``.x``.
        epochs: Number of passes over ``dataloader``.
    """
    optimizer = optim.Adam(vae_model.parameters(), lr=1e-3)
    # Follow the device the model already lives on instead of hard-coding
    # 'cuda': this works on CPU-only machines and keeps batch and weights
    # on the same device.
    device = next(vae_model.parameters()).device
    for epoch in range(epochs):
        vae_model.train()
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            recon, mu, logvar = vae_model(batch)
            loss = vae_loss(recon, batch.x, mu, logvar)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, VAE Loss: {total_loss:.4f}")


Chapter 3: GNN-GAN Implementation
This chapter focuses on using GANs to generate new molecules.

GNN-GAN Generator and Discriminator

In [6]:
class GNNGANGenerator(torch.nn.Module):
    """Generator made of a single linear layer from noise to features.

    Args:
        latent_dim: Width of the input noise vectors.
        out_feats: Width of the generated feature vectors.
    """

    def __init__(self, latent_dim, out_feats):
        super().__init__()
        self.fc = torch.nn.Linear(latent_dim, out_feats)

    def forward(self, z):
        """Project the noise ``z`` to generated features."""
        generated = self.fc(z)
        return generated

class GNNGANDiscriminator(torch.nn.Module):
    """Discriminator made of one linear layer followed by a sigmoid.

    Args:
        in_feats: Width of the feature vectors being scored.
    """

    def __init__(self, in_feats):
        super().__init__()
        self.fc = torch.nn.Linear(in_feats, 1)

    def forward(self, features):
        """Return one score in (0, 1) per input row."""
        logits = self.fc(features)
        return torch.sigmoid(logits)

# Loss for GAN
def gan_loss(real_pred, fake_pred):
    """Discriminator loss: BCE of real scores against 1 plus BCE of fake scores against 0.

    Args:
        real_pred: Discriminator probabilities for real samples.
        fake_pred: Discriminator probabilities for generated samples.

    Returns:
        Scalar tensor holding the sum of the two mean-reduced BCE terms.
    """
    targets_real = torch.ones_like(real_pred)
    targets_fake = torch.zeros_like(fake_pred)
    loss_on_real = F.binary_cross_entropy(real_pred, targets_real)
    loss_on_fake = F.binary_cross_entropy(fake_pred, targets_fake)
    return loss_on_real + loss_on_fake


Training Loop for GNN-GAN

In [7]:
def train_gnn_gan(generator, discriminator, dataloader, epochs=5, latent_dim=None):
    """Alternately train discriminator and generator; print the summed loss per epoch.

    Args:
        generator: Module mapping ``(n, latent_dim)`` noise to features.
        discriminator: Module mapping features to probabilities in (0, 1).
            It must live on the same device as ``generator``.
        dataloader: Iterable of batches exposing ``.to(device)`` and ``.x``.
        epochs: Number of passes over ``dataloader``.
        latent_dim: Width of the noise vectors. When omitted it is read from
            ``generator.fc.in_features``. Previously the function read an
            undefined ``latent_dim`` name; a notebook-level global of that
            name is no longer consulted.

    Raises:
        ValueError: If ``latent_dim`` is omitted and cannot be inferred.
    """
    if latent_dim is None:
        first_layer = getattr(generator, 'fc', None)
        latent_dim = getattr(first_layer, 'in_features', None)
        if latent_dim is None:
            raise ValueError(
                "latent_dim could not be inferred from generator.fc; pass it explicitly"
            )

    g_optimizer = optim.Adam(generator.parameters(), lr=1e-3)
    d_optimizer = optim.Adam(discriminator.parameters(), lr=1e-3)
    # Follow the device the generator already lives on instead of
    # hard-coding 'cuda', so the loop also runs on CPU-only machines.
    device = next(generator.parameters()).device
    for epoch in range(epochs):
        generator.train()
        discriminator.train()
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            # The noise must be created on the same device as the models.
            z = torch.randn(batch.x.shape[0], latent_dim, device=device)
            fake_features = generator(z)

            # Discriminator step: detach the fakes so that no gradient
            # reaches the generator here.
            real_pred = discriminator(batch.x)
            fake_pred = discriminator(fake_features.detach())
            d_loss = gan_loss(real_pred, fake_pred)
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # Generator step: score the fakes again with the updated
            # discriminator and push those scores towards "real".
            fake_pred = discriminator(fake_features)
            g_loss = F.binary_cross_entropy(fake_pred, torch.ones_like(fake_pred))
            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

            total_loss += g_loss.item() + d_loss.item()
        print(f"Epoch {epoch+1}/{epochs}, GAN Loss: {total_loss:.4f}")


Chapter 4: GNN-GARAE Implementation
This chapter explores autoregressive models for molecule generation.

GNN-GARAE Model

In [8]:
class GNNGARAE(torch.nn.Module):
    """GNN encoder followed by an LSTM decoder and a linear output layer.

    Args:
        in_feats: Width of the input (and reconstructed) node features.
        hidden_dim: Hidden width of the GNN encoder.
        latent_dim: Width of the encoder output and of the LSTM state.
    """

    def __init__(self, in_feats, hidden_dim, latent_dim):
        super().__init__()
        self.encoder = GNN(in_feats, hidden_dim, latent_dim)
        self.decoder = torch.nn.LSTM(latent_dim, latent_dim, batch_first=True)
        self.fc = torch.nn.Linear(latent_dim, in_feats)

    def forward(self, data):
        """Encode the graph, pass each row through the LSTM and project back.

        Args:
            data: Graph object accepted by the encoder.

        Returns:
            Tensor with one ``in_feats``-wide row per encoder output row.
        """
        encoded = self.encoder(data)
        # Each encoder row is fed to the LSTM as a sequence of length one.
        as_sequences = torch.unsqueeze(encoded, 1)
        decoded, _state = self.decoder(as_sequences)
        return self.fc(torch.squeeze(decoded, 1))


Training Loop for GNN-GARAE

In [9]:
def train_gnn_garae(garae_model, dataloader, epochs=5):
    """Train the GARAE model to reconstruct node features; print the summed loss per epoch.

    Args:
        garae_model: Model returning a reconstruction of ``batch.x`` for a batch.
        dataloader: Iterable of batches exposing ``.to(device)`` and ``.x``.
        epochs: Number of passes over ``dataloader``.
    """
    optimizer = optim.Adam(garae_model.parameters(), lr=1e-3)
    # Follow the device the model already lives on instead of hard-coding
    # 'cuda': this works on CPU-only machines and keeps batch and weights
    # on the same device.
    device = next(garae_model.parameters()).device
    for epoch in range(epochs):
        garae_model.train()
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            recon = garae_model(batch)
            loss = F.mse_loss(recon, batch.x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, GARAE Loss: {total_loss:.4f}")


Evaluation Metrics: Validity, Novelty, Uniqueness
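You can use RDKit to evaluate the generated molecules. The example below computes validity only (the fraction of SMILES strings that RDKit can parse); novelty and uniqueness are not implemented here.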

In [10]:
from rdkit import Chem

def evaluate_molecules(molecules):
    """Print the percentage of SMILES strings that RDKit can parse.

    Args:
        molecules: Iterable of SMILES strings. Any iterable is accepted,
            including generators. An empty iterable is reported as 0.00%
            validity instead of raising ``ZeroDivisionError``.
    """
    total = 0
    valid_count = 0
    for smiles in molecules:
        total += 1
        if Chem.MolFromSmiles(smiles):  # Check if SMILES is valid
            valid_count += 1
    # Guard the ratio: with nothing to evaluate there is no meaningful
    # validity, so report zero rather than dividing by zero.
    validity = valid_count / total if total else 0.0
    print(f"Validity: {validity * 100:.2f}%")

# Hand-written SMILES strings standing in for model output
generated_molecules = [
    "CCO",
    "CCC",
    "C1=CC=CC=C1",
    "CCN",
    "CCOCC",
]
evaluate_molecules(generated_molecules)


Validity: 100.00%
