In [1]:
from typing import Type

import dgl
import torch
import numpy as np

import yaml
from pydantic import BaseModel

from indigo import *


Using backend: pytorch


In [2]:
# Create the Indigo session and switch on both "ignore-*" options used by
# this notebook.
indigo = Indigo()
for option_name in ("ignore-stereochemistry-errors", "ignore-bad-valence"):
    indigo.setOption(option_name, True)

In [3]:
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import DGLDataset
from dgl.data.utils import split_dataset
from dgl.dataloading import GraphDataLoader
from dgl.nn.pytorch import Set2Set, NNConv
from tqdm.auto import tqdm
from sklearn.metrics import r2_score
import pandas as pd
from indigo.ml.mpp.preprocess import mol_to_graph


**MPNN (message-passing graph neural network) for property prediction**

In [None]:
# Input file and the columns holding the structure strings and the target.
file_name = "Adrenergic_dataset.csv"
target = "logP"  # alternative target: "AdrA1A_PCHEMBL_VALUE"
smiles = "Structure"

# Keep only the rows that have a value for the chosen target.
df = pd.read_csv(file_name)
has_target = df[target].notnull()
df = df[has_target]

# Structure -> target mapping; a structure that occurs more than once keeps
# the value of its last row.
data = {structure: value for structure, value in zip(df[smiles], df[target])}

*Dataset*

In [23]:
class MolDataset(DGLDataset):
    """DGL dataset built from a ``{structure: label}`` mapping.

    Every key of ``data`` is converted to a graph with ``mol_to_graph`` and
    paired with its value as the label.

    Attributes:
        data: The mapping passed to the constructor.
        graphs: List with one graph per entry of ``data``.
        labels: ``torch.FloatTensor`` built from the values of ``data``.
        gclasses: Number of labels (one per entry of ``data``).
        dim_nfeats: Length of the first ``'atomic'`` node-feature row of the
            first graph.
        dim_efeats: Length of the first ``'ord'`` edge-feature row of the
            first graph.
    """

    def __init__(self, data):
        # Stored before the base constructor runs so that ``process`` can read
        # it (presumably DGLDataset.__init__ triggers process() - see DGL docs).
        self.data = data
        super().__init__(name='mols')

    def process(self):
        """Build graphs, labels and feature sizes from the stored mapping."""
        entries = list(self.data.items())
        self.graphs = [mol_to_graph(structure) for structure, _ in entries]
        targets = [value for _, value in entries]

        first_graph = self.graphs[0]
        self.gclasses = len(targets)
        self.dim_nfeats = len(first_graph.ndata['atomic'][0])
        self.dim_efeats = len(first_graph.edata['ord'][0])
        self.labels = torch.FloatTensor(targets)

    def __getitem__(self, i):
        """Return the ``(graph, label)`` pair stored at position ``i``."""
        return self.graphs[i], self.labels[i]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)


*Dataloaders*

In [4]:
def load_data(dataset, batch_size=1, frac_list=None, shuffle=False, random_state=None):
    """Split ``dataset`` into train/validation/test parts and wrap each in a loader.

    Called as ``load_data(dataset)`` the split is identical to the previous
    hard-coded behaviour; the extra keyword arguments only expose what used
    to be fixed.

    Args:
        dataset: Dataset to split; handed to ``split_dataset``.
        batch_size: Number of graphs per batch for all three loaders. The
            default of 1 matches the loaders' behaviour when no batch size is
            given.
        frac_list: Forwarded to ``split_dataset``; ``None`` keeps DGL's
            default fractions.
        shuffle: Forwarded to ``split_dataset``; whether to shuffle the
            samples before splitting.
        random_state: Forwarded to ``split_dataset``; seed for that shuffle.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)`` of
        ``GraphDataLoader`` objects.
    """
    train_set, val_set, test_set = split_dataset(
        dataset, frac_list=frac_list, shuffle=shuffle, random_state=random_state
    )
    train_loader = GraphDataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=True, drop_last=False
    )
    # Evaluation metrics do not depend on sample order, so the validation and
    # test loaders iterate deterministically instead of being shuffled.
    val_loader = GraphDataLoader(
        dataset=val_set, batch_size=batch_size, shuffle=False, drop_last=False
    )
    test_loader = GraphDataLoader(
        dataset=test_set, batch_size=batch_size, shuffle=False, drop_last=False
    )
    return train_loader, val_loader, test_loader


*Model*

In [5]:
class MPNNGNN(nn.Module):
    """Node encoder that alternates an ``NNConv`` layer with a ``GRU`` update.

    Args:
        node_in_feats: Size of the input node features.
        edge_in_feats: Size of the input edge features.
        node_out_feats: Size of the projected and returned node features.
        edge_hidden_feats: Hidden size of the edge network given to ``NNConv``.
        num_step_message_passing: Number of NNConv + GRU rounds run in
            ``forward``.
    """

    def __init__(self, node_in_feats, edge_in_feats, node_out_feats=64,
                 edge_hidden_feats=128, num_step_message_passing=6):
        super().__init__()
        self.num_step_message_passing = num_step_message_passing

        # Input projection of the raw node features.
        self.project_node_feats = nn.Sequential(
            nn.Linear(node_in_feats, node_out_feats),
            nn.ReLU(),
        )

        # Edge network: its output has node_out_feats * node_out_feats values
        # per edge and is passed to NNConv as ``edge_func``.
        edge_func = nn.Sequential(
            nn.Linear(edge_in_feats, edge_hidden_feats),
            nn.ReLU(),
            nn.Linear(edge_hidden_feats, node_out_feats * node_out_feats),
        )
        self.gnn_layer = NNConv(
            in_feats=node_out_feats,
            out_feats=node_out_feats,
            edge_func=edge_func,
            aggregator_type='sum',
        )
        self.gru = nn.GRU(node_out_feats, node_out_feats)

    def forward(self, g, node_feats, edge_feats):
        """Run the message-passing rounds and return the final node features.

        Args:
            g: Graph (or batched graph) passed to the ``NNConv`` layer.
            node_feats: Input node features, fed to the projection layer.
            edge_feats: Input edge features, fed to the ``NNConv`` layer.

        Returns:
            Node features after ``num_step_message_passing`` rounds.
        """
        h = self.project_node_feats(node_feats)
        # The projected features double as the initial GRU hidden state.
        state = h.unsqueeze(0)

        for _ in range(self.num_step_message_passing):
            messages = F.relu(self.gnn_layer(g, h, edge_feats))
            gru_out, state = self.gru(messages.unsqueeze(0), state)
            h = gru_out.squeeze(0)

        return h

In [6]:
class MLPRegressor(nn.Module):
    """Prediction head: dropout, linear layer, ReLU, linear layer.

    Args:
        in_feats: Size of the input features.
        hidden_feats: Size of the hidden layer.
        n_tasks: Size of the output (one value per task).
        dropout: Dropout probability applied to the input.
    """

    def __init__(self, in_feats, hidden_feats, n_tasks, dropout=0.):
        super().__init__()

        layers = [
            nn.Dropout(dropout),
            nn.Linear(in_feats, hidden_feats),
            nn.ReLU(),
            nn.Linear(hidden_feats, n_tasks),
        ]
        self.predict = nn.Sequential(*layers)

    def forward(self, h):
        """Apply the head to ``h`` and return the result."""
        output = self.predict(h)
        return output

In [7]:
class MPNNRegressor(nn.Module):
    """Graph-level regressor: ``MPNNGNN`` encoder, ``Set2Set`` readout, MLP head.

    Args:
        in_node_feats: Size of the input node features.
        in_edge_feats: Size of the input edge features.
        node_hidden_dim: Node feature size produced by the encoder and
            consumed by the readout.
        edge_hidden_dim: Hidden size of the encoder's edge network.
        num_step_message_passing: Number of message-passing rounds in the
            encoder.
        num_step_set2set: Second positional argument of ``Set2Set``.
        num_layer_set2set: Third positional argument of ``Set2Set``.
        n_tasks: Number of values predicted per graph.
        regressor_hidden_feats: Hidden size of the ``MLPRegressor`` head.
        dropout: Dropout probability used in the head.
    """

    def __init__(self, in_node_feats, in_edge_feats, node_hidden_dim, edge_hidden_dim,
                 num_step_message_passing, num_step_set2set, num_layer_set2set, n_tasks,
                 regressor_hidden_feats=128, dropout=0.):
        super().__init__()

        self.gnn = MPNNGNN(
            node_in_feats=in_node_feats,
            edge_in_feats=in_edge_feats,
            node_out_feats=node_hidden_dim,
            edge_hidden_feats=edge_hidden_dim,
            num_step_message_passing=num_step_message_passing,
        )
        self.readout = Set2Set(node_hidden_dim, num_step_set2set, num_layer_set2set)

        # The head is sized for a readout vector of twice the node width.
        self.regressor = MLPRegressor(
            in_feats=2 * node_hidden_dim,
            hidden_feats=regressor_hidden_feats,
            n_tasks=n_tasks,
            dropout=dropout,
        )

    def forward(self, bg, node_feats, edge_feats):
        """Encode the nodes, pool them per graph and return the predictions.

        Args:
            bg: Graph (or batched graph) passed to the encoder and the readout.
            node_feats: Input node features.
            edge_feats: Input edge features.

        Returns:
            Output of the regression head for the pooled graph features.
        """
        node_repr = self.gnn(bg, node_feats, edge_feats)
        graph_repr = self.readout(bg, node_repr)
        return self.regressor(graph_repr)

*Parameters*

In [None]:
# Reference hyper-parameter set. The literal is only shown as this cell's
# output; it is not bound to any name here.
{
    'lr': 3e-4,
    'weight_decay': 0,
    'patience': 30,
    'batch_size': 128,
    'node_out_feats': 64,
    'edge_hidden_feats': 128,
    'num_step_message_passing': 6,
    'num_step_set2set': 6,
    'num_layer_set2set': 3,
}


In [None]:
# Number of passes over the training loader.
num_epoch = 20

# Keyword arguments for MPNNRegressor (everything except the feature sizes,
# which are read from the dataset below).
MPNN_params = {
    'node_hidden_dim': 64,
    'edge_hidden_dim': 16,
    'num_step_message_passing': 2,
    'num_step_set2set': 3,
    'num_layer_set2set': 2,
    'regressor_hidden_feats': 32,
    'dropout': 0.0,
    'n_tasks': 1,
}

dataset = MolDataset(data)
train_loader, val_loader, test_loader = load_data(dataset)

model = MPNNRegressor(
    in_node_feats=dataset.dim_nfeats,
    in_edge_feats=dataset.dim_efeats,
    **MPNN_params,
)
loss_fcn = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

*Training*

In [None]:
# Train the model for ``num_epoch`` passes over ``train_loader`` and report
# the mean batch loss of every epoch.
for epoch in tqdm(range(num_epoch)):
    losses = []
    for batched_graph, labels in train_loader:

        node_feats = batched_graph.ndata['atomic'].float()
        edge_feats = batched_graph.edata['ord'].float()
        prediction = model(batched_graph, node_feats, edge_feats)
        # Give the labels the prediction's shape so the loss is computed
        # element-wise instead of relying on broadcasting between the two.
        loss = loss_fcn(prediction, labels.reshape(prediction.shape))
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the average over all batches of the epoch; ``loss`` alone is only
    # the last batch. An empty loader yields NaN instead of failing here.
    epoch_loss = sum(losses) / len(losses) if losses else float('nan')
    print('Epoch: {}/{}.............'.format(epoch + 1, num_epoch), end=' ')
    print("Loss: {:.4f}".format(epoch_loss))
   

In [None]:
from sklearn.metrics import mean_absolute_error

def evaluate(model, loader):
    """Print the R2 score and the mean absolute error of ``model`` on ``loader``.

    The model is put into evaluation mode for the pass and returned to the
    mode it was in before the call, also when an error occurs. Batches of any
    size are supported: predictions and labels are flattened before being
    collected.

    Args:
        model: Module called as ``model(batched_graph, node_feats, edge_feats)``.
        loader: Iterable yielding ``(batched_graph, label)`` pairs whose graphs
            carry ``'atomic'`` node data and ``'ord'`` edge data.
    """
    preds = []
    labels = []
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for batched_graph, label in loader:
                node_feats = batched_graph.ndata['atomic'].float()
                edge_feats = batched_graph.edata['ord'].float()
                prediction = model(batched_graph, node_feats, edge_feats)
                # Flatten so a batch contributes one value per sample;
                # float(tensor) would only work for single-element batches.
                preds.extend(prediction.reshape(-1).tolist())
                labels.extend(label.reshape(-1).tolist())
    finally:
        # Restore the caller's mode rather than forcing training mode.
        model.train(was_training)

    print(f'R2 score: {r2_score(labels, preds):.2f}')
    print(f'MAE: {mean_absolute_error(labels, preds):.2f}')

In [None]:
# Report R2 and MAE on the validation loader.
evaluate(model=model, loader=val_loader)

*GMPP (Graph Molecular Property Prediction) main training*

In [14]:
class GMPPConfig(BaseModel):
    """Configuration of the Graph Molecular Property Prediction (GMPP) model.

    All fields except ``num_epoch`` carry the names of ``MPNNRegressor``
    constructor arguments.

    Attributes:
        node_hidden_dim: Node feature size produced by the message-passing
            encoder and consumed by the readout.
        edge_hidden_dim: Hidden size of the encoder's edge network.
        num_step_message_passing: Number of message-passing rounds.
        num_step_set2set: Step count handed to the ``Set2Set`` readout.
        num_layer_set2set: Layer count handed to the ``Set2Set`` readout.
        regressor_hidden_feats: Hidden size of the MLP regression head.
        dropout: Dropout probability used in the regression head; a float so
            that fractional probabilities are kept as given.
        n_tasks: Number of values predicted per graph.
        num_epoch: Number of training epochs.
    """
    node_hidden_dim: int
    edge_hidden_dim: int
    num_step_message_passing: int
    num_step_set2set: int
    num_layer_set2set: int
    regressor_hidden_feats: int
    dropout: float
    n_tasks: int
    num_epoch: int
    

In [15]:
def load_config(file_name: str, config_class: Type[BaseModel]) -> BaseModel:
    """Read a YAML file and build a ``config_class`` instance from it.

    Args:
        file_name: Path of the YAML file to read.
        config_class: Class (not an instance) that is called with the file's
            top-level keys as keyword arguments.

    Returns:
        The object returned by ``config_class(**data)``.

    Raises:
        ValueError: If the YAML document is not a mapping at the top level.
    """
    with open(file_name, "r") as f:
        data = yaml.safe_load(f)

    # An empty document loads as None; treat it as "no keys" so the config
    # class reports which fields are missing instead of ``**None`` failing.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise ValueError(
            "Expected a mapping at the top level of {!r}, got {}".format(
                file_name, type(data).__name__
            )
        )
    return config_class(**data)

In [28]:
def main(
    dataset_filename: str,
    smiles_column: str,
    target_column: str,
    config_filename: str
):
    """Train an ``MPNNRegressor`` on one column of a CSV file.

    Rows without a target value are dropped, the remaining structures are
    turned into a ``MolDataset``, and the model described by the YAML
    configuration is trained on the training split. The mean batch loss is
    printed after every epoch.

    Args:
        dataset_filename: File name or path of the CSV file (handed to
            ``pd.read_csv``).
        smiles_column: Name of the column holding the structure strings.
        target_column: Name of the column holding the value to predict.
        config_filename: YAML file loaded into a ``GMPPConfig``.
    """
    df = pd.read_csv(dataset_filename)
    df = df.loc[df[target_column].notnull()]
    data = dict(zip(df[smiles_column], df[target_column]))

    config = load_config(config_filename, GMPPConfig)
    dataset = MolDataset(data)
    train_loader, _, _ = load_data(dataset)

    # ``num_epoch`` drives the loop below and is not a constructor argument of
    # MPNNRegressor, so it must not be forwarded with the model parameters.
    model_params = config.dict(exclude={"num_epoch"})
    model = MPNNRegressor(dataset.dim_nfeats, dataset.dim_efeats, **model_params)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fcn = torch.nn.SmoothL1Loss()

    for epoch in tqdm(range(config.num_epoch)):
        losses = []
        for batched_graph, labels in train_loader:
            node_feats = batched_graph.ndata['atomic'].float()
            edge_feats = batched_graph.edata['ord'].float()
            prediction = model(batched_graph, node_feats, edge_feats)
            # Give the labels the prediction's shape so the loss is computed
            # element-wise instead of relying on broadcasting.
            loss = loss_fcn(prediction, labels.reshape(prediction.shape))
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Per-epoch report: average over all batches, using the configured
        # epoch count rather than a notebook-level variable.
        epoch_loss = sum(losses) / len(losses) if losses else float('nan')
        print('Epoch: {}/{}.............'.format(epoch + 1, config.num_epoch), end=' ')
        print("Loss: {:.4f}".format(epoch_loss))
    

In [None]:
# Train on the logP column of the adrenergic data set with the MPNN config.
main(
    dataset_filename="Adrenergic_dataset.csv",
    smiles_column="Structure",
    target_column="logP",
    config_filename="MPNN.yml",
)