In [None]:
### import statements
# Shell installs are interleaved with the imports on purpose: the PyG wheel
# index used below is selected from the torch build that is already present.
! pip install chemprop
import random
import numpy as np
import pandas as pd
! pip install pytorch-lightning wandb rdkit ogb deepchem
import torch
VERSION = torch.__version__
! pip install pyg_lib torch_scatter torch_sparse -f https://data.pyg.org/whl/torch-{VERSION}.html
! pip install torch-geometric
# -p keeps the cell re-runnable when the directory already exists
! mkdir -p data/

# RDKit drawing configuration
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG = True  # < use SVGs instead of PNGs
IPythonConsole.drawOptions.addAtomIndices = True  # adding indices for atoms
IPythonConsole.drawOptions.addBondIndices = False  # not adding indices for bonds
IPythonConsole.molSize = 200, 200

# seed every random number generator used by the notebook
torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
random.seed(0)

# deep-learning stack
# (deepchem.splits / deepchem.feat ship with the `deepchem` package installed
#  above, so no further pip install is needed for them)
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import GRU
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, MLP, NNConv, global_add_pool
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
)
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
from ogb.utils import smiles2graph
from tqdm import tqdm
import wandb

# descriptor-based models and utilities
from deepchem.splits import RandomSplitter
from deepchem.feat import RDKitDescriptors
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
### downloading data and assigning key columns from the dataset
df = pd.read_csv('https://raw.githubusercontent.com/NinaB99/AI-for-Chemistry/main/Data/Bioavailibility.csv')

# plain Python lists for the molecule strings and the class labels
smiles = df['SMILES'].to_numpy().tolist()
category = df['Category'].to_numpy().tolist()

# regression target as a NumPy array
y = df['logK(%F)'].to_numpy()

# z-score the bioavailability targets; keep the statistics as Python floats
mean, std = np.mean(y), np.std(y)
y = (y - mean) / std
mean, std = mean.item(), std.item()

In [None]:
### classes for GNN and its MLP
class CustomMLP(nn.Module):
    '''Fully connected network with ReLU + dropout after every hidden layer.

    layer_sizes lists the width of every layer from input to output; one
    nn.Linear is created per consecutive pair. The final Linear is left bare
    (no activation, no dropout).
    '''
    def __init__(self, layer_sizes, dropout_rate=0.5):
        super().__init__()
        final_idx = len(layer_sizes) - 2  # index of the output Linear
        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            if idx != final_idx:
                # hidden layer: non-linearity followed by dropout
                modules.extend([nn.ReLU(), nn.Dropout(dropout_rate)])
        self.mlp = nn.Sequential(*modules)

    def forward(self, x):
        '''Run x through the stacked layers.'''
        return self.mlp(x)

class Graph_NN(pl.LightningModule):
    '''Graph neural network regressor trained with the MSE loss.

    Atom and bond features are embedded, refined by repeated NNConv message
    passing with a GRU node update, summed per graph and mapped to the output
    by an MLP. The split datasets are stored on the module, which builds its
    own dataloaders.

    Args:
        hidden_dim: width of the atom/bond embeddings and of the node states.
        out_dim: output size of the readout MLP.
        train_data, valid_data, test_data: datasets wrapped by the matching
            ``*_dataloader`` methods.
        std: standard deviation of the data's target; validation and test MSE
            are multiplied back by it before being logged.
        dropout_rate: dropout probability used inside both CustomMLP blocks.
        batch_size: number of graphs per mini-batch in all three dataloaders.
        lr: learning rate of the Adam optimizer.
        num_mp_steps: number of message-passing iterations (default 3).
    '''
    def __init__(self, hidden_dim, out_dim,
                 train_data, valid_data, test_data,
                 std, dropout_rate=0.5, batch_size=32, lr=1e-3,
                 num_mp_steps=3):
        super().__init__()
        self.std = std  # std of data's target
        self.train_data = train_data
        self.valid_data = valid_data
        self.test_data = test_data
        self.batch_size = batch_size
        self.lr = lr
        self.num_mp_steps = num_mp_steps
        # Initial layers
        self.atom_emb = AtomEncoder(emb_dim=hidden_dim)
        self.bond_emb = BondEncoder(emb_dim=hidden_dim)
        # Message passing layers.
        # The edge network turns each bond embedding into hidden_dim * hidden_dim
        # values for NNConv. It is deliberately not called `nn`, so the
        # torch.nn alias is not shadowed inside this method.
        edge_network = CustomMLP([hidden_dim, hidden_dim * 2, hidden_dim * hidden_dim], dropout_rate)
        self.conv = NNConv(hidden_dim, hidden_dim, edge_network, aggr='mean')
        self.gru = GRU(hidden_dim, hidden_dim)
        # Readout layers
        self.mlp = CustomMLP([hidden_dim, hidden_dim // 2, out_dim], dropout_rate)

    def forward(self, data, mode="train"):
        '''Predict one flattened output vector for a batch of graphs.

        Args:
            data: batched graph object providing x, edge_index, edge_attr and batch.
            mode: accepted for backward compatibility; it is not used.

        Returns:
            1-D tensor holding the readout MLP output, flattened with view(-1).
        '''
        # Initialization
        x = self.atom_emb(data.x)
        h = x.unsqueeze(0)  # initial GRU hidden state, with a leading length-1 axis
        edge_attr = self.bond_emb(data.edge_attr)

        # Message passing
        for _ in range(self.num_mp_steps):
            m = F.relu(self.conv(x, data.edge_index, edge_attr))  # send message and aggregation
            x, h = self.gru(m.unsqueeze(0), h)  # node update
            x = x.squeeze(0)

        # Readout
        x = global_add_pool(x, data.batch)
        x = self.mlp(x)

        return x.view(-1)

    def _rescaled_mse(self, batch, mode):
        '''MSE between predictions and targets after multiplying both by self.std.'''
        out = self.forward(batch, mode=mode)
        return F.mse_loss(out * self.std, batch.y * self.std)

    def training_step(self, batch, batch_idx):
        # Here we define the train loop.
        out = self.forward(batch, mode="train")
        loss = F.mse_loss(out, batch.y)
        # batch_size is passed explicitly so the logger does not have to infer
        # it from the graph batch object.
        self.log("Train loss", loss, batch_size=batch.num_graphs)
        return loss

    def validation_step(self, batch, batch_idx):
        # Define validation step. At the end of every epoch, this will be executed
        loss = self._rescaled_mse(batch, mode="valid")  # report MSE
        self.log("Valid MSE", loss, batch_size=batch.num_graphs)

    def test_step(self, batch, batch_idx):
        # What to do in test
        loss = self._rescaled_mse(batch, mode="test")  # report MSE
        self.log("Test MSE", loss, batch_size=batch.num_graphs)

    def configure_optimizers(self):
        # Here we configure the optimization algorithm.
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.valid_data, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=self.batch_size, shuffle=False)


In [None]:
### creating data class for the GNN
class csvGraphData(InMemoryDataset):
    """The bioavailability graph dataset using PyG.

    The raw CSV is downloaded from ``raw_url``; every SMILES string in it is
    converted to a graph with ``smiles2graph`` and stored together with its
    ``logK(%F)`` value as the target ``y``. One graph is created per CSV row,
    in CSV order.
    """
    # bioavailability dataset download link
    raw_url = 'https://raw.githubusercontent.com/NinaB99/AI-for-Chemistry/main/Data/Bioavailibility.csv'

    def __init__(self, root, transform=None):
        super().__init__(root, transform)
        # The processed file is the pickled (data, slices) pair written by
        # process(). weights_only=False is required because it holds PyG
        # objects rather than bare tensors, and torch >= 2.6 defaults to
        # weights_only=True.
        # NOTE: this unpickles arbitrary objects - only load cache files that
        # were produced locally by process().
        self.data, self.slices = torch.load(self.processed_paths[0], weights_only=False)

    @property
    def raw_file_names(self):
        return ['Bioavailibility.csv']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        """Fetch the raw CSV into the dataset's raw directory."""
        print('Downloading dataset...')
        download_url(self.raw_url, self.raw_dir)

    def process(self):
        """Convert every CSV row into a PyG ``Data`` object and cache the collated result."""
        # load raw data from a csv file
        df = pd.read_csv(self.raw_paths[0], sep=',')
        smiles = df['SMILES'].values.tolist()
        target = df['logK(%F)'].values.tolist()

        # Convert SMILES into graph data
        print('Converting SMILES strings into graphs...')
        data_list = []
        for smi, value in zip(tqdm(smiles), target):

            # get graph data from SMILES
            graph = smiles2graph(smi)

            # convert to tensor and pyg data
            x = torch.tensor(graph['node_feat'], dtype=torch.long)
            edge_index = torch.tensor(graph['edge_index'], dtype=torch.long)
            edge_attr = torch.tensor(graph['edge_feat'], dtype=torch.long)
            y = torch.tensor([value], dtype=torch.float)
            data_list.append(Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y))

        # save data
        torch.save(self.collate(data_list), self.processed_paths[0])

In [None]:
### preparing data and splitting
# create dataset (shuffled: item order no longer matches the CSV row order)
dataset = csvGraphData('./data_pyg').shuffle()

# Normalize target to mean = 0 and std = 1.
# NOTE(review): the statistics are computed over the whole dataset, i.e. they
# include the validation and test targets - confirm this is intended.
# These assignments also rebind the module-level `mean` / `std`.
mean = dataset.data.y.mean()
std = dataset.data.y.std()
dataset.data.y = (dataset.data.y - mean) / std
mean, std = mean.item(), std.item()

# split data
splitter = RandomSplitter()
train_idx, valid_idx, test_idx = splitter.split(dataset, frac_train=0.7, frac_valid=0.1, frac_test=0.2)
train_dataset_graph = dataset[train_idx]
valid_dataset_graph = dataset[valid_idx]
test_dataset_graph = dataset[test_idx]
# optional print statements for testing:
# print(f"Train dataset size: {len(train_dataset_graph)}")
# print(f"Valid dataset size: {len(valid_dataset_graph)}")
# print(f"Test dataset size: {len(test_dataset_graph)}")

In [None]:
### running an instance of the GNN
wandb.init(project="gnn-bioavailibility",
           config={
               "batch_size": 24,
               "learning_rate": 0.001,
               "hidden_size": 64,
               "max_epochs": 150
           })

# creating instance of GNN and setting hyperparameters
graph_branch = Graph_NN(
     hidden_dim = wandb.config["hidden_size"],
     out_dim = 1,
     std = std,
     train_data = train_dataset_graph,
     valid_data = valid_dataset_graph,
     test_data = test_dataset_graph,
     lr=wandb.config["learning_rate"],
     batch_size=wandb.config["batch_size"]
)
wandb_logger = WandbLogger()

# Keep the checkpoint with the lowest validation MSE. Without a monitored
# metric the default checkpoint is simply the last epoch, so
# ckpt_path="best" below would not select the best model.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="Valid MSE", mode="min")
trainer = pl.Trainer(
    max_epochs = wandb.config["max_epochs"],
    logger = wandb_logger,
    callbacks = [checkpoint_callback]
)

# Training the model
trainer.fit(
    model=graph_branch,
)

# Running test on the best checkpoint (its weights are loaded into graph_branch)
graph_results = trainer.test(model=graph_branch, ckpt_path="best")
wandb.finish()
# Test RMSE
graph_test_mse = graph_results[0]["Test MSE"]
graph_test_rmse = graph_test_mse ** 0.5
print(f"\nGraph neural network model performance: RMSE on test set = {graph_test_rmse:.4f}.\n")
# save this instance of the GNN
torch.save(graph_branch.state_dict(), 'graph_nn_model.pth')

In [None]:
### Evaluating the predictions of the GNN
# change mode of GNN
graph_branch.eval()

# Create a DataLoader for the dataset.
# NOTE: this iterates over the complete dataset (train + valid + test), so the
# metrics below are not a held-out estimate.
test_loader = DataLoader(dataset, batch_size=24, shuffle=False)

# Perform the forward pass to get predictions
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # keep the batch on the same device as the model, collect results on CPU
        predictions = graph_branch(batch.to(graph_branch.device))
        test_predictions.append(predictions.cpu())

# Concatenate all predictions (still in the shuffled dataset order)
test_predictions_tensor_graph = torch.cat(test_predictions)

# Convert predictions back to NumPy array and restore the CSV row order.
# `dataset` was shuffled when it was created, whereas `category` follows the
# CSV rows; dataset.indices()[j] is the CSV row of the j-th dataset item.
shuffled_predictions = test_predictions_tensor_graph.numpy().flatten()
csv_rows = np.asarray(dataset.indices())
graph_predictions = np.empty_like(shuffled_predictions)
graph_predictions[csv_rows] = shuffled_predictions

# Create outputs DataFrame containing predictions from GNN and the categorised real values
outputs_df = pd.DataFrame({
    "Predictions": graph_predictions,
    "Category": category
})
count_total = len(outputs_df)

# classifying the predictions as high or low and comparing to literature values
# NOTE(review): the predictions are on the normalised target scale - confirm
# that 0.5 is the intended cut-off on that scale.
predicted_high = outputs_df["Predictions"] > 0.5
predicted_low = outputs_df["Predictions"] < 0.5
is_high = outputs_df["Category"] == 1
is_low = outputs_df["Category"] == 0

# checking the accuracy of the predictions
graph_NN_TP = (predicted_high & is_high).sum()
graph_NN_FN = (predicted_low & is_high).sum()
graph_NN_TN = (predicted_low & is_low).sum()
graph_NN_FP = (predicted_high & is_low).sum()

graph_count_correct = int(graph_NN_TP + graph_NN_TN)
print(graph_count_correct/count_total*100, "%")

# calculating evaluation metrics for the models
graph_NN_sensitivity = graph_NN_TP / (graph_NN_TP + graph_NN_FN)
print(f"Sensitivity of graph neural network: {graph_NN_sensitivity}")

graph_NN_specificity = graph_NN_TN / (graph_NN_TN + graph_NN_FP)
print(f"Specificity of graph neural network: {graph_NN_specificity}")

graph_NN_CCR = ((graph_NN_sensitivity + graph_NN_specificity) / 2) * 100
print(f"Correct classification rate of graph neural network: {graph_NN_CCR}")

print(outputs_df)