In [None]:
### total import statements
! pip install chemprop
import random
import numpy as np
import pandas as pd
! pip install pytorch-lightning wandb rdkit ogb deepchem
import torch
VERSION = torch.__version__
! pip install pyg_lib torch_scatter torch_sparse -f https://data.pyg.org/whl/torch-{VERSION}.html
! pip install torch-geometric
! mkdir data/
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
# Notebook rendering options for RDKit molecule drawings.
IPythonConsole.molSize = (200, 200)  # size of each rendered molecule image
IPythonConsole.ipython_useSVG = True  # render molecules as SVG rather than PNG
IPythonConsole.drawOptions.addBondIndices = False  # leave bonds unlabelled
IPythonConsole.drawOptions.addAtomIndices = True  # label every atom with its index
# Reproducibility: give Python's `random`, NumPy and PyTorch (CPU and CUDA)
# the same fixed seed so that repeated runs draw the same random numbers.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
! pip install deepchem.data
import torch.nn.functional as F
from torch.nn import GRU
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from torch_geometric.loader import DataLoader
from torch_geometric.nn import NNConv, MLP, global_add_pool
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
from tqdm import tqdm
import pandas as pd
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
)
from ogb.utils import smiles2graph
from deepchem.splits import RandomSplitter
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_add_pool
from deepchem.feat import RDKitDescriptors
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import wandb
! pip install mordred
from mordred import Calculator, descriptors

In [None]:
### importing data and normalising target
# Read the bioavailability table directly from the GitHub URL below.
df = pd.read_csv('https://raw.githubusercontent.com/NinaB99/AI-for-Chemistry/main/Data/Bioavailibility.csv')

# Columns used downstream: SMILES strings, the `Category` column and the
# regression target `logK(%F)`.
smiles = df['SMILES'].tolist()
category = df['Category'].tolist()
y = df['logK(%F)'].to_numpy()

# DeepChem random splitter (only instantiated here).
splitter = RandomSplitter()

# Standardise the target to mean = 0 and std = 1. The statistics are taken
# over the whole dataset and kept in `mean` / `std`.
mean, std = y.mean(), y.std()
y = (y - mean) / std



In [None]:
### Using the Mordred calculator and preprocessing the data
# Mordred calculator over the registered descriptors, with 3D descriptors ignored.
calc = Calculator(descriptors, ignore_3D=True)

# Parse every SMILES string into an RDKit molecule, then compute its descriptors.
mols = list(map(MolFromSmiles, smiles))
Mordred_features_df = calc.pandas(mols)

# Turn the descriptor table into a float matrix (rows = molecules).
Mordred_features = Mordred_features_df.to_numpy().astype(float)

# Discard every descriptor column that contains at least one null value.
has_null = pd.isnull(Mordred_features).any(axis=0)
Mordred_features = Mordred_features[:, ~has_null]
print(f"Number of molecular descriptors without invalid values: {Mordred_features.shape[1]}")

# Discard descriptors that take the same value for every molecule.
selector = VarianceThreshold(threshold=0.0)
selector.fit(Mordred_features)
Mordred_features = selector.transform(Mordred_features)
print(f"Number of molecular descriptors after removing zero-variance features: {Mordred_features.shape[1]}")
print(Mordred_features[0])

# float32 tensor copy of the filtered descriptor matrix.
Mordred_molecular_features = torch.tensor(Mordred_features, dtype=torch.float32)

In [None]:
### splitting and preprocessing
X = Mordred_features

# 80 % of the samples go to training and 20 % to testing. `random_state` is
# fixed so the split is the same on every run. The unscaled splits are kept
# under the `*_ori` names.
X_train_ori, X_test_ori, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0)

# Learn the min/max of each feature from the training split only ...
scaler = MinMaxScaler().fit(X_train_ori)

# ... and apply that same scaling to both splits.
X_train = scaler.transform(X_train_ori)
X_test = scaler.transform(X_test_ori)

In [None]:
### coding the model training
def train_test_model(model, X_train, y_train, X_test, y_test):
    """
    Fit ``model`` on the training split and print its train and test RMSE.

    Inputs:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``
        X_train, y_train: features and targets the model is fitted on
        X_test, y_test: features and targets used for the test RMSE

    Returns nothing; both RMSE values are printed with three decimals.
    """
    model.fit(X_train, y_train)

    def rmse(features, targets):
        # Root of the mean squared error between targets and predictions.
        return mean_squared_error(targets, model.predict(features)) ** 0.5

    train_rmse = rmse(X_train, y_train)
    test_rmse = rmse(X_test, y_test)
    print(f"RMSE on train set: {train_rmse:.3f}, and test set: {test_rmse:.3f}.\n")


In [None]:
### optimising the Random Forest and XGBoost model using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid searched for both models.
param_grid = dict(
    n_estimators=[85, 90, 95, 100, 150, 200],
    max_depth=[5, 9, 13, 17, 21, 25, 29, 33, 37, 41],
)


def _fit_grid_search(estimator):
    """Run a 5-fold cross-validated grid search over `param_grid` on the training split."""
    # verbose=2 prints how long each step takes.
    search = GridSearchCV(estimator, param_grid, cv=5, verbose=2)
    return search.fit(X_train, y_train)


# --- Random Forest ---
grid_search_RF = _fit_grid_search(RandomForestRegressor(random_state=0))

# Fresh model configured with the best hyperparameters; it is fitted inside
# train_test_model.
rf_gs = RandomForestRegressor(random_state=0, **grid_search_RF.best_params_)

print('Best paramters(Random Forest): ', grid_search_RF.best_params_)
print('Random forests performance after hyperparamter optimization:')
train_test_model(rf_gs, X_train, y_train, X_test, y_test)

# --- XGBoost ---
grid_search_XBG = _fit_grid_search(XGBRegressor(random_state=0))

# Fresh model configured with the best hyperparameters; it is fitted inside
# train_test_model.
xgb_gs = XGBRegressor(random_state=0, **grid_search_XBG.best_params_)

print('Best paramters(XGBoost): ', grid_search_XBG.best_params_)
print('XGBoost performance after hyperparamter optimization:')
train_test_model(xgb_gs, X_train, y_train, X_test, y_test)

In [None]:
### evaluating the models
def _threshold_metrics(scores, labels, threshold=0.5):
    """
    Score thresholded regression outputs against binary labels.

    A sample counts as predicted-positive when its score is strictly above
    `threshold` and predicted-negative when strictly below it; a score exactly
    equal to `threshold` falls in neither group.

    Inputs:
        scores: 1-D sequence of model outputs
        labels: 1-D sequence of class labels, compared against 1 and 0
        threshold: decision threshold applied to `scores`

    Returns a dict with keys `correct`, `TP`, `FN`, `TN`, `FP`,
    `sensitivity`, `specificity` and `CCR` (mean of sensitivity and
    specificity, in percent). Sensitivity / specificity are NaN when the
    corresponding class has no samples.
    """
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)

    predicted_pos = scores > threshold
    predicted_neg = scores < threshold
    is_pos = labels == 1
    is_neg = labels == 0

    tp = int(np.sum(predicted_pos & is_pos))
    fn = int(np.sum(predicted_neg & is_pos))
    tn = int(np.sum(predicted_neg & is_neg))
    fp = int(np.sum(predicted_pos & is_neg))

    # Guard the ratios so a missing class yields NaN instead of an exception.
    sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
    specificity = tn / (tn + fp) if (tn + fp) else float("nan")

    return {
        "correct": tp + tn,
        "TP": tp, "FN": fn, "TN": tn, "FP": fp,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "CCR": ((sensitivity + specificity) / 2) * 100,
    }


# Both models were fitted on MinMax-scaled descriptors, so the descriptor
# matrix has to pass through the same fitted `scaler` before prediction.
# Predicting on the raw, unscaled values would apply the learned split
# thresholds to the wrong value range.
X_all_scaled = scaler.transform(Mordred_features)

# NOTE(review): this scores every molecule, including the ones the models were
# trained on, so the figures below are not a held-out estimate.
RF_predictions = rf_gs.predict(X_all_scaled)

XGB_predictions = xgb_gs.predict(X_all_scaled)

outputs_df = pd.DataFrame([RF_predictions,
                          XGB_predictions,
                          category])
outputs_df = outputs_df.transpose()
outputs_df = outputs_df.reset_index(drop=True)

# count correct predictions
count_total = len(outputs_df[0])

# NOTE(review): the models predict the standardised target, so the 0.5
# threshold is applied in standardised units -- confirm this matches how
# `Category` was defined.
RF_metrics = _threshold_metrics(outputs_df[0], outputs_df[2])
XGB_metrics = _threshold_metrics(outputs_df[1], outputs_df[2])

# comparing predictions with real values (fraction of correct calls)
RF_count_correct = RF_metrics["correct"]
print(RF_count_correct/count_total)

XGB_count_correct = XGB_metrics["correct"]
print(XGB_count_correct/count_total)

# True/False positives and negatives of the random forest model
RF_TP = RF_metrics["TP"]
RF_FN = RF_metrics["FN"]
RF_TN = RF_metrics["TN"]
RF_FP = RF_metrics["FP"]

# evaluating the success of the model
RF_sensitivity = RF_metrics["sensitivity"]
print(f"Sensitivity of random forest model: {RF_sensitivity}")

RF_specificity = RF_metrics["specificity"]
print(f"Specificity of random forest model: {RF_specificity}")

RF_CCR = RF_metrics["CCR"]
print(f"Correct classfication rate of random forest model: {RF_CCR}")

# True/False positives and negatives of the XGBoost model
XGB_TP = XGB_metrics["TP"]
XGB_FN = XGB_metrics["FN"]
XGB_TN = XGB_metrics["TN"]
XGB_FP = XGB_metrics["FP"]

# evaluating the success of the model
XGB_sensitivity = XGB_metrics["sensitivity"]
print(f"Sensitivity of XG Boost model: {XGB_sensitivity}")

XGB_specificity = XGB_metrics["specificity"]
print(f"Specificity of XG Boost model: {XGB_specificity}")

XGB_CCR = XGB_metrics["CCR"]
print(f"Correct classfication rate of XG Boost model: {XGB_CCR}")

print(outputs_df)