In [1]:
# import necessary packages
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix
import math
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import os
import pickle
from standardiser import standardise

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# define scoring function
def score(y_test,y_pred):
    auc_roc_score = roc_auc_score(y_test, y_pred)
    y_pred_print = [round(y,0) for y in y_pred]
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred_print).ravel()
    se = tp/(tp+fn)
    sp = tn/(tn+fp)
    q = (tp + tn) / (tp + fn + tn + fp)
    mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp))
    P = tp / (tp + fp)
    F1 = (P * se * 2) / (P + se)
    BA = (se + sp) / 2
    return tp, tn, fn, fp, se, sp, mcc, q, auc_roc_score, F1, BA

In [3]:
# import machine learning model from file 'model.pickle'
with open('model.pickle', 'rb') as file:
   rf=pickle.load(file)

In [4]:
# load valid dataset
df_valid = pd.read_csv('valid.csv')
# standardize molecules in valid dataset
for i in df_valid.index:
    try:
        smi = df_valid.loc[i, 'SMILES']
        # print(smi)
        mol = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(mol)
        parent = standardise.run(mol)
        mol_ok_smi = Chem.MolToSmiles(parent)
        df_valid.loc[i, 'SMILES'] = mol_ok_smi
        # print(i, 'done')
    except standardise.StandardiseException as e:
        logging.warning(e.message)
# convert molecule representation from 'smiles' to 'Morgan' 128 bits fingerprint,and export to X_valid
X_valid = np.array([AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=128) 
              for smi in list(df_valid.iloc[:, 0])])
# export 'LABEL' to y_valid dataset
y_valid = df_valid['Lable'].values

[14:48:26] Explicit valence for atom # 0 O, 3, is greater than permitted
[14:48:26] Explicit valence for atom # 0 O, 3, is greater than permitted


In [5]:
# calculate performance of the model in valid set
y_valid_pred = rf.predict(X_valid)
score(y_valid,y_valid_pred)

(3,
 12,
 4,
 0,
 0.42857142857142855,
 1.0,
 0.5669467095138409,
 0.7894736842105263,
 0.7142857142857143,
 0.6,
 0.7142857142857143)

In [6]:
y_valid

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [7]:
y_valid_pred

array([0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])