In [1]:
# import necessary packages
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix
import math
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import os
import pickle
from standardiser import standardise

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# define scoring function
def score(y_test,y_pred):
    auc_roc_score = roc_auc_score(y_test, y_pred)
    y_pred_print = [round(y,0) for y in y_pred]
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred_print).ravel()
    se = tp/(tp+fn)
    sp = tn/(tn+fp)
    q = (tp + tn) / (tp + fn + tn + fp)
    mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp))
    P = tp / (tp + fp)
    F1 = (P * se * 2) / (P + se)
    BA = (se + sp) / 2
    return tp, tn, fn, fp, se, sp, mcc, q, auc_roc_score, F1, BA

In [3]:
# load washed dataset
df = pd.read_csv('PIM1_data_washed.csv')

In [4]:
# convert molecule representation from 'smiles' to 'Morgan' 128 bits fingerprint,and export to X dataset
X = np.array([AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=128) 
              for smi in list(df.iloc[:, 0])])

In [5]:
# export 'LABEL' to y dataset
y = df['LABEL'].values

In [6]:
# split dataset to training set and test set with method train_test_split, training set/test set = 8:2 
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8,random_state=0)

In [7]:
#oversample positive or negtive samples if the dataset is very imbalance
#smote = SMOTE(random_state=1)
#X_train, y_train = smote.fit_resample(X_train, y_train)

In [8]:
# define algorithm
rf = RandomForestClassifier(n_estimators=100)

In [9]:
# train model by fitting X_train with y_train
rf.fit(X_train,y_train)

RandomForestClassifier()

In [10]:
# calculate performance of the model in training set
y_pred = rf.predict(X_train)
score(y_train,y_pred)

(753,
 689,
 1,
 1,
 0.9986737400530504,
 0.9985507246376811,
 0.9972244646907316,
 0.9986149584487535,
 0.9986122323453658,
 0.9986737400530504,
 0.9986122323453658)

In [11]:
# calculate performance of the model in test set
y_pred = rf.predict(X_test)
score(y_test,y_pred)

(161,
 144,
 29,
 28,
 0.8473684210526315,
 0.8372093023255814,
 0.6843997355751965,
 0.8425414364640884,
 0.8422888616891064,
 0.8496042216358839,
 0.8422888616891064)

In [12]:
# export machine learning model to file 'model.pickle'
with open('model.pickle', 'wb') as file:
    pickle.dump(rf, file)

In [13]:
# load valid dataset
df_valid = pd.read_csv('valid.csv')

In [14]:
# standardize molecules in valid dataset
for i in df_valid.index:
    try:
        smi = df_valid.loc[i, 'SMILES']
        # print(smi)
        mol = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(mol)
        parent = standardise.run(mol)
        mol_ok_smi = Chem.MolToSmiles(parent)
        df_valid.loc[i, 'SMILES'] = mol_ok_smi
        # print(i, 'done')
    except standardise.StandardiseException as e:
        logging.warning(e.message)

[14:43:50] Explicit valence for atom # 0 O, 3, is greater than permitted
[14:43:50] Explicit valence for atom # 0 O, 3, is greater than permitted


In [15]:
# convert molecule representation from 'smiles' to 'Morgan' 128 bits fingerprint,and export to X_valid
X_valid = np.array([AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=128) 
              for smi in list(df_valid.iloc[:, 0])])

In [16]:
# export 'LABEL' to y_valid dataset
y_valid = df_valid['Lable'].values

In [17]:
# calculate performance of the model in valid set
y_valid_pred = rf.predict(X_valid)
score(y_valid,y_valid_pred)

(3,
 12,
 4,
 0,
 0.42857142857142855,
 1.0,
 0.5669467095138409,
 0.7894736842105263,
 0.7142857142857143,
 0.6,
 0.7142857142857143)

In [18]:
y_valid

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [19]:
y_valid_pred

array([0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])