In [3]:
''' 
This code is for prediction of compounds' activity against BACE, using ECFP.
'''

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tdc import Evaluator
from sklearn.ensemble import RandomForestRegressor


class Featurizer:
    def __init__(self, y_column='pIC50', smiles_col='smiles', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)
    
    def __call__(self, df):
        raise NotImplementedError()
        

class ECFPFeaturizer(Featurizer):
    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)
    
    def __call__(self, df):
        fingerprints = []
        labels = []
        for i, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            molecule = Chem.MolFromSmiles(smiles)
            fp = AllChem.GetMorganFingerprintAsBitVect(molecule, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)        
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


def train(X_train, y_train):
    model = RandomForestRegressor(n_jobs=-1)
    model.fit(X_train, y_train)
    return model


def predict(model, X_test):
    return model.predict(X_test)


data = pd.read_csv('SpiFF/dataset/bace.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
featurizer = ECFPFeaturizer(y_column='pIC50', smiles_col = 'smiles')

rmse = Evaluator(name = 'RMSE')
mae = Evaluator(name = 'MAE')
r_squared = Evaluator(name = 'R2')

X_train, y_train = featurizer(train_data)
X_test, y_test = featurizer(test_data)

model = train(X_train, y_train)
predictions = predict(model, X_test) 
rmse_score = rmse(y_test, predictions)
mae_score = mae(y_test, predictions)
r2_score = r_squared(y_test, predictions)
print(f'RMSE, MAE, R2 = {rmse_score}, {mae_score}, {r2_score}')

predictions_df = pd.DataFrame({'pred':predictions}, index=test_data['smiles'])
predictions_df.index.name = 'smiles'
predictions_df.to_csv('BACE_ECFP_predictions.csv')


RMSE, MAE, R2 = 0.7174765611259165, 0.5228267834389282, 0.6996625257084857


In [4]:
''' 
This code is for prediction of compounds' activity against BACE, using MACCS fingerprints.
'''

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from tdc import Evaluator
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


class Featurizer:
    def __init__(self, y_column='pIC50', smiles_col='smiles', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)
    
    def __call__(self, df):
        raise NotImplementedError()

class MACCSFeaturizer(Featurizer):
    def __init__(self, y_column, **kwargs):
        super().__init__(y_column, **kwargs)
    
    def __call__(self, df):
        fingerprints = []
        labels = []
        
        for i, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            molecule = Chem.MolFromSmiles(smiles)
            fp = MACCSkeys.GenMACCSKeys(molecule)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


def train(X_train, y_train):
    model = RandomForestRegressor(n_jobs=-1)
    model.fit(X_train, y_train)
    return model


def predict(model, X_test):
    return model.predict(X_test)


data = pd.read_csv('SpiFF/dataset/bace.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
featurizer = ECFPFeaturizer(y_column='pIC50', smiles_col = 'smiles')

rmse = Evaluator(name = 'RMSE')
mae = Evaluator(name = 'MAE')
r_squared = Evaluator(name = 'R2')

X_train, y_train = featurizer(train_data)
X_test, y_test = featurizer(test_data)

model = train(X_train, y_train)
predictions = predict(model, X_test) 
rmse_score = rmse(y_test, predictions)
mae_score = mae(y_test, predictions)
r2_score = r_squared(y_test, predictions)
print(f'RMSE, MAE, R2 = {rmse_score}, {mae_score}, {r2_score}')

predictions_df = pd.DataFrame({'pred':predictions}, index=test_data['smiles'])
predictions_df.index.name = 'smiles'
predictions_df.to_csv('BACE_MACCS_predictions.csv')


RMSE, MAE, R2 = 0.7211115711853513, 0.5330287319309289, 0.6966115680624259
