In [4]:
'''
This code is for prediction of compounds' activity to inhibit HIV replication, using ECFP
'''
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Evaluator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


class Featurizer:
    def __init__(self, y_column='HIV_active', smiles_col='smiles', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)
    
    def __call__(self, df):
        raise NotImplementedError()
        

class ECFPFeaturizer(Featurizer):
    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)
    
    def __call__(self, df):
        fingerprints = []
        labels = []
        for i, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


def train(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model


def predict(model, X_test):
    return model.predict(X_test)


data = pd.read_csv('SpiFF/dataset/HIV.csv')
featurizer = ECFPFeaturizer(y_column='HIV_active', smiles_col='smiles')

X, y = featurizer(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


roc_auc = Evaluator(name = 'ROC AUC')
accuracy = Evaluator(name = 'Accuracy')



model = train(X_train, y_train)
predictions = predict(model, X_test)  

roc_auc_score = roc_auc(y_test, predictions)
accuracy_score = accuracy(y_test, predictions)
print(f'ROC AUC, accuracy = {roc_auc_score}, {accuracy_score}')



ROC AUC, accuracy = 0.6258228183624241, 0.9696085582300025


In [6]:
'''
This code is for prediction of compounds' activity to inhibit HIV replication, using MACCS
'''
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Evaluator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


class Featurizer:
    def __init__(self, y_column='HIV_active', smiles_col='smiles', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)
    
    def __call__(self, df):
        raise NotImplementedError()

class MACCSFeaturizer(Featurizer):
    def __init__(self, y_column, **kwargs):
        super().__init__(y_column, **kwargs)
    
    def __call__(self, df):
        fingerprints = []
        labels = []
        for i, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            molecule = Chem.MolFromSmiles(smiles)
            fp = MACCSkeys.GenMACCSKeys(molecule)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


def train(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model


def predict(model, X_test):
    return model.predict(X_test)
    
data = pd.read_csv('SpiFF/dataset/HIV.csv')
featurizer = ECFPFeaturizer(y_column='HIV_active', smiles_col='smiles')

X, y = featurizer(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


roc_auc = Evaluator(name = 'ROC AUC')
accuracy = Evaluator(name = 'Accuracy')



model = train(X_train, y_train)
predictions = predict(model, X_test)  

roc_auc_score = roc_auc(y_test, predictions)
accuracy_score = accuracy(y_test, predictions)
print(f'ROC AUC, accuracy = {roc_auc_score}, {accuracy_score}')



ROC AUC, accuracy = 0.6258228183624241, 0.9696085582300025
