<a href="https://colab.research.google.com/github/ShubhangiSRG24/ML_compound-classification/blob/main/Compound_Classification_ML_ChemoInformatics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## My implementation for compound classification

# Compound Classification

Random Forest with the Morgan fingerprint as our feature vector.

In [None]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.2


In [None]:
import numpy as np
import pandas as pd
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

## Data

Let's load the compound data file.

In [None]:
# Read the compound table from 'cmpd.csv' (path is relative to the working directory).
cmpd_df = pd.read_csv('cmpd.csv')
# Preview the first rows; as the last expression of the cell it is displayed.
cmpd_df.head()

Unnamed: 0,inchikey,smiles,group,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active


In [None]:
# Dimensions of the loaded table as (rows, columns).
cmpd_df.shape

(5530, 4)

There are 5530 compound samples with:
* SMILES - 2D compound structure,
* InChIKey - a hash from InChI,
* group - a tag to split the dataset into train and test
* activity - y label

In [None]:
# Parse every SMILES string with RDKit and store the result in a new 'mol' column.
cmpd_df['mol'] = cmpd_df['smiles'].map(Chem.MolFromSmiles)

In [None]:
# with minimal modification, we obtain the fingerprint vector using RDKit

def get_Xy(df, radius=4, n_bits=2048):
    """Build the fingerprint feature matrix and binary label vector for a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``mol`` column (molecule objects accepted by
        ``AllChem.GetMorganFingerprintAsBitVect``) and an ``activity`` column.
    radius : int, default 4
        Radius passed to the Morgan fingerprint call.
    n_bits : int, default 2048
        Length of the fingerprint bit vector (``nBits``).

    Returns
    -------
    X : numpy.ndarray of shape (len(df), n_bits)
        One row of fingerprint bits per molecule; an empty table yields a
        (0, n_bits) array instead of an error.
    y : numpy.ndarray of float, shape (len(df),)
        1.0 where ``activity == 'active'``, 0.0 for every other value.

    Raises
    ------
    ValueError
        If any entry of ``df.mol`` is ``None``.
    """
    mols = list(df.mol)

    # Fail early with a readable message rather than an opaque error from RDKit.
    missing = sum(mol is None for mol in mols)
    if missing:
        raise ValueError(
            f"{missing} entr{'y' if missing == 1 else 'ies'} in 'mol' "
            "are None; cannot compute fingerprints"
        )

    bit_rows = [
        list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
        for mol in mols
    ]
    # The reshape keeps the result two-dimensional even when there are no rows.
    X = np.array(bit_rows, dtype=int).reshape(len(mols), n_bits)
    y = df.activity.eq('active').astype(float).to_numpy()
    return X, y

In [None]:
# Partition the compounds by their 'group' tag, then featurise each partition.
train_rows = cmpd_df.loc[cmpd_df['group'] == 'train']
test_rows = cmpd_df.loc[cmpd_df['group'] == 'test']

X_train, y_train = get_Xy(train_rows)
X_test, y_test = get_Xy(test_rows)

# Random Forest

Random Forest (RF) is a simple baseline classifier for numerical feature vectors that needs little tuning.

In [None]:
# Fix the forest's seed so the fitted model, and every metric computed from it
# in the following cells, is reproducible after "Restart & Run All".
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
# Score of the fitted classifier on the test split (displayed as the cell output).
clf.score(X_test, y_test)

0.8615582743077914

In [None]:
# Keep column index 1 of the predicted class probabilities; the metric cells
# below use it as the score for label 1.
class_probabilities = clf.predict_proba(X_test)
y_pred = class_probabilities[:, 1]

In [None]:
# Log loss of the predicted probabilities against the test labels
metrics.log_loss(
    y_true=y_test,
    y_pred=y_pred,
    labels=[0, 1],
)

0.42520158976799344

In [None]:
# AUC PRC: area under the precision-recall curve for label 1
pr_points = metrics.precision_recall_curve(y_test, y_pred, pos_label=1)
precision, recall = pr_points[0], pr_points[1]  # thresholds (index 2) are not needed
metrics.auc(recall, precision)

0.8798927782587836

In [None]:
# AUC ROC: area under the ROC curve for label 1
roc_points = metrics.roc_curve(y_test, y_pred, pos_label=1)
fpr_roc, tpr_roc = roc_points[0], roc_points[1]  # thresholds (index 2) are not needed
metrics.auc(fpr_roc, tpr_roc)

0.8904297473028224

## Implementation: a simple neural network model for binary classification

In [None]:
# Install required libraries
!pip install torch rdkit-pypi pandas scikit-learn torch-scatter

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.1.2-cp310-cp310-linux_x86_64.whl size=495091 sha256=41ae9d472f693a738b98d828a04f607ed285ec469607360bd41105cf2e319f04
  Stored in directory: /root/.cache/pip/wheels/92/f1/2b/3b46d54b134259f58c8363568569053248040859b1a145b3ce
Successfully built torch-scatter
Installing collected packages: torch-scatter, rdkit-pypi
Su

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch_scatter import scatter_add
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.metrics import accuracy_score

# Read the compound table used by the neural-network experiments
df = pd.read_csv('cmpd.csv')

# Report the table dimensions, then the number of rows per 'activity' value
print(df.shape, df['activity'].value_counts(), sep='\n')


(5530, 4)
active          2704
inactive        1886
unknown          599
intermediate     341
Name: activity, dtype: int64


In [None]:
# Preview the first rows of the table loaded for the neural-network section.
df.head()

Unnamed: 0,inchikey,smiles,group,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active


In [None]:
def _morgan_bits(mol):
    """Return the 2048-bit, radius-4 Morgan fingerprint of ``mol`` as a list of bits."""
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048))


# Parse each SMILES string into an RDKit molecule
df['mol'] = df['smiles'].map(Chem.MolFromSmiles)

# Molecular weight of every molecule
df['molecular_weight'] = df['mol'].map(Descriptors.MolWt)

# Morgan fingerprint of every molecule
df['morgan_fingerprint'] = df['mol'].map(_morgan_bits)

In [None]:
# Molecular weight as a single-column float tensor
weight_values = df['molecular_weight'].to_numpy()
X_molecular_weight = torch.tensor(weight_values, dtype=torch.float32).reshape(-1, 1)

# Fingerprint bit lists stacked into one row per compound
fingerprint_rows = np.vstack(df['morgan_fingerprint'])
X_morgan_fingerprint = torch.tensor(fingerprint_rows, dtype=torch.float32)

# Combined features: the weight column followed by the fingerprint columns
X_combined = torch.cat((X_molecular_weight, X_morgan_fingerprint), dim=1)

print(X_combined.shape, X_morgan_fingerprint.shape, sep='\n')

torch.Size([5530, 2049])
torch.Size([5530, 2048])


In [None]:
# Binary target: 1 where 'activity' equals 'active', 0 for every other value
is_active = df['activity'].eq('active')
df['label'] = is_active.astype('int64')
y = torch.tensor(df['label'].to_numpy(), dtype=torch.float32)

print(y.shape)

torch.Size([5530])


## Neural network implementation with two features: the compound's Morgan fingerprint and molecular weight

In [None]:
# Random 80/20 split of the combined features and labels (fixed seed).
# NOTE(review): this split ignores the 'group' column that the random-forest
# section uses, so the two sections are evaluated on different test sets - confirm
# this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two-layer fully connected network used as the classifier
class NeuralNetwork(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        output_size: number of sigmoid outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of feature rows to sigmoid outputs in (0, 1)."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))

# Seed PyTorch so the weight initialisation, and therefore the losses and
# metrics printed below, is reproducible on a fresh kernel.
torch.manual_seed(42)

# Instantiate the NN model
input_size = X_combined.shape[1]  # number of columns in the combined feature tensor
hidden_size = 64  # width of the hidden layer; adjust as needed
output_size = 1  # binary classification: a single sigmoid output
nn_model = NeuralNetwork(input_size, hidden_size, output_size)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Targets reshaped once to a column so they match the (n_samples, 1) model output.
train_targets = y_train.view(-1, 1)

# NOTE(review): the first input column is the raw molecular weight while the
# remaining columns are 0/1 bits; the recorded run shows the loss rising between
# epochs 1 and 2, so standardising that column may be worth trying - confirm.

# Training loop: full-batch gradient descent over the whole training split
num_epochs = 100
for epoch in range(num_epochs):
    # Forward pass and loss
    outputs = nn_model(X_train)
    loss = criterion(outputs, train_targets)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print training information
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# After training, predict probabilities for the test set
nn_model.eval()
with torch.no_grad():
    # squeeze(1) removes only the output axis, so a one-row test set still
    # produces a 1-D array instead of a 0-d scalar.
    y_pred_prob = nn_model(X_test).squeeze(1).numpy()

# Calculate evaluation metrics
y_true = y_test.numpy()
aucprc = average_precision_score(y_true, y_pred_prob)
aucroc = roc_auc_score(y_true, y_pred_prob)
# labels=[0, 1] matches the random-forest log-loss cell and keeps the call valid
# when the test split contains a single class. `loss` is rebound from the
# training-loss tensor to this float, as in the original cell.
loss = log_loss(y_true, y_pred_prob, labels=[0, 1])

# Print evaluation metrics
print(f'AUCPRC: {aucprc:.4f}')
print(f'AUCROC: {aucroc:.4f}')
print(f'Log Loss: {loss:.4f}')

Epoch [1/100], Loss: 0.6876
Epoch [2/100], Loss: 0.7802
Epoch [3/100], Loss: 0.6609
Epoch [4/100], Loss: 0.6793
Epoch [5/100], Loss: 0.6964
Epoch [6/100], Loss: 0.6603
Epoch [7/100], Loss: 0.6173
Epoch [8/100], Loss: 0.6047
Epoch [9/100], Loss: 0.6140
Epoch [10/100], Loss: 0.6126
Epoch [11/100], Loss: 0.5919
Epoch [12/100], Loss: 0.5672
Epoch [13/100], Loss: 0.5543
Epoch [14/100], Loss: 0.5536
Epoch [15/100], Loss: 0.5531
Epoch [16/100], Loss: 0.5441
Epoch [17/100], Loss: 0.5288
Epoch [18/100], Loss: 0.5152
Epoch [19/100], Loss: 0.5087
Epoch [20/100], Loss: 0.5067
Epoch [21/100], Loss: 0.5031
Epoch [22/100], Loss: 0.4949
Epoch [23/100], Loss: 0.4843
Epoch [24/100], Loss: 0.4758
Epoch [25/100], Loss: 0.4712
Epoch [26/100], Loss: 0.4683
Epoch [27/100], Loss: 0.4638
Epoch [28/100], Loss: 0.4570
Epoch [29/100], Loss: 0.4495
Epoch [30/100], Loss: 0.4437
Epoch [31/100], Loss: 0.4400
Epoch [32/100], Loss: 0.4367
Epoch [33/100], Loss: 0.4321
Epoch [34/100], Loss: 0.4265
Epoch [35/100], Loss: 0

## Neural network implementation with a single feature: the compound's Morgan fingerprint

In [None]:
# Random 80/20 split of the fingerprint-only features and labels (fixed seed)
Xmf_train, Xmf_test, ymf_train, ymf_test = train_test_split(
    X_morgan_fingerprint,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the network architecture (a plain fully connected network, not a graph neural network)
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier with one hidden ReLU layer and a sigmoid output."""

    def __init__(self, input_size, hidden_size, output_size):
        nn.Module.__init__(self)
        # input -> hidden projection
        self.fc1 = nn.Linear(input_size, hidden_size)
        # hidden -> output projection
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return sigmoid(fc2(relu(fc1(x)))) for a batch ``x``."""
        return self.fc2(self.fc1(x).relu()).sigmoid()

# Seed PyTorch so the weight initialisation, and therefore the losses and
# metrics printed below, is reproducible on a fresh kernel.
torch.manual_seed(42)

# Instantiate the model (a fully connected network on fingerprint bits only)
input_size = X_morgan_fingerprint.shape[1]  # number of fingerprint columns
hidden_size = 64  # width of the hidden layer; adjust as needed
output_size = 1  # binary classification: a single sigmoid output
nn_model = NeuralNetwork(input_size, hidden_size, output_size)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Targets reshaped once to a column so they match the (n_samples, 1) model output.
train_targets = ymf_train.view(-1, 1)

# Training loop: full-batch gradient descent over the fingerprint training split
num_epochs = 100
for epoch in range(num_epochs):
    # Forward pass and loss
    outputs = nn_model(Xmf_train)
    loss = criterion(outputs, train_targets)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print training information
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# After training, predict probabilities for the fingerprint test split
nn_model.eval()
with torch.no_grad():
    # squeeze(1) removes only the output axis, so a one-row test set still
    # produces a 1-D array instead of a 0-d scalar.
    y_pred_prob = nn_model(Xmf_test).squeeze(1).numpy()

# Calculate evaluation metrics against the labels of THIS split (ymf_test).
# The previous code used y_test, which belongs to the combined-feature experiment.
y_true = ymf_test.numpy()
aucprc = average_precision_score(y_true, y_pred_prob)
aucroc = roc_auc_score(y_true, y_pred_prob)
# labels=[0, 1] matches the random-forest log-loss cell and keeps the call valid
# when the test split contains a single class. `loss` is rebound from the
# training-loss tensor to this float, as in the original cell.
loss = log_loss(y_true, y_pred_prob, labels=[0, 1])

# Print evaluation metrics
print(f'AUCPRC: {aucprc:.4f}')
print(f'AUCROC: {aucroc:.4f}')
print(f'Log Loss: {loss:.4f}')


torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [1/100], Loss: 0.6913
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [2/100], Loss: 0.6695
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [3/100], Loss: 0.6479
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [4/100], Loss: 0.6248
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [5/100], Loss: 0.6009
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [6/100], Loss: 0.5770
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [7/100], Loss: 0.5538
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [8/100], Loss: 0.5316
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [9/100], Loss: 0.5102
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [10/100], Loss: 0.4899
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [11/100], Loss: 0.4707
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [12/100], Loss: 0.4528
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [13/100], Loss: 0.4361
torch.Size([4424, 1]) torch.Size([4424, 1])
Epoch [14/100], 