<a href="https://colab.research.google.com/github/UAMCAntwerpen/2040FBDBIC/blob/master/Topic_02/Clustering_and_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install RDKit

In [None]:
!pip install rdkit mols2grid requests

In [None]:
# RDKit chemistry
from rdkit import Chem

# RDKit drawing
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
IPythonConsole.ipython_useSVG = True
rdDepictor.SetPreferCoordGen(True)

# Library to display molecules in a grid
import mols2grid

# Library to download files
import requests

## Linear path-based FP's

In [None]:
from rdkit.Chem import rdFingerprintGenerator

mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")

# Fingerprint the same molecule at three bit-vector lengths and report,
# for each length, how many bits end up set.
for fp_size in (10, 100, 1024):
  rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=fp_size)
  fp = rdkgen.GetFingerprint(mol)
  n_on = fp.GetNumOnBits()
  n_total = fp.GetNumBits()
  print(n_on, "bits ON out of the", n_total, "bits in total")

# Bare expression kept last so the notebook renders the structure.
mol

In [None]:
mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")

# Only the maximum path length changes between iterations; the fingerprint
# size is whatever the generator uses by default.
for max_path_length in (1, 3, 5, 7):
  rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=max_path_length)
  fp = rdkgen.GetFingerprint(mol)
  n_on = fp.GetNumOnBits()
  n_total = fp.GetNumBits()
  print(n_on, "bits ON out of the", n_total, "bits in total")

# Bare expression kept last so the notebook renders the structure.
mol

## Circular FP's (ECFP)

In [None]:
mol = Chem.MolFromSmiles("O1CC(=O)NC1")

# Morgan fingerprints of fixed length (1024 bits) for radii 1 through 7.
for radius in range(1, 8):
  mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=1024)
  fp = mfpgen.GetFingerprint(mol)
  n_on = fp.GetNumOnBits()
  n_total = fp.GetNumBits()
  print("Radius", radius, ":", n_on, "bits ON out of the", n_total, "bits in total")

# Bare expression kept last so the notebook renders the structure.
mol

## MACCS keys

In [None]:
from rdkit.Chem import MACCSkeys

mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")
fp = MACCSkeys.GenMACCSKeys(mol)

# Collect the indices of the set bits once; they are both counted and listed.
on_bits = list(fp.GetOnBits())
print(len(on_bits), "bits ON out of the", len(fp), "bits in total")
print(on_bits)

# Bare expression kept last so the notebook renders the structure.
mol

## Calculating similarity

In [None]:
from rdkit import DataStructs

# A deliberately short fingerprint (50 bits) so the bit strings fit on one line.
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=50)

mol1 = Chem.MolFromSmiles("CCOC")
mol2 = Chem.MolFromSmiles("CCO")
fp1 = rdkgen.GetFingerprint(mol1)
fp2 = rdkgen.GetFingerprint(mol2)

# Print both bit strings (first molecule, then second) for visual comparison.
for bits in (fp1, fp2):
  print(bits.ToBitString())

# FingerprintSimilarity is called without an explicit metric argument.
tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
print(tanimoto)

In [None]:
smiles = ["CO", "CCCO", "CCCOCCC"]
mols = [Chem.MolFromSmiles(s) for s in smiles]

rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=50)
fps = [rdkgen.GetFingerprint(m) for m in mols]
ref = rdkgen.GetFingerprint(Chem.MolFromSmiles("CCCO"))

# Score every molecule against the reference twice, with the two Tversky
# weights swapped, and separate the two result lists by an empty line.
for pass_index, (alpha, beta) in enumerate(((0.1, 0.9), (0.9, 0.1))):
  if pass_index > 0:
    print()
  for fp in fps:
    tversky = DataStructs.TverskySimilarity(ref, fp, alpha, beta)
    print("%.2f" % tversky)


## Similarity in practice

In this exercise, a file with 10,000 compounds (SMILES format) is downloaded from the UAMC GitHub repo, and then a similarity search is performed to identify the compound that is most similar to aspirin ("CC(=O)OC1=CC=CC=C1C(=O)O")

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/Compounds_10k.smi"

# Fail loudly on a hung connection or an HTTP error instead of treating an
# error page as compound data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both "\n" and "\r\n"; blank lines (such as the empty
# entry that a trailing newline produces with split("\n")) are dropped so the
# count below is the number of compounds.
smiles = [line for line in response.text.splitlines() if line.strip()]
print(len(smiles))

Aspirin is the query molecule:

In [None]:
# Build the query molecule first, then the generator that fingerprints it.
# The same generator (rdkgen) is reused for the library compounds in the
# similarity loop, so query and library share one fingerprint definition.
aspirin = Chem.MolFromSmiles("CC(=O)OC1=CC=CC=C1C(=O)O")
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)
query = rdkgen.GetFingerprint(aspirin)

# Bare expression kept last so the notebook renders the structure.
aspirin

Loop over all molecules and keep the one with the highest similarity to aspirin:

In [None]:
# Linear scan over the library: remember the SMILES with the highest
# similarity to the aspirin query fingerprint.
max_sim = 0.0
best_hit = ""
for s in smiles:
  # Skip blank entries (e.g. the empty string left by a trailing newline).
  if not s.strip():
    continue
  mol = Chem.MolFromSmiles(s)
  # MolFromSmiles returns None for SMILES it cannot parse; fingerprinting
  # None would raise and abort the whole search.
  if mol is None:
    continue
  fp = rdkgen.GetFingerprint(mol)
  tanimoto = DataStructs.FingerprintSimilarity(fp, query)
  print(tanimoto)
  # ">=" means that on a tie the later compound replaces the earlier one.
  if tanimoto >= max_sim:
    max_sim = tanimoto
    best_hit = s

Show the best molecule:

In [None]:
# Report the best similarity value, then rebuild the winning molecule from
# its SMILES so the notebook can draw it.
print(max_sim)
mol = Chem.MolFromSmiles(best_hit)

# Bare expression kept last so the notebook renders the structure.
mol

Note: you can also use other similarity metrics, such as the Tversky coefficient. Try this out to see how the results change.

## Maximal common substructure (MCSS)

In [None]:
from rdkit.Chem import rdFMCS

# SMILES in the order: morphine, codeine, heroine.
opiate_smiles = (
  "CN1CC[C@]23C4=C5C=CC(O)=C4O[C@H]2[C@H](C=C[C@H]3[C@H]1C5)O",
  "CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(O[C@H]3[C@@H](O)C=C4)=C(OC)C=C5",
  "CN([C@H](CC(C=C1)=C23)[C@@H]4C=C[C@@H]5OC(C)=O)CC[C@]43[C@H]5OC2=C1OC(C)=O",
)
morphine, codeine, heroine = [Chem.MolFromSmiles(s) for s in opiate_smiles]

mols = [morphine, codeine, heroine]
mcss = rdFMCS.FindMCS(mols)

# The result carries the common substructure as a SMARTS string; turning it
# into a molecule as the final expression lets the notebook draw it.
Chem.MolFromSmarts(mcss.smartsString)

## Clustering

Read in six molecules:

In [None]:
smiles = ["c1ccccc1", "c1cccnc1", "c1ncncc1", "C1CC1", "CC=O", "NCC"]
mols = list(map(Chem.MolFromSmiles, smiles))

# Very short (20-bit) Morgan fingerprints so they can be printed in full later.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(fpSize=20)
fps = list(map(mfpgen.GetFingerprint, mols))

# Draw all molecules in a grid, three per row.
grid = Draw.MolsToGridImage(mols, molsPerRow=3)
display(grid)

Show their fingerprints:

In [None]:
import numpy as np

# One line per molecule: the bit string followed by the SMILES it came from.
for bits, smi in zip(fps, smiles):
  print("%s %s" % (bits.ToBitString(), smi))

Convert the fingerprints to a format that is usable by a clustering algorithm:

In [None]:
# Turn each fingerprint into a NumPy array, then combine them into one array
# (X) that is passed to the clustering estimators.
nps = list(map(np.array, fps))
X = np.array(nps)
print(X)

Perform a hierarchical clustering:

In [None]:
import sklearn
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering of the fingerprint matrix into three clusters;
# fit() returns the estimator itself, so construction and fitting are chained.
clusterEngine = AgglomerativeClustering(n_clusters=3).fit(X)

# Use each molecule's cluster label as its legend in the grid drawing.
labels = list(map(str, clusterEngine.labels_))
grid = Draw.MolsToGridImage(mols, molsPerRow=3, legends=labels)
display(grid)

And now non-hierarchical clustering (k-means):

In [None]:
from sklearn.cluster import KMeans

# k-means with three clusters on the same matrix. No random_state is passed,
# so cluster numbering (and possibly membership) may differ between runs.
clusterEngine = KMeans(n_clusters=3).fit(X)

# Use each molecule's cluster label as its legend in the grid drawing.
labels = list(map(str, clusterEngine.labels_))
grid = Draw.MolsToGridImage(mols, molsPerRow=3, legends=labels)
display(grid)

In [None]:
# Read the cluster centres from the most recently fitted clusterEngine
# (the KMeans model when the cells are run in order) and print them.
centers = clusterEngine.cluster_centers_
print(centers)

# Machine learning: QSAR models

## An example of a simple model: linear regression

Read in a dataset of DPP4 inhibitors with corresponding pIC50 inhibition constants:

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/dpp4.pIC50.txt"

# Fail loudly on a hung connection or an HTTP error instead of treating an
# error page as data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both "\n" and "\r\n"; blank lines are dropped.
data = [line for line in response.text.splitlines() if line.strip()]
if not data:
  raise ValueError(f"No records found in {url}")
print(data[0])

Split into smiles, mols, fps and pIC50:

In [None]:
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)

# Four parallel lists: entry i of each list describes the same compound.
mols = []
smiles = []
fps = []
pic50 = []
for d in data:
  fields = d.split()
  # Both a SMILES (fields[0]) and a value (fields[1]) are read below, so a
  # record needs at least two whitespace-separated fields.
  if len(fields) < 2: continue
  value = float(fields[1])
  mol = Chem.MolFromSmiles(fields[0])
  # MolFromSmiles returns None for SMILES it cannot parse; skip such records
  # before anything is appended so the four lists stay aligned.
  if mol is None: continue
  # ConvertToNumpyArray fills (and resizes) the array passed to it.
  fp = np.zeros((0,), dtype=np.int8)
  DataStructs.ConvertToNumpyArray(rdkgen.GetFingerprint(mol), fp)
  smiles.append(fields[0])
  pic50.append(value)
  mols.append(mol)
  fps.append(fp)

# Quick sanity check: first record, value range and number of compounds.
print(smiles[0])
print(pic50[0])
print(fps[0])
print(max(pic50))
print(min(pic50))
print(len(smiles))

Create a training set (70%) and a test set (30%):

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed. train_test_split returns train and test
# parts for each input, in input order: pic50 first, then fps.
split = train_test_split(pic50, fps, test_size=0.3, random_state=42)
pic50_train, pic50_test, fps_train, fps_test = split
print(len(pic50_train), len(pic50_test))

Train a linear regression model:

In [None]:
from sklearn import linear_model

# Ordinary least-squares fit of pIC50 on the fingerprint bits; fit() returns
# the estimator, so construction and fitting are chained.
model = linear_model.LinearRegression().fit(fps_train, pic50_train)

# One coefficient per fingerprint bit.
print(model.coef_)

Apply the trained model on the test set and compare the predicted values with the experimental ones:

In [None]:
# Predict pIC50 for the held-out fingerprints with the current `model`
# and print the raw predictions.
pic50_pred = model.predict(fps_test)
print(pic50_pred)

Validate the model by calculating the MSE of the predictions when compared to the true values:

In [None]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Mean squared error between experimental and predicted test-set values.
mse = mean_squared_error(pic50_test, pic50_pred)
print("MSE = ", mse)

# Scatter plot: experimental values on x, predictions on y.
plt.plot(pic50_test, pic50_pred, '.')
plt.xlabel("True values")
plt.ylabel("Predicted values")

Repeat the test/train splitting a number of times in order to get statistics:

In [None]:
# Five unseeded 70/30 splits: refit the current `model` on each one and
# report the test-set MSE to see how much it varies between splits.
for i in range(5):
  split = train_test_split(pic50, fps, test_size=0.3)
  pic50_train, pic50_test, fps_train, fps_test = split
  pic50_pred = model.fit(fps_train, pic50_train).predict(fps_test)
  mse = mean_squared_error(pic50_test, pic50_pred)
  print("MSE = ", mse)

## A more complicated model: neural networks

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron trained on whatever train/test split is currently
# held in fps_train / pic50_train (the last split made by an earlier cell).
model = MLPRegressor(random_state=1, max_iter=500).fit(fps_train, pic50_train)
pic50_pred = model.predict(fps_test)

mse = mean_squared_error(pic50_test, pic50_pred)
print("MSE =", mse)

# Scatter plot: experimental values on x, predictions on y.
plt.plot(pic50_test, pic50_pred, '.')
plt.xlabel("True values")
plt.ylabel("Predicted values")

Repeat the test/train splitting a number of times in order to get statistics:

In [None]:
# Three unseeded 70/30 splits: refit the current `model` on each one and
# report the test-set MSE.
for i in range(3):
  split = train_test_split(pic50, fps, test_size=0.3)
  pic50_train, pic50_test, fps_train, fps_test = split
  pic50_pred = model.fit(fps_train, pic50_train).predict(fps_test)
  mse = mean_squared_error(pic50_test, pic50_pred)
  print("MSE =", mse)

Now build a model using the entire dataset and save it for later use:

In [None]:
# Final regressor kept under its own name so that later cells can reuse it.
# NOTE(review): no random_state is passed here (unlike the MLP above), so the
# fitted model is presumably not reproducible between runs - confirm intent.
pic50_predictor = MLPRegressor(max_iter=500)
# Fit on all fingerprints and pIC50 values (no hold-out set).
pic50_predictor.fit(fps, pic50)

## Another model: random forest

Load a DPP4 dataset with actives and non-actives (classification model):

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/dpp4.classified.txt"

# Fail loudly on a hung connection or an HTTP error instead of treating an
# error page as data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both "\n" and "\r\n"; blank lines are dropped.
data = [line for line in response.text.splitlines() if line.strip()]
if not data:
  raise ValueError(f"No records found in {url}")
print(data[0])

Generate fingerprints and make a list of all the activities:

In [None]:
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)

# Text label in the second column -> integer class used by the classifier.
label_to_class = {"ACTIVE": 1, "INACTIVE": 0}

# Two parallel lists: activities[i] is the class of the compound behind fps[i].
activities = []
fps = []
for d in data:
  fields = d.split()
  # Blank or whitespace-only lines, and records without a label column.
  if len(fields) < 2: continue
  activity = label_to_class.get(fields[1])
  # A label that is neither ACTIVE nor INACTIVE must not contribute a
  # fingerprint either, otherwise the two lists drift out of step.
  if activity is None: continue
  mol = Chem.MolFromSmiles(fields[0])
  # MolFromSmiles returns None for SMILES it cannot parse.
  if mol is None: continue
  # ConvertToNumpyArray fills (and resizes) the array passed to it.
  fp = np.zeros((0,), dtype=np.int8)
  DataStructs.ConvertToNumpyArray(rdkgen.GetFingerprint(mol), fp)
  activities.append(activity)
  fps.append(fp)

# Both lengths must be equal.
print(len(activities), len(fps))

Let's count how many actives and inactives:

In [None]:
# Class balance: 1 marks an active, 0 an inactive.
n_actives = activities.count(1)
n_inactives = activities.count(0)
print(f'Actives: {n_actives}')
print(f'Inactives: {n_inactives}')

Train a random forest model to predict activity:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Unseeded 70/30 split of the classification data (activities first, then
# fingerprints, matching the order of the returned parts).
split = train_test_split(activities, fps, test_size=0.3)
act_train, act_test, fps_train, fps_test = split

# Shallow forest (depth 2) as a first baseline. The fit call is left as the
# final expression so the notebook still shows the fitted estimator.
model = RandomForestClassifier(max_depth=2)
model.fit(fps_train, act_train)

Calculate the accuracy of the generated model:

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test-set compounds whose predicted class equals the true one.
prediction = model.predict(fps_test)
accuracy = accuracy_score(act_test, prediction)
print(accuracy)

**Precision versus recall**

The **precision** metric is defined by TP / (TP + FP). A high precision is therefore obtained when the model **reduces the number of false positives**. A model with a high precision is useful in a situation where one wants to be sure that, if the model predicts a compound to be active, the compound is really active, however at the expense of missing some other actives. This might be useful when one wants to purchase compounds with a limited budget.

The **recall** metric is defined by TP / (TP + FN). A high recall is therefore obtained when the model **reduces the number of false negatives**. A model with a high recall is therefore useful in a situation where one wants to be sure that **all** active compounds in a database will be purchased, however at the expense of buying also compounds which are not active. This might be useful when one can purchase compounds with an unlimited budget.

Let us make a random forest model with a **high precision**. We can do this by tweaking some of the parameters of the random forest classifier. The main parameters to adjust when using this method are *n_estimators* and *max_features*. The former is the number of trees in the forest: the larger the better, but also the longer it will take to compute. The latter is the size of the random subsets of features to consider when splitting a node: the lower, the greater the reduction of variance, but also the greater the increase in bias.

In [None]:
from sklearn.metrics import precision_score

# Exhaustive search over max_depth, n_estimators and max_features (each 4..9).
# Every combination is scored by the mean precision over five random 70/30
# splits. `best` holds (mean precision, max_depth, n_estimators, max_features)
# of the best combination so far; all zeros until a combination scores above 0.
best = (0, 0, 0, 0)
for max_depth in range(4, 10):
  print(f'Max_depth: {max_depth}')
  for n_estimators in range(4, 10):
    for max_features in range(4, 10):
      precision = []
      for i in range(5): # Repeat 5 times to get average
        split = train_test_split(activities, fps, test_size=0.3)
        act_train, act_test, fps_train, fps_test = split
        model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)
        prediction = model.fit(fps_train, act_train).predict(fps_test)
        precision.append(precision_score(act_test, prediction))
      mean_precision = np.mean(precision)
      # Strict ">" keeps the first combination that reaches a given score.
      if mean_precision > best[0]:
        best = (mean_precision, max_depth, n_estimators, max_features)

# Unpack into the names that the following cell reads.
optimal_precision, optimal_max_depth, optimal_n_estimators, optimal_max_features = best
print(f'max_depth: {optimal_max_depth}, n_estimators: {optimal_n_estimators}, max_features: {optimal_max_features} -> precision = {optimal_precision}')

Now that we have the optimal parameters for a **high precision** model, let's train this model and count the number of TPs, FPs and FNs in the test set:

In [None]:
from sklearn.metrics import confusion_matrix

# Per-repeat counts of true positives, false positives and false negatives.
TP, FP, FN = [], [], []
for i in range(10): # Repeat 10 times to get average
  split = train_test_split(activities, fps, test_size=0.3)
  act_train, act_test, fps_train, fps_test = split

  # Forest configured with the parameters selected by the grid search.
  precision_model = RandomForestClassifier(max_depth=optimal_max_depth, n_estimators=optimal_n_estimators, max_features=optimal_max_features)
  prediction = precision_model.fit(fps_train, act_train).predict(fps_test)

  # The flattened confusion matrix is unpacked as tn, fp, fn, tp.
  counts = confusion_matrix(act_test, prediction).ravel().tolist()
  tn, fp, fn, tp = counts

  TP.append(tp)
  FP.append(fp)
  FN.append(fn)

# Averages over the ten repeats.
print(f'TP: {np.mean(TP)}')
print(f'FP: {np.mean(FP)}')
print(f'FN: {np.mean(FN)}')


Let us make a random forest model with a **high recall**:

In [None]:
from sklearn.metrics import recall_score

# Same exhaustive search as for precision, but scored by mean recall over
# five random 70/30 splits. `best` holds (mean recall, max_depth, n_estimators,
# max_features) of the best combination so far; all zeros until a combination
# scores above 0.
best = (0, 0, 0, 0)
for max_depth in range(4, 10):
  print(f'Max_depth: {max_depth}')
  for n_estimators in range(4, 10):
    for max_features in range(4, 10):
      recall = []
      for i in range(5): # Repeat 5 times to get average
        split = train_test_split(activities, fps, test_size=0.3)
        act_train, act_test, fps_train, fps_test = split
        model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)
        prediction = model.fit(fps_train, act_train).predict(fps_test)
        recall.append(recall_score(act_test, prediction))
      mean_recall = np.mean(recall)
      # Strict ">" keeps the first combination that reaches a given score.
      if mean_recall > best[0]:
        best = (mean_recall, max_depth, n_estimators, max_features)

# Unpack into the names that the following cell reads (this overwrites the
# optimal_* values left by the precision search).
optimal_recall, optimal_max_depth, optimal_n_estimators, optimal_max_features = best
print(f'max_depth: {optimal_max_depth}, n_estimators: {optimal_n_estimators}, max_features: {optimal_max_features} -> recall = {optimal_recall}')

Now that we have the optimal parameters for a **high recall** model, let's train this model and count the number of TPs, FPs and FNs in the test set:

In [None]:
# Per-repeat counts of true positives, false positives and false negatives.
TP, FP, FN = [], [], []
for i in range(10): # Repeat 10 times to get average
  split = train_test_split(activities, fps, test_size=0.3)
  act_train, act_test, fps_train, fps_test = split

  # Forest configured with the parameters selected by the recall search; the
  # model from the last repeat stays available as recall_model afterwards.
  recall_model = RandomForestClassifier(max_depth=optimal_max_depth, n_estimators=optimal_n_estimators, max_features=optimal_max_features)
  prediction = recall_model.fit(fps_train, act_train).predict(fps_test)

  # The flattened confusion matrix is unpacked as tn, fp, fn, tp.
  counts = confusion_matrix(act_test, prediction).ravel().tolist()
  tn, fp, fn, tp = counts

  TP.append(tp)
  FP.append(fp)
  FN.append(fn)

# Averages over the ten repeats.
print(f'TP: {np.mean(TP)}')
print(f'FP: {np.mean(FP)}')
print(f'FN: {np.mean(FN)}')

Apply this model on a database of 100k compounds and search for DPP4 actives:

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/Compounds_100k.smi"

# Fail loudly on a hung connection or an HTTP error instead of treating an
# error page as compound data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both "\n" and "\r\n"; blank lines are dropped so
# that positions in `data` correspond one-to-one to compounds.
data = [line for line in response.text.splitlines() if line.strip()]
if not data:
  raise ValueError(f"No records found in {url}")
print(data[0])

Calculate fingerprints:

In [153]:
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)

# Fingerprint every database compound. Entries that are blank or cannot be
# parsed are dropped from BOTH lists so positions keep matching.
db_smiles = []
db_fps = []
for d in data:
  if not d or not d.strip(): continue
  mol = Chem.MolFromSmiles(d)
  # MolFromSmiles returns None for SMILES it cannot parse.
  if mol is None: continue
  # ConvertToNumpyArray fills (and resizes) the array passed to it.
  fp = np.zeros((0,), dtype=np.int8)
  # Use the same generator as for the training fingerprints, so the database
  # is encoded exactly like the data the models were fitted on.
  DataStructs.ConvertToNumpyArray(rdkgen.GetFingerprint(mol), fp)
  db_smiles.append(d)
  db_fps.append(fp)

# Rebind `data` to the retained entries so that data[i] and db_fps[i] refer to
# the same compound when they are indexed together further down.
data = db_smiles

Apply the RF on the database and extract those that are predicted to be active:

In [None]:
# Classify every database fingerprint with the high-recall forest and keep
# the SMILES and fingerprint of each compound predicted as class 1.
prediction = recall_model.predict(db_fps)
hits_smiles = []
hits_fps = []
for predicted_class, smi, bits in zip(prediction, data, db_fps):
  if predicted_class != 1:
    continue
  hits_smiles.append(smi)
  hits_fps.append(bits)
print(len(hits_smiles), len(hits_fps))

Score the identified hits with the pIC50 neural network model:

In [None]:
# Predict a pIC50 for every hit fingerprint with the previously fitted
# pic50_predictor; hits_pic50[i] belongs to hits_fps[i].
hits_pic50 = pic50_predictor.predict(hits_fps)

Identify the compound with the highest pIC50 and show structure:

In [None]:
# Pick the hit with the highest predicted pIC50. Using argmax instead of a
# running maximum that starts at 0 means a hit is still reported when every
# prediction happens to be zero or negative.
best_pic50 = 0
best_smiles = ""
if len(hits_pic50) > 0:
  # argmax returns the first position of the maximum, i.e. the same compound
  # a strict ">" scan would select.
  best_index = int(np.argmax(hits_pic50))
  best_pic50 = hits_pic50[best_index]
  best_smiles = hits_smiles[best_index]
print(best_pic50)
print(best_smiles)
mol = Chem.MolFromSmiles(best_smiles)

# Bare expression kept last so the notebook renders the structure.
mol