<a href="https://colab.research.google.com/github/UAMCAntwerpen/2040FBDBIC/blob/master/Class_02/Clustering_and_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install RDKit

In [None]:
!pip install rdkit mols2grid requests

In [None]:
# RDKit chemistry
from rdkit import Chem

# RDKit drawing
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
IPythonConsole.ipython_useSVG = True
rdDepictor.SetPreferCoordGen(True)

# Library to display molecules in a grid
import mols2grid

# Library to download files
import requests

## Linear path-based FPs

In [None]:
# Compute the RDKit path-based fingerprint of one molecule at several
# fingerprint lengths and report how many bits are set for each length.
mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")

fp_sizes = (10, 100, 1024)
for fp_size in fp_sizes:
  fp = Chem.RDKFingerprint(mol, fpSize=fp_size)
  n_on = len(list(fp.GetOnBits()))
  print(f"{n_on} bits ON out of the {len(fp)} bits in total")

# Last expression of the cell: the molecule is rendered as the cell output.
mol

In [None]:
# Same molecule, default fingerprint length: vary the maximum path length
# that is enumerated and report how many bits are set each time.
mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")

path_lengths = (1, 3, 5, 7)
for max_path_length in path_lengths:
  fp = Chem.RDKFingerprint(mol, maxPath=max_path_length)
  n_on = len(list(fp.GetOnBits()))
  print(f"{n_on} bits ON out of the {len(fp)} bits in total")

# Last expression of the cell: the molecule is rendered as the cell output.
mol

## Circular FPs (ECFP)

In [None]:
from rdkit.Chem import AllChem  # kept: a later cell in this notebook uses AllChem
from rdkit.Chem import rdFingerprintGenerator

mol = Chem.MolFromSmiles("O1CC(=O)NC1")

# Morgan (circular) fingerprints of 1024 bits for increasing radii.
# AllChem.GetMorganFingerprintAsBitVect is deprecated in current RDKit releases
# and emits a deprecation warning on every call; the fingerprint-generator API
# is its supported replacement.
for radius in range(1,8):
  generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=1024)
  fp = generator.GetFingerprint(mol)
  print("Radius", radius, ":", fp.GetNumOnBits(), "bits ON out of the", len(fp), "bits in total")

# Last expression of the cell: the molecule is rendered as the cell output.
mol

## MACCS keys

In [None]:
from rdkit.Chem import MACCSkeys

# MACCS structural keys: report how many keys are set and list their indices.
mol = Chem.MolFromSmiles("Oc1ccc(CC(N)C(O)=O)cc1")
fp = MACCSkeys.GenMACCSKeys(mol)
on_bits = list(fp.GetOnBits())
print(f"{len(on_bits)} bits ON out of the {len(fp)} bits in total")
print(on_bits)
# Last expression of the cell: the molecule is rendered as the cell output.
mol

## Calculating similarity

In [None]:
from rdkit import DataStructs

# Build two short (50-bit) fingerprints so the bit strings are easy to
# compare by eye, then compute their similarity.
mol1 = Chem.MolFromSmiles("CCOC")
mol2 = Chem.MolFromSmiles("CCO")

fp1 = Chem.RDKFingerprint(mol1, fpSize=50)
print(fp1.ToBitString())

fp2 = Chem.RDKFingerprint(mol2, fpSize=50)
print(fp2.ToBitString())

# FingerprintSimilarity is called without an explicit metric argument.
tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
print(tanimoto)

In [None]:
# Three molecules compared against the reference "CCCO" with the Tversky index,
# once for each of two mirrored (alpha, beta) weightings.
smiles = ["CO", "CCCO", "CCCOCCC"]
mols = [Chem.MolFromSmiles(s) for s in smiles]
fps = [Chem.RDKFingerprint(m) for m in mols]
ref = Chem.RDKFingerprint(Chem.MolFromSmiles("CCCO"))

weightings = ((0.1, 0.9), (0.9, 0.1))
for block, (alpha, beta) in enumerate(weightings):
  if block > 0:
    print()  # blank line separating the two blocks of results
  for fp in fps:
    tversky = DataStructs.TverskySimilarity(ref, fp, alpha, beta)
    print("%.2f" % tversky)


## Similarity in practice

In this exercise, a file with 10,000 compounds (SMILES format) is downloaded from the UAMC GitHub repo, and then a similarity search is performed to identify the compound that is most similar to aspirin ("CC(=O)OC1=CC=CC=C1C(=O)O")

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/Compounds_10k.smi"

# Fail loudly on HTTP errors: without the status check, an error page would be
# split into "SMILES" lines and only fail much later, in an unrelated cell.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both \n and \r\n; blank lines (such as the one that a
# trailing newline produces) are dropped.
smiles = [line.strip() for line in response.text.splitlines() if line.strip()]

Aspirin is the query molecule:

In [None]:
# Parse the aspirin SMILES and compute its RDKit fingerprint; `query` is the
# fingerprint that every database compound is compared against.
aspirin = Chem.MolFromSmiles("CC(=O)OC1=CC=CC=C1C(=O)O")
query = Chem.RDKFingerprint(aspirin)
aspirin  # last expression of the cell: rendered as the cell output

Loop over all molecules and keep the one with the highest similarity to aspirin:

In [None]:
# Linear scan over all SMILES, remembering the compound with the highest
# similarity to the aspirin query fingerprint.
max_sim = 0.0
best_hit = ""
for s in smiles:
  s = s.strip()
  if not s:
    continue  # blank line, e.g. the remainder of a trailing newline
  mol = Chem.MolFromSmiles(s)
  if mol is None:
    continue  # unparsable SMILES: RDKFingerprint(None) would raise
  fp = Chem.RDKFingerprint(mol)
  tanimoto = DataStructs.FingerprintSimilarity(fp, query)
  # ">=" keeps the original tie-breaking: the last compound with the top score wins.
  if tanimoto >= max_sim:
    max_sim = tanimoto
    best_hit = s

Show the best molecule:

In [None]:
# Report the best similarity score, then rebuild the winning molecule so that
# it is rendered as the output of this cell.
print(max_sim)
mol = Chem.MolFromSmiles(best_hit)
mol

Note: you can also use other similarity metrics, such as the Tversky coefficient. Try this out to see how the results change.

## Maximal common substructure (MCSS)

In [None]:
from rdkit.Chem import rdFMCS

# Parse the three input structures in one go.
morphine, codeine, heroine = (
  Chem.MolFromSmiles(smi)
  for smi in (
    "CN1CC[C@]23C4=C5C=CC(O)=C4O[C@H]2[C@H](C=C[C@H]3[C@H]1C5)O",
    "CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(O[C@H]3[C@@H](O)C=C4)=C(OC)C=C5",
    "CN([C@H](CC(C=C1)=C23)[C@@H]4C=C[C@@H]5OC(C)=O)CC[C@]43[C@H]5OC2=C1OC(C)=O",
  )
)

# Find the maximum common substructure (default FindMCS settings) and show it
# by turning the resulting SMARTS pattern back into a molecule.
mols = [morphine, codeine, heroine]
mcss = rdFMCS.FindMCS(mols)
Chem.MolFromSmarts(mcss.smartsString)

## Clustering

Read in six molecules:

In [None]:
from rdkit.Chem import rdFingerprintGenerator

smiles = ["c1ccccc1", "c1cccnc1", "c1ncncc1", "C1CC1", "CC=O", "NCC"]
mols = [Chem.MolFromSmiles(x) for x in smiles]

# Very short (20-bit) Morgan fingerprints of radius 2, so that they can be
# printed and inspected by eye. The generator API replaces the deprecated
# AllChem.GetMorganFingerprintAsBitVect, which warns in current RDKit releases.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=20)
fps = [morgan_generator.GetFingerprint(x) for x in mols]
display(Draw.MolsToGridImage(mols, molsPerRow=3))

Show their fingerprints:

In [None]:
import numpy as np

# One line per molecule: the fingerprint as a bit string, then its SMILES.
for idx, fingerprint in enumerate(fps):
  print("%s %s" % (fingerprint.ToBitString(), smiles[idx]))

Convert the fingerprints to a format that is useable by a clustering algorithm:

In [None]:
# Convert every fingerprint to a numpy array and stack them into one matrix
# (one row per molecule), the input format the clustering estimators expect.
nps = list(map(np.array, fps))
X = np.array(nps)
print(X)

Perform a hierarchical clustering:

In [None]:
import sklearn
from sklearn.cluster import AgglomerativeClustering

# Hierarchical (agglomerative) clustering of the fingerprint matrix into three
# clusters; fit() returns the fitted estimator itself.
clusterEngine = AgglomerativeClustering(n_clusters = 3).fit(X)

# Use each molecule's cluster label as the legend under its drawing.
labels = list(map(str, clusterEngine.labels_))
display(Draw.MolsToGridImage(mols, molsPerRow=3, legends=labels))

And now non-hierarchical clustering (k-means):

In [None]:
from sklearn.cluster import KMeans

# k-means starts from random centroids: fix random_state so that the cluster
# labels are the same on every run, and set n_init explicitly so the result
# does not depend on the scikit-learn version's default.
clusterEngine = KMeans(n_clusters = 3, n_init = 10, random_state = 42)
clusterEngine.fit(X)

# Use each molecule's cluster label as the legend under its drawing.
labels = [str(x) for x in clusterEngine.labels_]
display(Draw.MolsToGridImage(mols, molsPerRow=3, legends=labels))

In [None]:
# Coordinates of the fitted cluster centres, as stored on the estimator.
centers = clusterEngine.cluster_centers_
print(centers)

# Machine learning: QSAR models

## An example of a simple model: linear regression

Read in a dataset of DPP4 inhibitors with corresponding pIC50 inhibition constants:

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/dpp4.pIC50.txt"

# Fail loudly on HTTP errors instead of parsing an error page as data later on.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both \n and \r\n; blank lines are dropped.
data = [line.strip() for line in response.text.splitlines() if line.strip()]
print(data[0])

Split into smiles, mols, fps and pIC50:

In [None]:
# Parse "SMILES pIC50" records into four parallel lists. A record is added to
# the lists only after it has been fully validated, so they stay aligned.
mols = []
smiles = []
fps = []
pic50 = []
skipped = 0
for d in data:
  fields = d.split()
  if len(fields) < 2:
    # Blank or incomplete line: fields[1] would raise an IndexError.
    if fields: skipped += 1
    continue
  mol = Chem.MolFromSmiles(fields[0])
  if mol is None:
    # Unparsable SMILES: RDKFingerprint(None) would raise.
    skipped += 1
    continue
  value = float(fields[1])
  # ConvertToNumpyArray resizes the zero-length array to the fingerprint length.
  fp = np.zeros((0,), dtype=np.int8)
  DataStructs.ConvertToNumpyArray(Chem.RDKFingerprint(mol), fp)
  smiles.append(fields[0])
  pic50.append(value)
  mols.append(mol)
  fps.append(fp)
if skipped: print("Skipped", skipped, "unusable record(s)")
print(smiles[0])
print(pic50[0])
print(fps[0])
print(max(pic50))
print(min(pic50))
print(len(smiles))

Create a training set (70%) and a test set (30%):

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the compounds as a test set; random_state=42 fixes the
# shuffle so that this particular split is the same on every run.
# The activities and fingerprints are split together and therefore stay paired.
pic50_train, pic50_test, fps_train, fps_test = train_test_split(pic50, fps, test_size=0.3, random_state=42)
print(len(pic50_train), len(pic50_test))

Train a linear regression model:

In [None]:
from sklearn import linear_model

# Ordinary least-squares fit of pIC50 on the fingerprint bits; fit() returns
# the fitted estimator. The printed coefficients hold one weight per bit.
model = linear_model.LinearRegression().fit(fps_train, pic50_train)
print(model.coef_)

Apply the trained model on the test set and compare the predicted values with the experimental ones:

In [None]:
# Predict pIC50 for the held-out test fingerprints with the fitted model.
pic50_pred = model.predict(fps_test)
print(pic50_pred)

Validate the model by calculating the MSE of the predictions when compared to the true values:

In [None]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Mean squared error between experimental and predicted test-set values.
mse = mean_squared_error(pic50_test, pic50_pred)
print("MSE = ", mse)

# Scatter of predicted versus true values.
plt.plot(pic50_test, pic50_pred, '.')
plt.xlabel("True values")
plt.ylabel("Predicted values")

Repeat the test/train splitting a number of times in order to get statistics:

In [None]:
# Repeat the 70/30 split several times. Each repetition uses its own fixed seed
# so that the numbers are reproducible, and the per-split errors are collected
# so that the statistics announced above can actually be reported.
# Note: the variables of the last split remain defined for the following cells.
mse_values = []
for i in range(5):
  pic50_train, pic50_test, fps_train, fps_test = train_test_split(pic50, fps, test_size=0.3, random_state=i)
  model.fit(fps_train, pic50_train)
  pic50_pred = model.predict(fps_test)
  mse_values.append(mean_squared_error(pic50_test, pic50_pred))
  print("MSE = ", mse_values[-1])
print("Mean MSE = %.3f (std %.3f)" % (np.mean(mse_values), np.std(mse_values)))

## A more complicated model: neural networks

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor with a fixed seed, fitted on the current
# training split (fit() returns the fitted estimator) and scored on the
# matching test split.
model = MLPRegressor(random_state=1, max_iter=500).fit(fps_train, pic50_train)
pic50_pred = model.predict(fps_test)
mse = mean_squared_error(pic50_test, pic50_pred)
print("MSE =", mse)

# Scatter of predicted versus true values.
plt.plot(pic50_test, pic50_pred, '.')
plt.xlabel("True values")
plt.ylabel("Predicted values")

Repeat the test/train splitting a number of times in order to get statistics:

In [None]:
# Repeat the 70/30 split several times. Each repetition uses its own fixed seed
# so that the numbers are reproducible, and the per-split errors are collected
# so that the statistics announced above can actually be reported.
mse_values = []
for i in range(3):
  pic50_train, pic50_test, fps_train, fps_test = train_test_split(pic50, fps, test_size=0.3, random_state=i)
  model.fit(fps_train, pic50_train)
  pic50_pred = model.predict(fps_test)
  mse_values.append(mean_squared_error(pic50_test, pic50_pred))
  print("MSE =", mse_values[-1])
print("Mean MSE = %.3f (std %.3f)" % (np.mean(mse_values), np.std(mse_values)))

Now build a model using the entire dataset and save it for later use:

In [None]:
# Final regressor trained on all compounds; it is reused further down to score
# the virtual-screening hits. The seed matches the MLP model evaluated above,
# so the saved predictor (and everything derived from it) is reproducible.
pic50_predictor = MLPRegressor(random_state=1, max_iter=500)
pic50_predictor.fit(fps, pic50)

## Another model: random forest

Load a DPP4 dataset with actives and non-actives (classification model):

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/dpp4.classified.txt"

# Fail loudly on HTTP errors instead of parsing an error page as data later on.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both \n and \r\n; blank lines are dropped.
data = [line.strip() for line in response.text.splitlines() if line.strip()]
print(data[0])

Generate fingerprints and make a list of all the activities:

In [None]:
# Parse "SMILES ACTIVE|INACTIVE" records into two parallel lists.
# A record is appended to BOTH lists or to neither: previously a line with an
# unexpected label added a fingerprint without an activity, which silently
# shifted every later label by one position.
label_to_class = {"ACTIVE": 1, "INACTIVE": 0}

activities = []
fps = []
for d in data:
  if d is None or d == "": continue
  fields = d.split()
  if len(fields) < 2 or fields[1] not in label_to_class: continue
  mol = Chem.MolFromSmiles(fields[0])
  if mol is None: continue  # unparsable SMILES: RDKFingerprint(None) would raise
  # ConvertToNumpyArray resizes the zero-length array to the fingerprint length.
  fp = np.zeros((0,), dtype=np.int8)
  DataStructs.ConvertToNumpyArray(Chem.RDKFingerprint(mol), fp)
  activities.append(label_to_class[fields[1]])
  fps.append(fp)

print(len(activities), len(fps))

Train a random forest model:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Both the 70/30 split and the forest are random; fixing the two seeds makes
# the accuracy computed in the next cell reproducible.
act_train, act_test, fps_train, fps_test = train_test_split(activities, fps, test_size=0.3, random_state=42)
model = RandomForestClassifier(max_depth=2, random_state=42)
model.fit(fps_train, act_train)

Calculate the accuracy of the generated model:

In [None]:
from sklearn.metrics import accuracy_score

# Classify the held-out fingerprints and compare with their known activities.
prediction = model.predict(fps_test)
print(accuracy_score(act_test, prediction))

Now optimise the model by exploring the **max_depth** parameter:

In [None]:
# For every tree depth, average the test accuracy over ten train/test splits.
# Seeding with the repetition index makes the scan reproducible and evaluates
# every depth on the same ten splits, so the depths are compared fairly.
for max_depth in range(1,10):
  accuracy = []
  for i in range(10):
    act_train, act_test, fps_train, fps_test = train_test_split(activities, fps, test_size=0.3, random_state=i)
    model = RandomForestClassifier(max_depth=max_depth, random_state=i)
    model.fit(fps_train, act_train)
    prediction = model.predict(fps_test)
    accuracy.append(accuracy_score(act_test, prediction))
  print("Max_depth: %d -> accuracy = %.3f" % (max_depth, np.mean(accuracy)))

Now train a RF model with all the data and max_depth = 5:

In [None]:
# Final classifier trained on all labelled compounds. The fixed seed makes the
# list of predicted actives in the screening step below reproducible.
model = RandomForestClassifier(max_depth=5, random_state=42)
model.fit(fps, activities)

Apply this model on a database of 100k compounds and search for DPP4 actives:

In [None]:
url = "https://raw.githubusercontent.com/UAMCAntwerpen/2040FBDBIC/master/Topic_02/Compounds_100k.smi"

# Fail loudly on HTTP errors instead of parsing an error page as data later on.
response = requests.get(url, timeout=30)
response.raise_for_status()

# splitlines() copes with both \n and \r\n; blank lines are dropped.
data = [line.strip() for line in response.text.splitlines() if line.strip()]
print(data[0])

Calculate fingerprints:

In [None]:
# Fingerprint every non-empty database line, stored as a numpy int8 array.
db_fps = []
for d in data:
  if d is not None and d != "":
    mol = Chem.MolFromSmiles(d)
    # ConvertToNumpyArray resizes the zero-length array to the fingerprint length.
    fp = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(Chem.RDKFingerprint(mol), fp)
    db_fps.append(fp)

Apply the RF on the database and extract those that are predicted to be active:

In [None]:
# db_fps was built from the non-empty lines of `data` only, so position i of
# db_fps is not position i of `data` as soon as one line has been skipped.
# Re-apply the same filter to obtain the SMILES that belong to each fingerprint.
db_smiles = [d for d in data if not (d == "" or d is None)]
assert len(db_smiles) == len(db_fps), "SMILES and fingerprints are out of sync"

prediction = model.predict(db_fps)

# Keep the compounds that the classifier labels as active (class 1).
hits_smiles = []
hits_fps = []
for hit_smiles, hit_fp, label in zip(db_smiles, db_fps, prediction):
  if label == 1:
    hits_smiles.append(hit_smiles)
    hits_fps.append(hit_fp)
print(len(hits_smiles), len(hits_fps))

Score the identified hits with the pIC50 neural network model:

In [None]:
# Predict a pIC50 for every predicted active with the regressor trained earlier.
hits_pic50 = pic50_predictor.predict(hits_fps)

Identify the compound with the highest pIC50 and show structure:

In [None]:
# Select the hit with the highest predicted pIC50.
# argmax makes no assumption about the sign of the predictions (starting the
# search from 0 returned an empty SMILES whenever no prediction exceeded 0) and,
# like the strict ">" comparison it replaces, keeps the first of any tied maxima.
if len(hits_pic50) == 0:
  raise ValueError("No predicted actives to rank")
best_index = int(np.argmax(hits_pic50))
best_pic50 = hits_pic50[best_index]
best_smiles = hits_smiles[best_index]
print(best_pic50)
print(best_smiles)
mol = Chem.MolFromSmiles(best_smiles)
# Last expression of the cell: the molecule is rendered as the cell output.
mol

# Validation of machine learning models

## Some performance metrics

Real data:

In [None]:
# Ground-truth binary labels for 20 samples (1 = positive, 0 = negative).
real = np.array([1,1,1,1,0,0,1,0,1,0,1,0,1,1,0,0,1,0,1,1])

Hypothetical predictions from a "good" model:

In [None]:
# Predictions that agree with `real` in 17 of the 20 positions.
good = np.array([1,0,1,1,0,1,1,0,1,0,1,0,1,0,0,0,1,0,1,1])

Hypothetical predictions from a "bad" model:

In [None]:
# Predictions that are the element-wise complement of `real`: every one is wrong.
bad = np.array([0,0,0,0,1,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0])

Hypothetical predictions from a random model:

In [None]:
# 20 random 0/1 predictions. A dedicated, seeded generator makes the numbers in
# the cells below identical on every run.
# NOTE: the variable name shadows the standard-library module name `random`; it
# is kept because the following cells refer to it.
random = np.random.default_rng(42).integers(2, size=20)

Show all:

In [None]:
# Print the four label arrays under each other; the tags are padded to equal
# width so that the arrays line up.
for tag, values in (("Real  ", real), ("Good  ", good), ("Bad   ", bad), ("Random", random)):
  print(tag, values)

Generate a function to calculate the number of true positives:

In [None]:
def TP(real, pred):
  """Count the true positives: positions where both real and pred equal 1.

  The positions 0 .. len(real)-1 are inspected; the count is returned as a float.
  """
  matches = sum(1 for k in range(len(real)) if real[k] == 1 and pred[k] == 1)
  return float(matches)

And now a function to calculate the number of false positives:

In [None]:
def FP(real, pred):
  """Count the false positives: positions where real is 0 but pred is 1.

  The positions 0 .. len(real)-1 are inspected; the count is returned as a float.
  """
  matches = sum(1 for k in range(len(real)) if real[k] == 0 and pred[k] == 1)
  return float(matches)

The same for true and false negatives:

In [None]:
def TN(real, pred):
  """Count the true negatives: positions where both real and pred equal 0.

  The positions 0 .. len(real)-1 are inspected; the count is returned as a float.
  """
  matches = sum(1 for k in range(len(real)) if real[k] == 0 and pred[k] == 0)
  return float(matches)

def FN(real, pred):
  """Count the false negatives: positions where real is 1 but pred is 0.

  The positions 0 .. len(real)-1 are inspected; the count is returned as a float.
  """
  matches = sum(1 for k in range(len(real)) if real[k] == 1 and pred[k] == 0)
  return float(matches)

Apply these four metrics to the three models:

In [None]:
# Print the four confusion-matrix counts for each of the three models:
# 1. good, 2. bad, 3. random.
model_predictions = (("Good model", good), ("Bad model", bad), ("Random model", random))
count_functions = (("TP", TP), ("TN", TN), ("FP", FP), ("FN", FN))

for title, predicted in model_predictions:
  print(title)
  for tag, count in count_functions:
    print(tag, count(real, predicted))

Rates:

In [None]:
def TPR(real, pred):
  """True positive rate (sensitivity): TP / (TP + FN).

  Raises ZeroDivisionError when `real` contains no positives.
  """
  hits = TP(real, pred)
  misses = FN(real, pred)
  denominator = hits + misses
  return hits / denominator

def TNR(real, pred):
  """True negative rate (specificity): TN / (TN + FP).

  Raises ZeroDivisionError when `real` contains no negatives.
  """
  rejected = TN(real, pred)
  false_alarms = FP(real, pred)
  denominator = rejected + false_alarms
  return rejected / denominator

def FPR(real, pred):
  """False positive rate: FP / (FP + TN).

  Raises ZeroDivisionError when `real` contains no negatives.
  """
  false_alarms = FP(real, pred)
  rejected = TN(real, pred)
  denominator = false_alarms + rejected
  return false_alarms / denominator

def FNR(real, pred):
  """False negative rate: FN / (FN + TP).

  Raises ZeroDivisionError when `real` contains no positives.
  """
  misses = FN(real, pred)
  hits = TP(real, pred)
  denominator = misses + hits
  return misses / denominator

In [None]:
# Print the four rates for each of the three models:
# 1. good, 2. bad, 3. random.
model_predictions = (("Good model", good), ("Bad model", bad), ("Random model", random))
rate_functions = (("TPR", TPR), ("TNR", TNR), ("FPR", FPR), ("FNR", FNR))

for title, predicted in model_predictions:
  print(title)
  for tag, rate in rate_functions:
    print(tag, rate(real, predicted))

Illustrating the trade-off between sensitivity (TPR), number of false negatives (FN) and true positives (TP):

In [None]:
# 60 alternating labels: 30 negatives and 30 positives.
real = np.array([0,1]*30)

# Draw 10000 random 0/1 "models" and record, for each one, its sensitivity
# together with its numbers of false negatives and true positives.
sensitivities, true_positives, false_negatives = [], [], []
for trial in range(10000):
  model = np.random.randint(2, size=60)
  sensitivities.append(TPR(real, model))
  false_negatives.append(FN(real, model))
  true_positives.append(TP(real, model))

# Blue is drawn first, red on top of it.
for counts, colour in ((false_negatives, "blue"), (true_positives, "red")):
  plt.scatter(sensitivities, counts, c=colour, alpha=0.5)
plt.xlabel("Sensitivity (TPR)")
plt.ylabel("Number of FN (blue) and TP (red)")

Relation between specificity (TNR), number of false positives (FP) and true negatives (TN):

In [None]:
# 60 alternating labels: 30 negatives and 30 positives.
real = np.array([0,1]*30)

# Draw 10000 random 0/1 "models" and record, for each one, its specificity
# together with its numbers of false positives and true negatives.
specificities, false_positives, true_negatives = [], [], []
for trial in range(10000):
  model = np.random.randint(2, size=60)
  specificities.append(TNR(real,model))
  false_positives.append(FP(real,model))
  true_negatives.append(TN(real,model))

# Blue is drawn first, red on top of it.
for counts, colour in ((false_positives, "blue"), (true_negatives, "red")):
  plt.scatter(specificities, counts, c=colour, alpha=0.5)
plt.xlabel("Specificity (TNR)")
plt.ylabel("Number of FP (blue) and TN (red)")

Relation between accuracy, precision and sensitivity:

In [None]:
# 60 alternating labels: 30 negatives and 30 positives.
real = np.array([0,1]*30)

# For 10000 random 0/1 "models", collect precision (P), accuracy (A) and
# sensitivity (S), all derived from the four confusion-matrix counts.
P, A, S = [], [], []
for trial in range(10000):
  model = np.random.randint(2, size=60)
  tp, fp, tn, fn = (count(real, model) for count in (TP, FP, TN, FN))
  precision = tp / (tp + fp)
  accuracy = (tp + tn) / (tp + fp + fn + tn)
  sensitivity = tp / (tp + fn)
  P.append(precision)
  A.append(accuracy)
  S.append(sensitivity)

# Red (sensitivity) is drawn first, blue (precision) on top of it.
for values, colour in ((S, "red"), (P, "blue")):
  plt.scatter(A, values, c=colour, alpha=0.5)
plt.xlabel("Accuracy")
plt.ylabel("Precision (blue) and sensitivity (red)")

TPR and FPR: accuracy as a metric that balances high TPR and low FPR

In [None]:
# 60 alternating labels: 30 negatives and 30 positives.
real = np.array([0,1]*30)

# For 10000 random 0/1 "models", collect the true positive rate, the false
# positive rate and the accuracy.
TPRS, FPRS, ACCU = [], [], []
for trial in range(10000):
  model = np.random.randint(2, size=60)
  tp, fp, tn, fn = (count(real, model) for count in (TP, FP, TN, FN))
  TPRS.append(tp / (tp + fn))
  FPRS.append(fp / (fp + tn))
  ACCU.append((tp + tn) / (tp + fp + fn + tn))

# Each point is one random model, coloured by its accuracy.
plt.scatter(TPRS, FPRS, c=ACCU)
plt.xlabel("TPR")
plt.ylabel("FPR")
plt.colorbar()