In [None]:
#code by Tobias Gensch with contributions from Cian Kingston
import os,re,sys,pickle,datetime,time,random,itertools
import warnings
warnings.filterwarnings("ignore")
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
from scipy import stats
import math
from tqdm import tqdm
import seaborn as sns 
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA,NMF
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression,Lasso,LinearRegression,Ridge,ElasticNetCV,ElasticNet,Lars,LassoCV,RidgeCV,LarsCV,LassoLarsCV,LassoLarsIC,OrthogonalMatchingPursuitCV,OrthogonalMatchingPursuit
from sklearn.manifold import TSNE,MDS
from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,median_absolute_error
from sklearn.model_selection import train_test_split,GridSearchCV,RepeatedKFold,LeaveOneOut
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.svm import LinearSVC,SVR
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.metrics import silhouette_samples, silhouette_score
import loo_q2 as loo
randomstate = 42
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDConfig
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.Chem import PropertyMol
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing,DrawingOptions
DrawingOptions.bondLineWidth=1.8
DrawingOptions.includeAtomNumbers=False
size = (150, 150)
from rdkit.Chem.Draw.MolDrawing import Font
from rdkit.Chem import rdmolfiles
from rdkit.Chem import rdFMCS
import kennardstonealgorithm 
from PIL import Image

**Load data**

In [None]:
# Descriptor workbook: the second spreadsheet row supplies the column names
# and the first column becomes the row index.
ci = pd.read_excel(
    'gen_descriptors.xlsx',
    'Sheet1',
    index_col=0,
    header=1,
    engine='openpyxl',
)

# Keep every row, but leave out the first column and the last three columns.
compinp = ci.loc[ci.index[:], ci.columns[1:-3]]
compinp.index = compinp.index.astype(int)
# Rows containing any missing descriptor value are discarded.
compinp = compinp.dropna(axis=0)

# Identifier workbook: the third spreadsheet row supplies the column names
# and the second column becomes the row index.
inp2 = pd.read_excel(
    "gen_identifiers.xlsx",
    index_col=1,
    header=2,
    engine='openpyxl',
)
inp2.index = inp2.index.astype(int)

**Remove colinear descriptors**

In [None]:
# Absolute pairwise correlation above which a descriptor column is dropped.
threshold = 0.95

print('Shape of descriptors file before curation: ',compinp.shape)
#compinp = compinp.astype(float)
df_corr = compinp.corr()

# Blank out the diagonal and the lower triangle so that every column is only
# compared against the columns that come before it.
lower_triangle = np.tril(np.ones(df_corr.shape, dtype=bool))
upper_abs_corr = df_corr.mask(lower_triangle).abs()

# True for each column that has no earlier column correlated above the threshold.
df_not_correlated = ~(upper_abs_corr > threshold).any()
un_corr_idx = df_not_correlated[df_not_correlated].index

compinp = compinp[un_corr_idx]
print('Shape of descriptors file after curation: ',compinp.shape)

**Create subsets from the identifiers file and scale the data**

In [None]:
# Descriptor rows whose flag in the identifiers table equals 1.
considered_rows = compinp.loc[inp2["gen_considered"]==1]
chosen_rows = compinp.loc[inp2["gen_chosen"]==1]

# Descriptor matrices for the full set and the two subsets.
X_all = np.array(compinp)
X_gen_considered = np.array(considered_rows)
X_gen_chosen = np.array(chosen_rows)

# Matching row ids, in the same order as the matrices above.
X_all_ids = np.array(compinp.index)
X_gen_considered_ids = np.array(considered_rows.index)
X_gen_chosen_ids = np.array(chosen_rows.index)

# The scaler is fitted on the full set and then applied to every matrix.
scaler = StandardScaler()
scaler.fit(X_all)
X_all_sc, X_gen_considered_sc, X_gen_chosen_sc = (
    scaler.transform(matrix)
    for matrix in (X_all, X_gen_considered, X_gen_chosen)
)

**Run PCA with fit on X_all_sc for visualization**

In [None]:
# Four-component PCA fitted on the scaled full set; the same projection is
# then applied to the full set and to both subsets.
pca = PCA(n_components=4)
pca.fit(X_all_sc)
X_all_pca, X_gen_considered_pca, X_gen_chosen_pca = (
    pca.transform(scaled)
    for scaled in (X_all_sc, X_gen_considered_sc, X_gen_chosen_sc)
)

# Keep the fitted attributes of this model under their own names.
pca_score = pca.explained_variance_ratio_
pca_values = pca.singular_values_
V = pca.components_

total_percent = round(np.sum(pca_score*100),1)
print('Total variance explained by PCs:',total_percent,'%\n')
print("Percentage of explained variance per principal component")
for pc_number, variance_ratio in enumerate(pca_score, start=1):
    print(f"PC{pc_number}   {variance_ratio*100:.1f}%")

**Run PCA with fit on X_gen_considered_sc for clustering**

In [None]:
# Four-component PCA fitted only on the scaled "gen_considered" subset.
# This rebinds `pca`, `X_gen_considered_pca` and `X_gen_chosen_pca`, so the
# cells below work with the projection of this model.
pca = PCA(n_components=4)
pca.fit(X_gen_considered_sc)
X_gen_considered_pca = pca.transform(X_gen_considered_sc)
X_gen_chosen_pca = pca.transform(X_gen_chosen_sc)

# Report the variance ratios of THIS fit. The per-component loop previously
# iterated over `pca_score`, which holds the ratios of the PCA fitted on
# X_all_sc, so the listed percentages did not belong to this model and did not
# add up to the total printed above them. `pca_score` itself is left untouched.
subset_pca_score = pca.explained_variance_ratio_

print('Total variance explained by PCs:',round(np.sum(subset_pca_score*100),1),'%\n')
print("Percentage of explained variance per principal component")
for i,j in enumerate(subset_pca_score):
    print(f"PC{i+1}   {j*100:.1f}%")

**Standard score plots**

In [None]:
# Background point cloud (grey) and its row ids.
X_use = X_gen_considered_pca
X_ids = X_gen_considered_ids

# Highlighted points (navy) and their row ids.
X_subset = X_gen_chosen_pca
X_subset_ids = X_gen_chosen_ids

# Zero-based indices of the principal components drawn on the x and y axes.
f_ind_1 = 0
f_ind_2 = 1

# Axis limits: range of the background points padded by 2 on each side.
x_scores = X_use[:, f_ind_1]
y_scores = X_use[:, f_ind_2]
x_min = x_scores.min() - 2
x_max = x_scores.max() + 2
y_min = y_scores.min() - 2
y_max = y_scores.max() + 2
extent = [x_min,x_max,y_min,y_max]

plt.figure(figsize=(10,10))
plt.xticks(fontsize=12.5)
plt.yticks(fontsize=12.5)
plt.xlabel("PC" + str(f_ind_1+1), fontsize=15)
plt.ylabel("PC" + str(f_ind_2+1), fontsize=15)
for axis_name in ('y', 'x'):
    plt.locator_params(axis=axis_name, nbins=10)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)

# Both layers share marker shape, size and outline; only colour and alpha differ.
marker_style = dict(marker=".", s=750, edgecolor='white')
plt.scatter(x_scores, y_scores, c='#B1B1B1', alpha=0.6, **marker_style)
plt.scatter(X_subset[:,f_ind_1], X_subset[:,f_ind_2], c='#000080', alpha=0.8, **marker_style)

### plotting labels for main set
#for i in range(0,len(X_use)):
#    plt.text(X_use[i,f_ind_1],X_use[i,f_ind_2]," "+str(X_ids[i]),color="blue",fontsize=10,ha="left") 

### plotting labels for subset
#for i in range(0,len(X_subset)):
#    plt.text(X_subset[i,f_ind_1],X_subset[i,f_ind_2]," "+str(X_subset_ids[i]),color="red",fontsize=10,ha="right") 

plt.tight_layout()
plt.show()
#plt.savefig("pca.png",dpi=300)

**K-means clustering analysis and score plot**

In [None]:
# Points to cluster (PCA scores) and the ids of their rows.
X_use = X_gen_considered_pca
X_main_ids = X_gen_considered_ids

# Number of k-means clusters.
nclusters = 47

# Subset that can optionally be overlaid on the plot (see commented code below).
X_subset = X_gen_chosen_pca
X_subset_ids = X_gen_chosen_ids

####################################################################################

X_kmeans = X_use
# Seeded with the notebook-level `randomstate` instead of a second hard-coded 42.
kmeans = KMeans(n_clusters=nclusters, random_state=randomstate).fit(X_kmeans)

# Euclidean distance of every point to the centre of the cluster it was assigned to.
assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
dists = [np.linalg.norm(point - center) for point, center in zip(X_kmeans, assigned_centers)]

# clusters[c]           row positions (into X_kmeans) of the members of cluster c
# clusterdists[c]       distance to the centre for each member, same order as clusters[c]
# clustermins[c]        row position of the member closest to the centre of cluster c
# clusterorders_dict[c] member row positions sorted by increasing distance to the centre
# clusterorders[row]    1-based rank of that row inside its own cluster (1 = closest)
clusters = {x: [] for x in range(nclusters)}
clusterdists = {}
clustermins = []
clusterorders_dict = {}
clusterorders = np.zeros(X_kmeans.shape[0])

# Single pass over the labels; members end up in increasing row order.
for row, label in enumerate(kmeans.labels_):
    clusters[int(label)].append(row)

for x in range(nclusters):
    clusterdists[x] = [dists[i] for i in clusters[x]]
    # Sorting (distance, row) pairs breaks distance ties by the smaller row position.
    clusterorders_dict[x] = [y for _,y in sorted(zip(clusterdists[x],clusters[x]))]
    clustermins.append(clusterorders_dict[x][0])
    for rank, i in enumerate(clusterorders_dict[x], start=1):
        clusterorders[i] = rank

# Translate row positions into ids by direct indexing of X_main_ids.
main_row_id = list(enumerate(X_main_ids))
clusters_with_ids = {i: [X_main_ids[row] for row in clusters[i]] for i in range(len(clusters))}
centroids_with_ids = [X_main_ids[row] for row in clustermins]

####################################################################################

# Zero-based indices of the principal components drawn on the x and y axes.
f_ind_1 = 0
f_ind_2 = 1
x_min, x_max = X_kmeans[:, f_ind_1].min() - 2, X_kmeans[:, f_ind_1].max() + 2
y_min, y_max = X_kmeans[:, f_ind_2].min() - 2, X_kmeans[:, f_ind_2].max() + 2
plt.figure(figsize=(10,10))
extent = [x_min,x_max,y_min,y_max]
plt.xticks(fontsize=10) 
plt.yticks(fontsize=10)
# Axis labels follow the selected components rather than being fixed to PC1/PC2.
plt.xlabel(f"PC{f_ind_1+1}",fontsize=12.5)
plt.ylabel(f"PC{f_ind_2+1}",fontsize=12.5)
plt.locator_params(axis='y', nbins=8)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)
mapping = kmeans.labels_
# NOTE: "tab20c" provides 20 colours, so with more than 20 clusters several
# clusters are drawn in the same colour.
cMap = "tab20c"
plt.scatter(X_kmeans[:,f_ind_1], X_kmeans[:,f_ind_2],c=mapping,cmap=cMap,alpha=0.6,marker=".",s=750,edgecolor='white')

### plot ligand ids
#for i in range(0,nclusters):
#    for j in range(0,len(clusters_with_ids[i])):
#        plt.text(X_use[clusters[i][j],f_ind_1],X_use[clusters[i][j],f_ind_2]," "+str(clusters_with_ids[i][j]),color="blue",fontsize=12,ha="left")

### highlight centroids
#plt.scatter(X_use[clustermins,f_ind_1],X_use[clustermins,f_ind_2],c=range(nclusters),cmap=cMap,alpha=1,marker="D",s=60,edgecolor="black",linewidth=1)

### plot the subset
#plt.scatter(X_subset[:,f_ind_1], X_subset[:,f_ind_2],c='#000080',alpha=0.8,marker="D",s=150,edgecolor='white')

plt.tight_layout()
#plt.savefig("clustering.png",dpi=300)
plt.show()

**3D score plot**

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- imported but never referenced by name

# Seeds NumPy's global random state; nothing in this cell draws random numbers.
np.random.seed(19680801)

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
mapping = kmeans.labels_
cMap = "tab20c"

# All clustered points on the first three components, coloured by cluster label.
ax.scatter(
    *(X_kmeans[:, k] for k in range(3)),
    c=mapping, cmap=cMap, alpha=0.6, marker=".", s=300, edgecolor='white',
)
#ax.scatter(X_kmeans[clustermins,0],X_kmeans[clustermins,1],X_kmeans[clustermins,2],c=range(nclusters), marker="D",s=20,alpha=1,cmap=cMap,edgecolor="black",linewidth=0.5)

# Subset overlaid as navy diamonds.
ax.scatter(
    *(X_subset[:, k] for k in range(3)),
    c='#000080', alpha=0.8, marker="D", s=50, edgecolor='white',
)
ax.set(xlabel='PC1', ylabel='PC2', zlabel='PC3')
#plt.savefig("clustering 3D.png",dpi=300)
plt.show()

**Printing the centroids**

In [None]:
# SMILES strings of the cluster centroids, looked up by id in the identifiers
# table, and the molecules parsed from them (one per cluster, in cluster order).
centroid_smiles = np.array(inp2['SMILES'].loc[centroids_with_ids])
subms = [Chem.MolFromSmiles(smiles) for smiles in centroid_smiles]
count = len(subms)

# One legend per molecule: cluster number plus the id of its centroid.
# (A second, unused copy of this string with a missing space was removed.)
clusters_and_centroidswithids = list(enumerate(centroids_with_ids))
my_labels = [
    'cluster '+str(cluster_number)+' ligand id '+str(ligand_id)
    for cluster_number, ligand_id in clusters_and_centroidswithids
]

# Grid geometry: fixed panel size, as many rows as needed for `count` molecules.
molsPerRow = 5
subImgSize= (400,400)
nRows = -(count // -molsPerRow)                 # ceiling division
fullSize = (molsPerRow * subImgSize[0], nRows * subImgSize[1])

img = rdMolDraw2D.MolDraw2DCairo(fullSize[0],fullSize[1],subImgSize[0], subImgSize[1])
img.drawOptions().legendFontSize=30
img.DrawMolecules(subms[:],legends=my_labels[:])
img.SetFontSize(100)
img.FinishDrawing()
with open('name.png','wb+') as wfp:             # PNG is written to the current folder
    wfp.write(img.GetDrawingText())             # raw image bytes from the drawer
Image.open('name.png')                          # last expression: shows the image in the notebook

**Printing all the ligands in a cluster**

In [None]:
# Number of the cluster whose members are drawn.
cluster = 2

###########################################################################

# SMILES of every clustered row, in the same order as X_main_ids.
X_main_smiles = np.array(inp2['SMILES'].loc[X_main_ids])
mainset_row_smiles = list(enumerate(X_main_smiles))

# Cluster number -> SMILES of its members (same member order as `clusters`).
clusters_with_smiles = {
    label: [X_main_smiles[row] for row in clusters[label]]
    for label in range(len(clusters))
}
# Clusters without members are left out.
clusters_to_use = {k: v for k, v in clusters_with_smiles.items() if len(v) > 0}

# Molecules of the selected cluster and one legend per member id.
subms = [Chem.MolFromSmiles(smiles) for smiles in clusters_to_use[cluster]]
count = len(subms)
my_labels = ['cluster %s id %d' % (cluster, member_id) for member_id in clusters_with_ids[cluster]]

# Grid geometry: fixed panel size, rows rounded up to fit all molecules.
molsPerRow = 4
subImgSize= (400,400)
nRows = (count + molsPerRow - 1) // molsPerRow
fullSize = (molsPerRow * subImgSize[0], nRows * subImgSize[1])

img = rdMolDraw2D.MolDraw2DCairo(fullSize[0],fullSize[1],subImgSize[0], subImgSize[1])
img.drawOptions().legendFontSize=25
img.DrawMolecules(subms[:],legends=my_labels[:])
img.SetFontSize(100)
img.FinishDrawing()

# Write the PNG to the current folder, then display it as the cell output.
with open('name.png','wb+') as wfp:
    wfp.write(img.GetDrawingText())
Image.open('name.png')