# Initialization

In [None]:
import pandas as pd
import numpy as np
import umap
import umap.plot
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from bokeh.plotting import figure, show, save, output_notebook, output_file
# Random seed for the notebook's stochastic steps (presumably meant to be passed as
# random_state, e.g. to umap.UMAP).
randomstate = 42

# Load data

In [None]:
# Descriptor table: row labels come from the first column (index_col=0), column
# labels from the first row (header=0).
ci = pd.read_excel(
    "baran_acid_descriptors.xlsx",
    sheet_name="Sheet1",
    index_col=0,
    header=0,
    engine='openpyxl',
)

# Work on an explicit copy so the raw frame `ci` stays untouched, cast the row
# labels to int, then drop every row that has at least one missing descriptor.
compinp = ci.copy()
compinp.index = compinp.index.astype(int)
compinp = compinp.dropna(axis=0)

# Identifier table: column labels are read from the third row (header=2) and the
# row labels from the second column (index_col=1), cast to int so they can be
# matched against compinp's index.
inp2 = pd.read_excel(
    "baran_acid_identifiers.xlsx",
    sheet_name="Sheet1",
    index_col=1,
    header=2,
    engine='openpyxl',
)
inp2.index = inp2.index.astype(int)

**Select the subsets and scale the data**

In [None]:
# Slice the descriptor table once per group: rows of compinp for which the named
# inp2 column equals 1 (the boolean Series is matched to compinp by index label).
sel_screened = compinp.loc[inp2["screened"] == 1]
sel_best = compinp.loc[inp2["best mediators"] == 1]
sel_first = compinp.loc[inp2["first"] == 1]
sel_second = compinp.loc[inp2["second"] == 1]
sel_third = compinp.loc[inp2["third"] == 1]
sel_rest = compinp.loc[inp2["rest"] == 1]

# Unscaled descriptor matrices (one row per table row, one column per descriptor).
X_all = np.array(compinp)
X_screened = np.array(sel_screened)
X_best = np.array(sel_best)
X_first = np.array(sel_first)
X_second = np.array(sel_second)
X_third = np.array(sel_third)
X_rest = np.array(sel_rest)

# Row labels (integer IDs) of each group, in the same order as the matrices above.
X_all_ids = np.array(compinp.index)
X_screened_ids = np.array(sel_screened.index)
X_best_ids = np.array(sel_best.index)
X_first_ids = np.array(sel_first.index)
X_second_ids = np.array(sel_second.index)
X_third_ids = np.array(sel_third.index)
X_rest_ids = np.array(sel_rest.index)

# Names looked up in the identifier table by ID.
X_all_names = np.array(inp2.loc[X_all_ids, "name"])
X_screened_names = np.array(inp2.loc[X_screened_ids, 'name'])
X_best_names = np.array(inp2.loc[X_best_ids, 'name'])
X_first_names = np.array(inp2.loc[X_first_ids, 'name'])
X_second_names = np.array(inp2.loc[X_second_ids, 'name'])
X_third_names = np.array(inp2.loc[X_third_ids, 'name'])
X_rest_names = np.array(inp2.loc[X_rest_ids, 'name'])

X_all_smiles = np.array(inp2.loc[X_all_ids, 'SMILES'])

# Result columns for the screened / best groups.
X_screen_results = np.array(inp2.loc[X_screened_ids, 'screen results'])
# NOTE(review): 'screen results binary' is looked up with X_best_ids, not
# X_screened_ids -- confirm this is intended.
X_binary_results = np.array(inp2.loc[X_best_ids, 'screen results binary'])
X_best_results = np.array(inp2.loc[X_best_ids, 'best results'])

# Fit the scaler on the full table only, then apply that same transform to every
# group so all scaled matrices share one mean/variance reference.
scaler = StandardScaler().fit(X_all)
(
    X_all_sc,
    X_screened_sc,
    X_best_sc,
    X_first_sc,
    X_second_sc,
    X_third_sc,
    X_rest_sc,
) = (
    scaler.transform(block)
    for block in (X_all, X_screened, X_best, X_first, X_second, X_third, X_rest)
)

# Standard score plots

In [None]:
# Data set to embed (swap these two lines to embed a different group).
X_main_sc = X_all_sc
X_main_ids = X_all_ids

# 2-D UMAP embedding. The seed defined at the top of the notebook is passed in so
# the layout is identical on every run; note that umap-learn runs single-threaded
# when a fixed random_state is given.
reducer = umap.UMAP(random_state=randomstate)
X_all_red = reducer.fit_transform(X_main_sc)

# Embedding columns shown on the x and y axes.
f_ind_1 = 0
f_ind_2 = 1

# Axis limits: data range padded by 2 units on every side.
x_min, x_max = X_all_red[:, f_ind_1].min() - 2, X_all_red[:, f_ind_1].max() + 2
y_min, y_max = X_all_red[:, f_ind_2].min() - 2, X_all_red[:, f_ind_2].max() + 2

plt.figure(figsize=(8.5,8))
plt.xticks(fontsize=12.5)
plt.yticks(fontsize=12.5)
plt.xlabel(f"dimension {f_ind_1+1}",fontsize=12.5)
plt.ylabel(f"dimension {f_ind_2+1}",fontsize=12.5)
plt.locator_params(axis='y', nbins=10)
plt.locator_params(axis='x', nbins=10)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)

# Every embedded row as a small grey dot.
plt.scatter(X_all_red[:,f_ind_1], X_all_red[:,f_ind_2],c="grey",alpha=1,marker=".",s=10)

plt.tight_layout()
# To write the figure to disk, enable savefig *before* plt.show(); after show()
# the inline figure has already been closed.
#plt.savefig("new1.png",dpi=300)
plt.show()

**Plot two sets**

In [None]:
# Data set to embed and the group of IDs to highlight on top of it.
X_main_sc = X_all_sc
X_main_ids = X_all_ids
X_subset_ids = X_screened_ids

# 2-D UMAP embedding, seeded with the notebook-level constant so the layout is
# reproducible (umap-learn runs single-threaded when random_state is fixed).
reducer = umap.UMAP(random_state=randomstate)
X_all_red = reducer.fit_transform(X_main_sc)

# Embedding columns shown on the x and y axes.
f_ind_1 = 0
f_ind_2 = 1

# Axis limits: data range padded by 2 units on every side.
x_min, x_max = X_all_red[:, f_ind_1].min() - 2, X_all_red[:, f_ind_1].max() + 2
y_min, y_max = X_all_red[:, f_ind_2].min() - 2, X_all_red[:, f_ind_2].max() + 2

plt.figure(figsize=(10,8))
plt.xticks(fontsize=12.5)
plt.yticks(fontsize=12.5)
plt.xlabel(f"dimension {f_ind_1+1}",fontsize=15)
plt.ylabel(f"dimension {f_ind_2+1}",fontsize=15)
plt.locator_params(axis='y', nbins=10)
plt.locator_params(axis='x', nbins=10)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)

# Split the embedded rows by membership in the highlighted group. Boolean-mask
# indexing keeps both arrays 2-D even when one of them is empty.
in_subset = np.isin(X_main_ids, X_subset_ids)
main_set = X_all_red[~in_subset]
subset = X_all_red[in_subset]
# IDs of the highlighted rows, in exactly the order they appear in `subset`.
subset_ids = np.asarray(X_main_ids)[in_subset]

# plot the main set as grey
plt.scatter(main_set[:,f_ind_1], main_set[:,f_ind_2],c='#E0E0E0',alpha=0.5,marker=".",s=100)

# plot the subset as blue
#plt.scatter(subset[:,f_ind_1], subset[:,f_ind_2],c='#000080',alpha=1,marker='.',s=150,edgecolor='white')

# plot the subset using a colormap based on parameter values (KAWA)
# The colour values are looked up by ID so that each highlighted point gets the
# DPFE value of its own row (previously the first 60 rows of the table were used
# regardless of which rows were actually in the subset).
mapping = compinp.loc[subset_ids, 'DPFE'].to_numpy()

# Alternative colouring by screening result:
#mapping = inp2.loc[subset_ids, 'screen results'].to_numpy()

plt.scatter(subset[:,f_ind_1], subset[:,f_ind_2],c=mapping,cmap='plasma',alpha=1,marker='.',s=250,edgecolor='white')

# Colour bar for the scatter drawn just above.
cbar = plt.colorbar()
cbar.set_label('DPFE (kJ/mol)',rotation=90,size=15,labelpad=20)
cbar.ax.tick_params(labelsize=12.5)

plt.tight_layout()
#plt.show()
#plt.savefig("dpfe.png",dpi=300)

**Plot multiple sets**

In [None]:
# Data set to embed and the four groups of IDs to highlight on top of it.
X_main_sc = X_all_sc
X_main_ids = X_all_ids
X_subset1_ids = X_first_ids
X_subset2_ids = X_second_ids
X_subset3_ids = X_third_ids
X_subset4_ids = X_rest_ids

# 2-D UMAP embedding, seeded with the notebook-level constant so the layout is
# reproducible (umap-learn runs single-threaded when random_state is fixed).
reducer = umap.UMAP(random_state=randomstate)
X_all_red = reducer.fit_transform(X_main_sc)

# Embedding columns shown on the x and y axes.
f_ind_1 = 0
f_ind_2 = 1

# Axis limits: data range padded by 2 units on every side.
x_min, x_max = X_all_red[:, f_ind_1].min() - 2, X_all_red[:, f_ind_1].max() + 2
y_min, y_max = X_all_red[:, f_ind_2].min() - 2, X_all_red[:, f_ind_2].max() + 2

plt.figure(figsize=(8.5,8))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(f"dimension {f_ind_1+1}",fontsize=12.5)
plt.ylabel(f"dimension {f_ind_2+1}",fontsize=12.5)
plt.locator_params(axis='y', nbins=10)
plt.locator_params(axis='x', nbins=10)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)

# Membership mask per group. A row goes to the background ("main") set only when
# it belongs to NONE of the four groups. The previous if/if/if/if/else chain tied
# the else to the last test alone, so rows of groups 1-3 were also drawn as
# background points.
in_subset1 = np.isin(X_main_ids, X_subset1_ids)
in_subset2 = np.isin(X_main_ids, X_subset2_ids)
in_subset3 = np.isin(X_main_ids, X_subset3_ids)
in_subset4 = np.isin(X_main_ids, X_subset4_ids)
in_any_subset = in_subset1 | in_subset2 | in_subset3 | in_subset4

# Boolean-mask indexing keeps every array 2-D even when a group is empty.
main_set = X_all_red[~in_any_subset]
subset1 = X_all_red[in_subset1]
subset2 = X_all_red[in_subset2]
subset3 = X_all_red[in_subset3]
subset4 = X_all_red[in_subset4]

# plot the main set as grey
plt.scatter(main_set[:,f_ind_1], main_set[:,f_ind_2],c='#E0E0E0',alpha=0.3,marker=".",s=150)

# plot the subset as different colors (reversed to get "first" on top)
plt.scatter(subset4[:,f_ind_1], subset4[:,f_ind_2],label='<2.0',c='#FFB75A',alpha=1,marker='.',s=300,edgecolor='white')
plt.scatter(subset3[:,f_ind_1], subset3[:,f_ind_2],label='2.4',c='#0000FF',alpha=1,marker='.',s=300,edgecolor='white')
plt.scatter(subset2[:,f_ind_1], subset2[:,f_ind_2],label='2.7',c='#CC0000',alpha=1,marker='.',s=300,edgecolor='white')
plt.scatter(subset1[:,f_ind_1], subset1[:,f_ind_2],label='5.0',c='#00CC00',alpha=1,marker='.',s=300,edgecolor='white')

plt.legend(loc='upper right',fontsize=12.5,title='selectivity',title_fontsize=15)

plt.tight_layout()
#plt.show()
#plt.savefig("reactivity2.png",dpi=300)

**3D plot**

In [None]:
# Imported for its side effect only (the name itself is unused); older matplotlib
# releases need it to make projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Data set to embed and the four groups of IDs to highlight. The groups are
# assigned in reverse order here so that "first" (subset4) is drawn last, on top.
X_main_sc = X_all_sc
X_main_ids = X_all_ids
X_subset1_ids = X_rest_ids
X_subset2_ids = X_third_ids
X_subset3_ids = X_second_ids
X_subset4_ids = X_first_ids

# 3-D UMAP embedding, seeded with the notebook-level constant (42) so the layout
# is reproducible.
reducer = umap.UMAP(n_components=3, random_state=randomstate)
X_all_red = reducer.fit_transform(X_main_sc)

# Membership mask per group. A row goes to the background ("main") set only when
# it belongs to NONE of the four groups. The previous if/if/if/if/else chain tied
# the else to the last test alone, so rows of groups 1-3 were also drawn as
# background points.
in_subset1 = np.isin(X_main_ids, X_subset1_ids)
in_subset2 = np.isin(X_main_ids, X_subset2_ids)
in_subset3 = np.isin(X_main_ids, X_subset3_ids)
in_subset4 = np.isin(X_main_ids, X_subset4_ids)
in_any_subset = in_subset1 | in_subset2 | in_subset3 | in_subset4

# Boolean-mask indexing keeps every array 2-D even when a group is empty.
main_set = X_all_red[~in_any_subset]
subset1 = X_all_red[in_subset1]
subset2 = X_all_red[in_subset2]
subset3 = X_all_red[in_subset3]
subset4 = X_all_red[in_subset4]

# Seeds NumPy's global RNG. Nothing in this cell draws from it directly; kept so
# that cells executed afterwards see the same global RNG state as before.
np.random.seed(19680801)
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')

# Background rows, almost fully transparent.
ax.scatter(main_set[:,0],main_set[:,1],main_set[:,2], marker=".",s=150,alpha=0.005,c='grey')

# Highlighted groups, drawn from "rest" up to "first".
ax.scatter(subset1[:,0],subset1[:,1],subset1[:,2],c='#FFCC99',alpha=0.5,marker=".",s=200,edgecolor="white",linewidth=0.5)
ax.scatter(subset2[:,0],subset2[:,1],subset2[:,2],c='#0000FF',alpha=1,marker=".",s=200,edgecolor="white",linewidth=0.5)
ax.scatter(subset3[:,0],subset3[:,1],subset3[:,2],c='#CC0000',alpha=1,marker=".",s=200,edgecolor="white",linewidth=0.5)
ax.scatter(subset4[:,0],subset4[:,1],subset4[:,2],c='#00CC00',alpha=1,marker=".",s=200,edgecolor="white",linewidth=0.5)

# Faint black axis panes, no grid lines.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.pane.set_color('black')
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.pane.set_alpha(0.03)

ax.grid(False)

ax.set_xlabel('dimension 1')
ax.set_ylabel('dimension 2')
ax.set_zlabel('dimension 3')

#plt.savefig("umap 3d.png",dpi=300)
plt.show()


# Interactive plots

In [None]:
# Data set to embed, with the IDs and names shown in the hover tooltip.
X_main_sc = X_all_sc
X_main_ids = X_all_ids
X_main_names = X_all_names

# Fitted UMAP model; seeded with the notebook-level constant so the interactive
# layout is reproducible, like the seeded 3-D embedding.
embedding = umap.UMAP(random_state=randomstate).fit(X_main_sc)

# One tooltip row per embedded row, in the same order as X_main_sc.
hover_df = pd.DataFrame({'ID':X_main_ids,'Name':X_main_names})
int_plot = umap.plot.interactive(embedding, hover_data=hover_df)

# NOTE(review): bokeh's output_notebook() is imported but never called in the
# visible cells -- confirm whether the plot is meant to render inline.
show(int_plot)

**Color based on a subset**

In [None]:
# Data set to embed, its tooltip fields, and the group of IDs used for colouring.
X_main_sc = X_all_sc
X_main_ids = X_all_ids
X_main_names = X_all_names
X_main_smiles = X_all_smiles
X_subset_ids = X_screened_ids

# Fitted UMAP model; seeded with the notebook-level constant so the interactive
# layout is reproducible, like the seeded 3-D embedding.
embedding = umap.UMAP(random_state=randomstate).fit(X_main_sc)

# True for every embedded row whose ID is in the subset, False otherwise
# (a plain list of Python bools, one entry per row of X_main_ids).
X_subset_ids_TF = np.isin(X_main_ids, X_subset_ids).tolist()

# One tooltip row per embedded row; the 'Subset' column doubles as the point label
# that drives the colouring.
hover_df = pd.DataFrame({'ID':X_main_ids,'Name':X_main_names,'Subset':X_subset_ids_TF,'SMILES':X_main_smiles})
int_plot = umap.plot.interactive(embedding, theme='viridis', labels=hover_df['Subset'], hover_data=hover_df)

# NOTE(review): bokeh's output_notebook() is imported but never called in the
# visible cells -- confirm whether the plot is meant to render inline.
show(int_plot)