In [None]:
import math
import os
import re

import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdkit.Chem.Descriptors as Descriptors
import seaborn as sns
from matplotlib import gridspec
from rdkit import Chem
from scipy.special import kl_div
from scipy.stats import gaussian_kde
from scipy.stats import pearsonr

# Plot for Methods

In [None]:
# Three-panel overview figure: target histogram, molecular-weight histogram,
# and number of molecules per category folder.
plt.figure(figsize=(12,6), dpi=500)
fontsize = 12

# Panel 1 (top-left): histogram of the 'IE' column in 0.25-wide bins on [5, 15).
plt.subplot(221)
df = pd.read_csv('input_csv_path')

# NOTE(review): not read anywhere in this cell; kept in case another cell uses it.
# Presumably per-element occurrence counts -- confirm before relying on it.
element_dict = {'B': 10,'C': 2142,'N': 531,'O': 700,'F': 148,'Si': 55,'P': 46,'S': 191,'Cl': 141, 'Ge': 2,'As': 1, 'Se': 6,'Br': 88,'I': 51}

plt.hist(df['IE'], bins=np.arange(5, 15,0.25),color='lightblue', ec='black')
plt.ylabel('Number of Chemicals', fontsize=fontsize)
plt.xlabel('IE / eV', fontsize=fontsize)
plt.xlim([5,15])

# Panel 2 (top-right): exact molecular weight computed by RDKit from each SMILES.
plt.subplot(222)
weight = [Descriptors.ExactMolWt(Chem.MolFromSmiles(smi)) for smi in df['smiles']]

plt.hist(weight, bins=np.arange(0, 700, 10), color='lightblue', ec='black')
plt.xticks(fontsize=fontsize)
plt.ylabel('Number of Chemicals', fontsize=fontsize)
plt.xlabel('Relative Molecular Weight', fontsize=fontsize)
plt.xlim([0,700])

# Panel 3 (bottom row): one bar per category folder, height = number of PNG files.
plt.subplot(212)
cat_path = 'catagorized_img_path'
cat_dict = {}
# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))

x = list(cat_dict)                                   # name of functional group
y = [len(members) for members in cat_dict.values()]  # count of molecules with the functional group

plt.bar(x,y,color='lightblue', ec='black')
# NOTE(review): axis limits are hard-coded (bar positions 0..53, counts < 320);
# adjust them if the number or size of categories changes.
plt.ylim([0,320])
plt.ylabel('Number of Chemicals', fontsize=fontsize)
plt.xticks(rotation=90,fontsize=fontsize)
plt.xlim([-1,54])
# Print each count just above its bar; categorical bars sit at 0, 1, 2, ...
for x0, yi in enumerate(y):
    plt.text(x0,yi+2,yi, ha='center', va='bottom',fontsize=7)

plt.subplots_adjust(wspace=0.2,hspace=0.3)
plt.tight_layout()
plt.savefig('output_png_path',dpi=500)
plt.show()

# Plot for Metrics of Three datasets

In [None]:
# Load the three datasets and print max / min / mean / median of each target column.
ie = pd.read_csv('input_ie_csv_path')
lipo = pd.read_csv('input_lipophilicity_csv_path')
free = pd.read_csv('input_freesolv_csv_path')

print('Data\tMAX\tMIN\tMean\tMedian')
for dataset_label, target in (
    ('IE', ie['IE']),
    ('Lipophilicity', lipo['lipophilicity']),
    ('Freesolv', free['exp']),
):
    print(dataset_label, max(target), min(target), np.mean(target), np.median(target))


# plot Distribution of Target
# One entry per panel: (values, histogram bin edges, x-axis label, x-axis limits).
target_panels = [
    (ie['IE'], np.arange(5, 15,0.25), 'IE / eV', [5,15]),
    (lipo['lipophilicity'], np.arange(-2, 5,0.175), 'LogD', [-2,5]),
    (free['exp'], np.arange(-26, 1,0.675), 'Hydration Free Energies / kcal/mol', [-26,1]),
]

plt.figure(figsize=(12,4))
for panel, (values, bin_edges, x_label, x_limits) in enumerate(target_panels, start=1):
    plt.subplot(1,3,panel)
    plt.hist(values, bins=bin_edges, color='lightblue', ec='black')
    plt.xlabel(x_label, fontsize=12)
    plt.xlim(x_limits)
    plt.ylabel('Number of Chemicals', fontsize=12)
plt.tight_layout()
plt.savefig('output_png_path',dpi=328)
plt.show()
# plot MW
# One entry per panel: (SMILES column, histogram bin edges, x-axis limits).
# NOTE(review): the IE table is read through column 'smiles' while the other two
# use 'smile'; confirm these match the actual CSV headers.
mw_panels = [
    (ie['smiles'], np.arange(0, 650,16.25), [0,650]),
    (lipo['smile'], np.arange(100, 1700,40), [100,1700]),
    (free['smile'], np.arange(0,500,12.5), [0,500]),
]

plt.figure(figsize=(12,4))
for panel, (smiles_column, bin_edges, x_limits) in enumerate(mw_panels, start=1):
    plt.subplot(1,3,panel)
    # Exact molecular weight computed by RDKit from each SMILES string.
    weight = [Descriptors.ExactMolWt(Chem.MolFromSmiles(smi)) for smi in smiles_column]
    print(max(weight))
    plt.hist(weight,bins=bin_edges,color='lightblue', ec='black')
    plt.xlabel('Molecular Weight', fontsize=12)
    plt.xlim(x_limits)
    plt.ylabel('Number of Chemicals', fontsize=12)
plt.tight_layout()
# Fixed: the previous call had a stray '(' and was a SyntaxError, which stopped
# the whole cell from running.
plt.savefig('output_png_path',dpi=328)
plt.show()

# plot Functional group distribution
# One bar per category folder, height = number of PNG files found inside it.
plt.figure(figsize=[15,5])
cat_path = 'catagorized_images_path'
cat_dict = {}
# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))

x = list(cat_dict)                                   # name of functional group
y = [len(members) for members in cat_dict.values()]  # count of molecules with the functional group

plt.bar(x,y,color='lightblue', ec='black')
# NOTE(review): axis limits are hard-coded (bar positions 0..53, counts < 320);
# adjust them if the number or size of categories changes.
plt.ylim([0,320])
plt.ylabel('Number of Chemicals', fontsize=12)
plt.xticks(rotation=90)
plt.xlim([-1,54])
# Print each count just above its bar; categorical bars sit at 0, 1, 2, ...
for x0, yi in enumerate(y):
    plt.text(x0,yi+2,yi, ha='center', va='bottom')
plt.tight_layout()
plt.savefig('output_png_path',dpi=328)
plt.show()

# Plot for data preparation

In [None]:
# Line plot with one curve per row of the table: the first column ('Preparation')
# names the curve, the remaining columns are the x positions.
df = pd.read_csv('input_csv_path')
plt.figure(figsize=(8,4))

value_columns = df.columns[1:]
# One x position per value column (was a hard-coded range(6)).
x = list(range(len(value_columns)))
plt.xticks(x, list(value_columns))

# One colour per row, in row order; rows beyond the sixth are not drawn, matching
# the six curves this figure has always shown.
line_colors = ['b', 'y', 'g', 'r', 'cyan', 'brown']
for row_values, line_color in zip(df.iloc[:, 1:].values, line_colors):
    # '.-' = point markers joined by a solid line.  The former 'm.-' also named a
    # colour (magenta) that the color= keyword overrode, which made matplotlib
    # emit a "redundantly defined" warning for every curve.
    plt.plot(x, row_values, '.-', color=line_color)

plt.ylabel('RMSE')
plt.legend(list(df['Preparation']))
plt.tight_layout()
plt.savefig('output_png_path', dpi=328)
plt.show()

# Plot of each descriptor's linear correlation with the target (descriptor vs. experimental IE)

In [None]:
# Descriptors shown in the 5 x 4 grid, in panel order (row by row).
descriptors = ['CIC0', 'TIC1', 'TIC2', 'Sp','ATSC0v',
               'nBonds', 'ATSC1d', 'SMR', 'ATS1i', 'nC',
               'TIC3', 'nAtom', 'Sv', 'ATS1v',  'TpiPC10', 
               'SRW02', 'ATS1p', 'VMcGowan', 'TIC4', 'piPC4']

dataset = pd.read_csv('input_csv_path')
y = dataset['IE']

plt.figure(figsize=(15, 25), dpi=328)
for panel, descriptor in enumerate(descriptors, start=1):
    plt.subplot(5, 4, panel)
    x = dataset[descriptor]
    plt.scatter(x, y, s=2)
    plt.legend(labels=[descriptor], fontsize=18)

    # Only the first column of the grid keeps its y axis; the panels touch
    # horizontally (wspace=0 below), so the others drop their ticks.
    is_first_column = panel % 4 == 1
    if is_first_column:
        plt.ylabel('Experimental IE / eV')
    else:
        plt.yticks([])

    # Degree-1 least-squares fit of IE against the descriptor, drawn in red.
    fit_coefficients = np.polyfit(x, y, 1)
    x_tick = np.arange(min(x), max(x) + 1)
    plt.plot(x_tick, np.polyval(fit_coefficients, x_tick), color='red')

plt.subplots_adjust(wspace=0, hspace=0.1)
plt.savefig('output_png_path', bbox_inches='tight')
plt.show()

# Plot for RMSE Histogram

# Plot for RMSE as a function of functional group, together with the functional-group distribution

In [None]:
# Build cat_dict (category folder name -> identifiers taken from its PNG file
# names) and load the result tables used by the metrics computation.
cat_path = 'catagorized_image_path'
cat_dict = {}

# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))


# Predictions of one model (one sheet of the results workbook).
ml_result = pd.read_excel('input_excel_path',sheet_name='Name_of_machine_learning_method')

# Full dataset; used to translate SMILES into 'CAS Link' identifiers.
ref_df = pd.read_csv('input_full_dataset_csv_path')
# Filled later in this cell: functional group -> list of metric values.
metrics = {}

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted with ``np.asarray`` and compared position by
    position, so plain Python lists are accepted as well as NumPy arrays.
    Empty inputs yield ``nan`` (mean of an empty array).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Total number of category memberships (a compound filed under several
# categories is counted once per category); denominator of the fraction below.
sum_of_compounds = sum(len(cas) for cas in cat_dict.values())
print(sum_of_compounds)

# Two result layouts are supported.  Which one applies is decided from the
# columns that are present, rather than by catching whatever exception the
# first layout happens to raise (the former bare `except:`):
#   * 'CAS Link' / 'IE' / 'Predicted IE'  -- rows already carry the identifier
#   * 'SMILES' / 'LABELS' / 'PREDICTIONS' -- identifier is looked up in ref_df
if 'CAS Link' in ml_result.columns:
    row_cas = ml_result['CAS Link']
    exp_col, pred_col = 'IE', 'Predicted IE'
else:
    # First 'CAS Link' per SMILES, mirroring the former `.values[0]` lookup.
    # SMILES missing from ref_df map to NaN and therefore match no category.
    smiles_to_cas = ref_df.drop_duplicates(subset='smiles').set_index('smiles')['CAS Link']
    row_cas = ml_result['SMILES'].map(smiles_to_cas)
    exp_col, pred_col = 'LABELS', 'PREDICTIONS'

for fg, cas in cat_dict.items():
    # Rows of the result table whose compound belongs to this functional group.
    # Each matching row contributes its own values (previously a duplicated
    # identifier re-used the values of its first occurrence).
    in_group = row_cas.isin(cas)
    exp = ml_result.loc[in_group, exp_col].to_numpy(dtype=float)
    pred = ml_result.loc[in_group, pred_col].to_numpy(dtype=float)

    group_rmse = rmse(pred, exp) if exp.size else np.nan
    # pearsonr needs at least two samples.  The guard used to test len(cas), so a
    # group with several members but fewer than two matched rows raised inside
    # the try block and then crashed in the fallback branch.
    group_r2 = np.square(pearsonr(exp, pred)[0]) if exp.size >= 2 else 0
    # NOTE(review): kl_div is applied element-wise to the raw values, not to
    # normalised distributions -- confirm this is the intended quantity.
    group_kl = kl_div(exp, pred).sum()
    fraction = len(cas) / sum_of_compounds if sum_of_compounds else 0.0

    # metrics = {functional groups: [RMSE, R2, Kullback–Leibler divergence, fraction of compounds]}
    metrics[fg] = [group_rmse, group_r2, group_kl, fraction]

# One column per functional group; rows 0..3 follow the list order above.
metrics = pd.DataFrame(metrics)
metrics


In [None]:
# Reorder the columns (one per functional group) by the values in row 3 -- the
# last entry of each metrics list, len(cas)/sum_of_compounds -- in ascending order.
metrics = metrics.sort_values(by=3, axis=1)

In [None]:
# Rows of `metrics`, in the order they were stored:
# 0 = RMSE, 1 = squared Pearson r, 2 = K-L divergence, 3 = fraction of compounds.
fgs_rmse = metrics.loc[0].values
fgs_R2 = metrics.loc[1].values
fgs_kl = metrics.loc[2].values
fgs_fraction = metrics.loc[3].values

fgs = metrics.columns

plt.figure(figsize=(10, 15), dpi=500)

# Left panel: fraction of compounds per functional group.  The x axis is
# reversed (0.14 -> 0) so the bars grow leftwards, away from the shared labels.
# NOTE(review): ylim is hard-coded for bar positions 0..53; adjust if the number
# of functional groups changes.
plt.subplot(1, 2, 1)
plt.ylim([-1,54])
plt.xlim([0.14,0])
plt.barh(fgs, fgs_fraction, color='orange', ec='black')
plt.xlabel('Fraction of Functional Groups')  # was 'Function Groups' (typo)

# Right panel: RMSE bars with the squared Pearson r and K-L divergence overlaid.
plt.subplot(1, 2, 2)
plt.ylim([-1,54])
plt.barh(fgs,fgs_rmse, color='lightblue', ec='black', label='RMSE')
# Write the rounded RMSE to the right of each bar; bar k sits at y = k and the
# text is anchored 0.4 below it with va='bottom'.
for position, value in enumerate(fgs_rmse):
    rounded = round(value,2)
    plt.text(rounded+0.125,position-0.4,rounded, ha='center', va='bottom')

# Labels are attached to the artists themselves instead of being passed to
# legend() positionally, so they cannot drift if the plotting order changes.
plt.plot(fgs_R2,fgs, color='red', label='Pearson R square')  # was 'Pearon' (typo)
plt.plot(fgs_kl,fgs, color='green', label='K-L Divergence')
plt.xlim([0, 2.5])
plt.legend(loc='center right')
plt.yticks([])
plt.subplots_adjust(wspace=0.05, hspace=0.1)
plt.xlabel('RMSE / eV')
plt.savefig('output_png_path', bbox_inches='tight',dpi=500)
plt.show()

## Plot for RMSE of a particular group against ML models

In [None]:
# Root folder of the categorised images and the mapping filled by the
# directory walk later in this cell.
cat_path = 'catagorized_image_path'
cat_dict = {}

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted with ``np.asarray`` and compared position by
    position, so plain Python lists are accepted as well as NumPy arrays.
    Empty inputs yield ``nan`` (mean of an empty array).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))

# Functional groups that get their own per-algorithm RMSE figure.
plot_fg = ['alkene', 'radical']

# Model names, in the order the bars are drawn.
algorithms = ['RF', 'ET', 'Bagging', 'SVR','kNN','XGB', 'MLP', 'NN', 'GCN', 'MPNN', 'GAT', 'AttentiveFP', 'Weave', 'NF']

# Full dataset; used to translate 'CAS Link' identifiers into SMILES.
ref_df = pd.read_csv('input_full_dataset_csv_path')

In [None]:
# For every functional group in plot_fg, draw one bar per algorithm showing the
# RMSE restricted to the molecules of that group, plus a line at the mean.

# Models stored as sheets of one workbook and looked up by 'CAS Link'; all other
# algorithms are stored as one CSV each and looked up by 'SMILES'.
classical_algorithms = ['RF', 'ET', 'Bagging', 'SVR','kNN','XGB', 'MLP', 'NN']

# Load every result table once instead of once per functional group.
ml_results = {}
for algorithm in algorithms:
    if algorithm in classical_algorithms:
        ml_results[algorithm] = pd.read_excel('input_excel_path', sheet_name=algorithm)
    else:
        ml_results[algorithm] = pd.read_csv('input_csv_path_{}.csv'.format(algorithm))

for fg in plot_fg:
    print(fg)
    plt.figure(figsize=(6,5), dpi=500)

    cas = cat_dict[fg]
    # SMILES of every compound in the group, in the same order as `cas`.
    smiles = [ref_df.loc[ref_df['CAS Link']==c]['smiles'].values[0] for c in cas]

    rmse_algo = []
    for algorithm in algorithms:
        ml_result = ml_results[algorithm]
        exp = []
        pred = []
        if algorithm in classical_algorithms:
            for c in cas:
                row = ml_result.loc[ml_result['CAS Link']==c]
                exp.append(float(row['IE'].values[0]))
                pred.append(float(row['Predicted IE'].values[0]))
        else:
            for smi in smiles:
                row = ml_result.loc[ml_result['SMILES']==smi]
                if row.empty:
                    # Molecule absent from this model's results: skip it.  Only
                    # this case is tolerated now; the former bare `except:` also
                    # hid missing columns / bad values and could leave exp and
                    # pred with different lengths.
                    continue
                exp.append(float(row['LABELS'].values[0]))
                pred.append(float(row['PREDICTIONS'].values[0]))
        rmse_algo.append(rmse(np.array(pred),np.array(exp)))

    plt.ylim([0,2])
    plt.ylabel('RMSE',fontsize=12)
    plt.xticks(rotation=60,fontsize=12,color='white')
    lg = 'lightgreen'
    lb = 'lightblue'
    # Workbook-based models in green, CSV-based models in blue.
    color = [lg if algorithm in classical_algorithms else lb for algorithm in algorithms]
    plt.bar(algorithms,rmse_algo, color=color, ec='black')

    plt.axhline(y=np.mean(rmse_algo),color="red")
    plt.legend(['Mean RMSE = {}'.format(round(np.mean(rmse_algo),2)), 'RMSE of Algorithms'],loc='upper right',fontsize=12)
    plt.tight_layout()
    plt.savefig('output_png_path_{}.png'.format(fg),bbox_inches='tight',dpi=500)
    plt.show()

## Predictions vs. labels scatter plot, colored by functional group

In [None]:
def plot_scatter(predict, experiment, regressor,line_split, property, cat_dict):
    """Scatter predicted vs. experimental values, one colour per functional group.

    The plotted points are read from the results CSV of ``regressor`` (columns
    'SMILES', 'LABELS', 'PREDICTIONS'); ``predict`` and ``experiment`` are only
    used to derive the common axis limits.  The figure is saved to a PNG whose
    name contains ``regressor`` and is then shown.

    NOTE: other cells of this notebook define functions with the same name but
    different parameters; the most recently executed definition wins.

    Parameters
    ----------
    predict, experiment : iterables of numbers
        Values whose combined floor(min) / ceil(max) give the axis limits.
    regressor : str
        Model name inserted into the input CSV and output PNG file names.
    line_split : number
        Offset of the two lines drawn parallel to the diagonal.
    property : str
        Property name used in the axis labels.
    cat_dict : dict
        Functional group -> list of 'CAS Link' identifiers.
    """
    # One colour per group: red and blue channels fixed, green channel stepping
    # from 0 towards 250.  After the reversal the highest green value comes first.
    n_groups = len(cat_dict)
    values = [int(i*250/n_groups) for i in range(n_groups)]
    colors = ["#%02x%02x%02x"%(200,int(g),40)for g in values]
    colors.reverse()

    # Common axis range covering both the predictions and the experiments.
    num_list = [*predict, *experiment]
    mini=math.floor(min(num_list))
    maxi=math.ceil(max(num_list))

    fig=plt.figure(figsize=(10,10))
    ax = fig.add_subplot()
    ref_df = pd.read_csv('input_full_dataset_path')

    # Order the groups by size (ascending) and hand out the colours in that order.
    color_df = pd.DataFrame({fg: [len(cas)] for fg, cas in cat_dict.items()})
    color_df = color_df.sort_values(by=0, axis=1)
    color_sort_dict = {}
    for i in range(n_groups):
        color_sort_dict[color_df.columns[i]] = colors[i]

    ml_result = pd.read_csv('input_result_csv_path{}_10fold.csv'.format(regressor))

    for fg,cas in cat_dict.items():
        smiles = [ref_df.loc[ref_df['CAS Link']==c]['smiles'].values[0] for c in cas]
        exp = []
        pred = []
        for smi in smiles:
            row = ml_result.loc[ml_result['SMILES']==smi]
            if row.empty:
                # Molecule absent from the results file: skip it.  Only this case
                # is tolerated; the former bare `except:` also hid missing
                # columns and could leave exp and pred with different lengths.
                continue
            exp.append(float(row['LABELS'].values[0]))
            pred.append(float(row['PREDICTIONS'].values[0]))
        ax.scatter(exp,pred,s=20, c=color_sort_dict[fg])

    # Diagonal (perfect prediction) and two parallels offset by line_split.
    line = mlines.Line2D([mini,maxi], [mini, maxi], color='red')
    line1 = mlines.Line2D([mini,maxi - line_split], [mini + line_split, maxi], color='red')
    line2 = mlines.Line2D([mini + line_split,maxi], [mini, maxi - line_split], color='red')
    ax.add_line(line)
    ax.add_line(line1)
    ax.add_line(line2)
    plt.xlabel("Experimental {}".format(property))
    plt.ylabel("Predicted {}".format(property))
    plt.xlim(mini,maxi)
    plt.ylim(mini,maxi)
    plt.tight_layout()
    plt.savefig('output_png_path_{}.png'.format(regressor), dpi=328)
    plt.show()


# Root folder of the categorised images and the mapping filled by the
# directory walk later in this cell.
cat_path = 'catagorized_images_path'
cat_dict = {}

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted with ``np.asarray`` and compared position by
    position, so plain Python lists are accepted as well as NumPy arrays.
    Empty inputs yield ``nan`` (mean of an empty array).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))

# Draw the functional-group-coloured scatter plot for one model.  The columns
# loaded here only set the axis range; plot_scatter re-reads the results file
# itself to obtain the plotted points.
# NOTE(review): plot_scatter builds its own results path from the model name;
# confirm that it resolves to the same file as the one read here.
algorithm = 'AttentiveFP'
ml_result = pd.read_csv('input_result_csv_path_{}_10fold.csv'.format(algorithm))
exp = ml_result['LABELS']
pred = ml_result['PREDICTIONS']
plot_scatter(pred,exp,algorithm,1.,'IE', cat_dict)

## plot the scatter for GNNs

In [None]:
def plot_scatter(predict, experiment, regressor,line_split, property):
    """Scatter predicted against experimental values with three red guide lines.

    The guide lines are the diagonal and two parallels offset by ``line_split``;
    their end points come from floor(min) / ceil(max) of all values, while the
    visible axis range is fixed to 5..15.  The figure is saved to a PNG whose
    name contains ``regressor`` and is then shown.  ``property`` is the text
    used in both axis labels.
    """
    combined = [*predict, *experiment]
    mini = math.floor(min(combined))
    maxi = math.ceil(max(combined))

    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot()
    plt.scatter(experiment, predict, s=3, c='black')

    # (x end points, y end points): diagonal, upper parallel, lower parallel.
    guide_lines = (
        ([mini, maxi], [mini, maxi]),
        ([mini, maxi - line_split], [mini + line_split, maxi]),
        ([mini + line_split, maxi], [mini, maxi - line_split]),
    )
    for x_ends, y_ends in guide_lines:
        ax.add_line(mlines.Line2D(x_ends, y_ends, color='red'))

    plt.xlabel("Experimental {}".format(property))
    plt.ylabel("Predicted {}".format(property))
    plt.xlim(5,15)
    plt.ylim(5,15)
    plt.tight_layout()
    plt.savefig('output_png_path{}_10foldcv.png'.format(regressor), dpi=500)
    plt.show()

# One predicted-vs-experimental scatter figure per graph model.
gnn_models = ('AttentiveFP', 'GAT', 'GCN', 'MPNN', 'NF', 'Weave')
for i in gnn_models:
    csv_dir = 'input_result_csv_path{}.csv'.format(i)
    df = pd.read_csv(csv_dir)
    plot_scatter(predict=df['PREDICTIONS'], experiment=df['LABELS'],
                 regressor=i, line_split=1., property='IE')

## Scatter plots for individual functional groups

In [None]:
# Root folder whose sub-folders are treated as categories by the directory walk
# later in this cell.
cat_path = 'catagorized_images_path'
# Filled by that walk: folder name -> list of PNG file names without '.png'.
cat_dict = {}
# Reference table; later code in this cell selects rows by 'CAS Link' and reads
# their 'smiles' value.
ref_df = pd.read_csv('input_full_dataset_csv_path')

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted with ``np.asarray`` and compared position by
    position, so plain Python lists are accepted as well as NumPy arrays.
    Empty inputs yield ``nan`` (mean of an empty array).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Only the immediate sub-folders of cat_path are categories.  Walking the whole
# tree for category names (as done previously) registered nested folders as
# empty categories and could reset a real category on a name collision.
top_level_dirs = next(os.walk(cat_path), (cat_path, [], []))[1]
for catagory in top_level_dirs:
    cat_dict[catagory] = []
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    cas_path = os.path.join(cat_path, catagory)
    for _dirpath, _dirnames, filenames in os.walk(cas_path):
        for png in filenames:
            # Text before '.png' is the identifier; non-PNG files are skipped
            # instead of raising IndexError.
            match = re.search(r'(.*)\.png', png)
            if match:
                cat_dict[catagory].append(match.group(1))

def plot_scatter(predict, experiment, regressor,line_split, property, fg):
    """Scatter predicted against experimental values on fixed 6..14 axes.

    Draws the diagonal and two parallels offset by ``line_split`` in red, saves
    the figure to a PNG whose name contains ``fg`` and ``regressor``, and shows
    it.  No axis labels are set; ``property`` is accepted for signature
    compatibility with the callers but is not used.

    NOTE: other cells of this notebook define functions with the same name but
    different parameters; the most recently executed definition wins.
    """
    # Fixed axis range.  The list that used to merge `predict` and `experiment`
    # here was never read, so it has been removed.
    mini=6
    maxi=14

    fig=plt.figure(figsize=(5,5)) 
    ax = fig.add_subplot()
    plt.scatter(experiment,predict,s=30,c='black')
    # Diagonal (perfect prediction) and two parallels offset by line_split.
    line = mlines.Line2D([mini,maxi], [mini, maxi], color='red')
    line1 = mlines.Line2D([mini,maxi - line_split], [mini + line_split, maxi], color='red')
    line2 = mlines.Line2D([mini + line_split,maxi], [mini, maxi - line_split], color='red')
    ax.add_line(line)
    ax.add_line(line1)
    ax.add_line(line2)
    plt.xlim(mini,maxi)
    plt.ylim(mini,maxi)
    plt.tight_layout()
    plt.savefig('output_png_path_{}_{}_10foldcv.png'.format(fg, regressor), dpi=500)
    plt.show()


# For each selected functional group, plot predicted vs. experimental values of
# its molecules for one CSV-based model and one workbook-based model.
for fg in ['alkene','radical']:
    cas_list = cat_dict[fg]
    # SMILES of every compound in the group, looked up by 'CAS Link'.
    smiles = [ref_df.loc[ref_df['CAS Link']==c]['smiles'].values[0] for c in cas_list]

    # CSV results: columns 'SMILES', 'PREDICTIONS', 'LABELS'.
    for i in ['AttentiveFP']:
        csv_dir = 'input_csv_path' 
        df = pd.read_csv(csv_dir)
        preds = np.array([df.loc[df['SMILES']==smi]['PREDICTIONS'].values[0] for smi in smiles])
        exp = np.array([df.loc[df['SMILES']==smi]['LABELS'].values[0] for smi in smiles])
        plot_scatter(preds, exp, i, 1., 'IE', fg)
    
    # Workbook results: columns 'smiles', 'Predicted IE', 'IE'.
    for i in ['SVR']:
        df = pd.read_excel('input_excel_path', sheet_name='SVR')
        preds = np.array([df.loc[df['smiles']==smi]['Predicted IE'].values[0] for smi in smiles])
        exp = np.array([df.loc[df['smiles']==smi]['IE'].values[0] for smi in smiles])
        plot_scatter(preds, exp, i, 1., 'IE', fg)

## Data selection

In [None]:
# Grouped bar chart: one group per row of the table, one bar per column.
data = pd.read_csv('input_csv_path', index_col=0)
print(data)
plt.figure(figsize=(10,5),dpi=328)

# Left-most bar of each of the seven groups sits at 10, 20, ..., 70; every
# further column is shifted one unit (= one bar width) to the right.
array_ticks = np.array([1.,2.,3.,4.,5.,6.,7.]) *10
for i in data.columns:
    bar_options = {'width': 1, 'ec': 'black', 'linewidth': 0.75}
    if i == 'SVR':
        # Only the 'SVR' bars carry the row names as tick labels.
        bar_options['tick_label'] = data.index
    plt.bar(array_ticks, data[i], **bar_options)
    array_ticks = array_ticks + 1

plt.xticks(rotation=10)
plt.ylim([0.,1.2])
plt.ylabel('RMSE / eV')
plt.legend(data.columns)
plt.tight_layout()
plt.savefig('output_png_path',dpi=328)
plt.show()

## Covariance

In [None]:
# RMSE against number of descriptors: one curve per algorithm (first 8 rows).
df = pd.read_csv('input_csv_file') 
plt.figure(figsize=(5,3),dpi=500)

# Descriptor counts at which the RMSE values were recorded (x positions).
x = [20,90,160,360,640,1138]

# Dash-dotted vertical guide at every descriptor count.
for descriptor_count in x:
    plt.vlines(descriptor_count, 0, 1, colors='black', linewidth=1, linestyles='dashdot')

# Column 0 ('Algorithm') names the curve; the remaining columns are its values.
for row in range(8):
    algorithm_name = df['Algorithm'][row]
    plt.plot(x, df.iloc[row,1:], label=algorithm_name, linewidth=2)

plt.ylim([0.39,0.82])
plt.xlim([20,1138])
plt.ylabel('RMSE',fontsize=12)
plt.xlabel('Number of Descriptors',fontsize=12)
plt.legend(loc='upper right',framealpha=0,ncol=2)
plt.xticks(x)
plt.tight_layout()
plt.savefig('output_png_path',dpi=500)
plt.show()
