In [None]:
## load data and format it for the analysis
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import warnings
warnings.filterwarnings('ignore')

# Tanimoto distances between every SMILES of a set and one target SMILES
def calculate_distances(smiles_set1, target_mol, radius=2):
    """Return the Tanimoto distance of each SMILES in ``smiles_set1`` to ``target_mol``.

    Both sides are converted to Morgan bit-vector fingerprints and the distance
    is ``1 - TanimotoSimilarity``.

    Args:
        smiles_set1: iterable of SMILES strings to compare against the target.
        target_mol: SMILES string of the target (despite the name, not a Mol object).
        radius: Morgan fingerprint radius; defaults to 2, the value previously hard-coded.

    Returns:
        list of float distances, in the same order as ``smiles_set1``.

    Raises:
        ValueError: if a SMILES string cannot be parsed by RDKit.
    """
    def _fingerprint(smi):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of inside the fingerprint call.
        if mol is None:
            raise ValueError(f'Could not parse SMILES: {smi!r}')
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius)

    # The target fingerprint is loop-invariant: compute it once.
    fpt = _fingerprint(target_mol)
    return [1 - DataStructs.TanimotoSimilarity(_fingerprint(smi), fpt)
            for smi in smiles_set1]

df_mol = pd.read_csv('../data/descriptors/preprocessed_dioxirane_reactions/df_bde.csv')
# De-duplicate while keeping first-occurrence order. Iterating a set of strings
# gives a different order from one kernel session to the next (string hashing is
# randomized), which would make every list derived from it non-reproducible.
smiles = list(dict.fromkeys(df_mol.Reactant_SMILES))

# get small and complex molecules: a molecule is "small" when it has at most
# MAX_SMALL_CARBONS carbon atoms, "complex" otherwise
MAX_SMALL_CARBONS = 15
small_smiles   = []
complex_smiles = []
for s in smiles:
    m = Chem.MolFromSmiles(s)
    num_C = sum(1 for a in m.GetAtoms() if a.GetSymbol() == 'C')
    if num_C <= MAX_SMALL_CARBONS:
        small_smiles.append(s)
    else:
        complex_smiles.append(s)

print('Number of small molecules:', len(small_smiles))
print('Number of complex molecules:', len(complex_smiles))

# get TOP-x ranked molecules
df_res = pd.read_csv('eval_bm_Custom_RF2.csv')
df_res.columns = ['smiles', 'top-1', 'top-2', 'top-3', 'top-5', 'top-10', 'predictions']

# Label each row with the tightest top-n column that equals 1.
# Labels are space-padded ('top  5' vs 'top 10') so that a plain string sort
# orders the classes from best to worst.
rank_labels = [('top-1',  'top  1'),
               ('top-2',  'top  2'),
               ('top-3',  'top  3'),
               ('top-5',  'top  5'),
               ('top-10', 'top 10')]

# Rows that hit none of the top-n columns keep this fallback label.
df_res['accuracy'] = 'top 10+'
# Assign from the loosest to the tightest rank so that the tightest hit is
# written last and wins, matching an if/elif chain that tests 'top-1' first.
for col, label in reversed(rank_labels):
    df_res.loc[df_res[col] == 1, 'accuracy'] = label

# make dataset with Tanimoto distances to small molecules
# Rows are collected in plain lists and turned into DataFrames once:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row copies it on every iteration.
pair_records = []   # one row per (target, small molecule) pair
avg_records  = []   # one row per target: mean distance to the small set
for target_smiles, accuracy in zip(df_res['smiles'], df_res['accuracy']):
    distances = calculate_distances(small_smiles, target_smiles)
    pair_records.extend({'target_smiles': target_smiles,
                         'accuracy': accuracy,
                         'small_molecule': small,
                         'distance': dist}
                        for small, dist in zip(small_smiles, distances))
    # Guard the mean against an empty small-molecule set (would divide by zero).
    avg_distance = sum(distances) / len(distances) if distances else float('nan')
    avg_records.append({'target_smiles': target_smiles,
                        'accuracy': accuracy,
                        'avg_distance_with_sm': avg_distance})

# Explicit column lists keep the schema stable even when no rows were produced.
df_dist = pd.DataFrame(
    pair_records,
    columns=['target_smiles', 'accuracy', 'small_molecule', 'distance'],
).sort_values(by=['accuracy'])
df_dist_avg = pd.DataFrame(
    avg_records,
    columns=['target_smiles', 'accuracy', 'avg_distance_with_sm'],
).sort_values(by=['accuracy'])

Analysis of pairwise (small-complex sets) distance distributions 

In [None]:
# KDE plot of the distribution of the Tanimoto distances
import matplotlib.pyplot as plt
import seaborn as sns

# Use the axes-level kdeplot so the curve is drawn on the figure created here.
# The figure-level displot builds its own figure, which would leave this one blank.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df_dist, x="distance", fill=True, cut=0, ax=ax)
ax.set_xlabel('Tanimoto distance')
ax.set_ylabel('Density')
ax.set_title('Distribution of pairwise Tanimoto distances to small molecules')
plt.show()

In [None]:
# staggered (dodged) histogram of the Tanimoto distances, colored by top-n accuracy
fig, ax = plt.subplots(1,1, figsize=(10, 6))

# stat="density" makes the bars match the 'Density' axis label; with the default
# count statistic, common_norm=False has no effect. Each accuracy class is
# normalized on its own, so classes of different size stay comparable.
sns.histplot(df_dist, x="distance",
             hue="accuracy",
             stat="density",
             common_norm=False,
             multiple="dodge",
             binwidth=0.05,
             kde=True,
             ax=ax)

ax.set_xlabel('Tanimoto distance')
ax.set_ylabel('Density')
ax.set_title('Distribution of Tanimoto distances to small molecules depending on top-n accuracy')
ax.set_xlim(0.5, 1)
plt.show()

In [None]:
# box plot of the distribution of the Tanimoto distances colored by top-n accuracy
fig, ax = plt.subplots(1,1, figsize=(10, 6))
# hue repeats the x variable only to color the boxes; dodge=False keeps each box
# centered on its category instead of being shrunk and shifted into a hue slot.
sns.boxplot(x='accuracy', y='distance', hue='accuracy', data=df_dist, dodge=False, ax=ax)
ax.set_xlabel('Top-n accuracy')
ax.set_ylabel('Tanimoto distance')
ax.set_title('Tanimoto distances to small molecules per top-n accuracy class')
ax.set_ylim(0.2, 1)
plt.show()

Analysis of average distance distributions between complex molecules and the small molecules set

In [None]:
# KDE plot of the distribution of the average Tanimoto distances
# displot is figure-level: it creates its own figure and does not accept ax=,
# so no axes object is passed (the previous ax=ax also pointed at an axes left
# over from an earlier cell).
sns.displot(x='avg_distance_with_sm', data=df_dist_avg, kind="kde", cut=0.05, fill=True)

# stacked histogram of the same averages, split by top-n accuracy
fig, ax = plt.subplots(1,1, figsize=(10, 6))
sns.histplot(x='avg_distance_with_sm', data=df_dist_avg,ax=ax, binwidth=0.01, hue='accuracy', multiple='stack', common_norm=True)
ax.set_xlabel('Average Tanimoto distance to small molecules')
plt.show()


In [None]:
# box plot of the average Tanimoto distance per top-n accuracy class,
# with the individual molecules overlaid as black points
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# dodge=False keeps the boxes centered on their category so they line up with
# the (un-dodged) strip points; hue only provides the coloring.
sns.boxplot(x='accuracy', y='avg_distance_with_sm', hue='accuracy', data=df_dist_avg, dodge=False, ax=ax)
sns.stripplot(x='accuracy', y='avg_distance_with_sm', color="black",  data=df_dist_avg, ax=ax)
ax.set_xlabel('Top-n accuracy')
ax.set_ylabel('Average Tanimoto distance to small molecules')
plt.show()


Compare each target molecule against a single Morgan fingerprint computed from the concatenated set of small molecules, instead of against each small molecule individually

In [None]:
# build one multi-fragment SMILES from the whole small-molecule set, so that a
# single fingerprint can be computed for the set instead of one per molecule
def MFP_smiles_set(list_smiles):
    """Join SMILES strings into one dot-separated SMILES string.

    Despite the name, no fingerprint is computed here; the function only
    concatenates the strings with '.' between them.

    Args:
        list_smiles: iterable of SMILES strings.

    Returns:
        str: the input strings joined by '.', or '' for an empty input.
    """
    return '.'.join(list_smiles)

mega_smiles = MFP_smiles_set(small_smiles)

# Distance of every benchmark molecule to the fingerprint of the combined
# small-molecule set. Tanimoto similarity is symmetric, so the combined SMILES
# is passed as the target: its fingerprint is then computed once instead of
# once per benchmark molecule.
target_list = df_res['smiles'].tolist()
distances = calculate_distances(target_list, mega_smiles)

# 'target_smiles' holds the molecule that was actually scored (the df_res row).
# It was previously filled from complex_smiles[i], a list with unrelated ordering
# and length. The frame is built in one go because DataFrame.append no longer
# exists in pandas >= 2.0.
df_dist_allfp = pd.DataFrame({'target_smiles': target_list,
                              'accuracy': df_res['accuracy'].tolist(),
                              'distance': distances},
                             columns=['target_smiles', 'accuracy', 'distance'])

# Map any single-space labels to the space-padded form so that a plain string
# sort keeps 'top  5' ahead of 'top 10'.
df_dist_allfp = (
    df_dist_allfp
    .replace({'accuracy': {'top 1': "top  1", 'top 2': "top  2", 'top 3': "top  3"}})
    .sort_values(by=['accuracy'])
)

In [None]:
# KDE plot of the Tanimoto distances to the combined small-molecule fingerprint
# displot is figure-level and creates its own figure, so no ax= is passed
# (the previous ax=ax referred to an axes left over from an earlier cell).
sns.displot(x='distance', data=df_dist_allfp, kind="kde", cut=0.05, fill=True)

In [None]:
# stacked histogram of the distances to the combined small-molecule fingerprint,
# split by top-n accuracy; drawn on an explicit axes so the figure can be labeled
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.histplot(x='distance', data=df_dist_allfp, binwidth=0.01, hue='accuracy', multiple='stack', common_norm=True, ax=ax)
ax.set_xlabel('Tanimoto distance to the combined small-molecule fingerprint')
plt.show()

In [None]:
# box plot of the distance to the combined small-molecule fingerprint per top-n
# accuracy class, with the individual molecules overlaid as black points
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# dodge=False keeps the boxes centered on their category so they line up with
# the (un-dodged) strip points; hue only provides the coloring.
sns.boxplot(x='accuracy', y='distance', hue='accuracy', data=df_dist_allfp, dodge=False, ax=ax)
sns.stripplot(x='accuracy', y='distance', color="black",  data=df_dist_allfp, ax=ax)
ax.set_xlabel('Top-n accuracy')
ax.set_ylabel('Tanimoto distance to the combined small-molecule fingerprint')
plt.show()