In [None]:
# load dataset with small and complex molecules

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw

# Carbon-count cutoff: molecules with at most this many carbons are "small".
MAX_CARBONS_SMALL = 15

df_mol = pd.read_csv('../data/descriptors/preprocessed_dioxirane_reactions/df_bde.csv')
# Drop missing entries first: a NaN is a float and cannot be parsed as SMILES.
smiles = list(set(df_mol.Reactant_SMILES.dropna()))

# get small and complex molecules
# Collect canonical SMILES in sets so that two different spellings of the same
# molecule are counted once (the raw-string set above cannot catch those).
small_canonical = set()
complex_canonical = set()
unparsable_smiles = []
for s in smiles:
    m = Chem.MolFromSmiles(s)
    if m is None:
        # RDKit returns None for SMILES it cannot parse; record and skip them
        # instead of failing later with an AttributeError on None.
        unparsable_smiles.append(s)
        continue
    num_C = sum(1 for a in m.GetAtoms() if a.GetSymbol() == 'C')
    # Reuse the parsed molecule rather than re-parsing the string.
    canonical = Chem.MolToSmiles(m)
    if num_C <= MAX_CARBONS_SMALL:
        small_canonical.add(canonical)
    else:
        complex_canonical.add(canonical)

# Sorted lists keep the downstream list interface and give a stable order.
small_smiles = sorted(small_canonical)
complex_smiles = sorted(complex_canonical)

if unparsable_smiles:
    print('Skipped unparsable SMILES:', unparsable_smiles)
print('Number of small molecules:', len(small_smiles))
print('Number of complex molecules:', len(complex_smiles))

# load BRICS and pBRICS decompositions
import pickle
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load decomposition files that come from a trusted source.
with open('decomposition_data/BRICS_dioxirane.pkl', 'rb') as file:
    data_brics = pickle.load(file)

with open('decomposition_data/pBRICS_dioxirane.pkl', 'rb') as file:
    data_pbrics = pickle.load(file)

data_pbrics = dict(data_pbrics)

# canonicalize smiles
# Build new dicts instead of inserting canonical keys into the dict that is
# being iterated: adding a key mid-iteration raises
# "RuntimeError: dictionary changed size during iteration", and it would also
# leave the original non-canonical keys behind.
# If two keys share a canonical form, the later entry wins.
data_brics = {
    Chem.CanonSmiles(k): [Chem.CanonSmiles(x) for x in v]
    for k, v in data_brics.items()
}
data_pbrics = {
    Chem.CanonSmiles(k): [Chem.CanonSmiles(x) for x in v]
    for k, v in data_pbrics.items()
}

# strip the numeric labels from the dummy atoms ([n*] -> [*]) in each fragment, then drop fragments that become duplicates
import re
def remove_dummy_atom_numbers(fragments):
    """Return the fragment SMILES with every dummy atom written as ``[*]``.

    Numbered dummy atoms such as ``[1*]`` lose their number and bare ``*``
    atoms gain brackets, so all attachment points end up spelled the same way.

    Parameters
    ----------
    fragments : iterable of str
        Fragment SMILES strings.

    Returns
    -------
    list of str
        The rewritten strings, in the same order as the input.
    """
    def _normalize(smi):
        # [n*] -> [*]
        smi = re.sub(r'\[\d+\*\]', '[*]', smi)
        # Bracket every '*'; already-bracketed ones become '[[*]]' ...
        smi = smi.replace('*', '[*]')
        # ... so collapse the doubled brackets again.
        return smi.replace(r'[[', '[').replace(r']]', ']')

    return [_normalize(smi) for smi in fragments]

# Strip the dummy-atom numbers from every fragment list and keep one copy of
# each resulting fragment. Both dicts are updated in place; only values of
# existing keys are reassigned.
for decomposition in (data_brics, data_pbrics):
    for mol_smiles in decomposition:
        cleaned = remove_dummy_atom_numbers(decomposition[mol_smiles])
        decomposition[mol_smiles] = list(set(cleaned))

len(data_brics), len(data_pbrics)

# BRICS comparison between the small- and complex-molecule datasets

In [None]:
# Pool the BRICS fragments of all molecules in each subset, one copy of each.
brics_small = list({frag for smi in small_smiles for frag in data_brics[smi]})
brics_complex = list({frag for smi in complex_smiles for frag in data_brics[smi]})

# unique fragments
print("Number of unique fragments:")
print(f"small molecules: {len(brics_small)}")
print(f"complex molecules: {len(brics_complex)}")

# intersection
n_common = len(set(brics_small).intersection(brics_complex))
print(f"Number of common fragments: {n_common}")

# fragments that appear in complex molecules but not in small molecules:
n_complex_only = len(set(brics_complex).difference(brics_small))
print(f"Number of fragments that appear in complex molecules but not in small molecules: {n_complex_only}")

# fragments that appear in small molecules but not in complex molecules:
n_small_only = len(set(brics_small).difference(brics_complex))
print(f"Number of fragments that appear in small molecules but not in complex molecules: {n_small_only}")

In [None]:
# fragments that appear in both small and complex molecules
# Sort the SMILES so the grid is laid out identically on every run; plain set
# order depends on Python's per-process string hashing and changes between
# kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(brics_small) & set(brics_complex))],
                     molsPerRow=6, subImgSize=(200, 200))

In [None]:
# fragments that appear in small molecules but not in complex molecules:
# Sort the SMILES so that the fragments surviving the maxMols cut-off, and
# their positions in the grid, are the same on every run; plain set order
# changes between kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(brics_small) - set(brics_complex))],
                     molsPerRow=6, subImgSize=(200, 200),
                     maxMols=60)

In [None]:
# fragments that appear in complex molecules but not in small molecules:
# Sort the SMILES so that the fragments surviving the maxMols cut-off, and
# their positions in the grid, are the same on every run; plain set order
# changes between kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(brics_complex) - set(brics_small))],
                     molsPerRow=6, subImgSize=(200, 200), maxMols=40)

# pBRICS comparison between the small- and complex-molecule datasets

In [None]:
# pBRICS
# Pool the pBRICS fragments of all molecules in each subset, one copy of each.
pbrics_small = list({frag for smi in small_smiles for frag in data_pbrics[smi]})
pbrics_complex = list({frag for smi in complex_smiles for frag in data_pbrics[smi]})

# Fragments found in both subsets.
common = list(set(pbrics_small).intersection(pbrics_complex))

# unique fragments
print("Number of unique fragments:")
print(f"small molecules: {len(pbrics_small)}")
print(f"complex molecules: {len(pbrics_complex)}")

# intersection
print(f"Number of common fragments: {len(common)}")

# fragments that appear in complex molecules but not in small molecules:
n_complex_only = len(set(pbrics_complex).difference(pbrics_small))
print(f"Number of fragments that appear in complex molecules but not in small molecules: {n_complex_only}")

# fragments that appear in small molecules but not in complex molecules:
n_small_only = len(set(pbrics_small).difference(pbrics_complex))
print(f"Number of fragments that appear in small molecules but not in complex molecules: {n_small_only}")

In [None]:
# fragments that appear in both small and complex molecules
# Sort the SMILES so that the fragments surviving the maxMols cut-off, and
# their positions in the grid, are the same on every run; plain set order
# changes between kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(pbrics_small) & set(pbrics_complex))],
                     molsPerRow=6, subImgSize=(200, 200),
                     maxMols=60)

In [None]:
# fragments that appear in small molecules but not in complex molecules:
# Sort the SMILES so that the fragments surviving the maxMols cut-off, and
# their positions in the grid, are the same on every run; plain set order
# changes between kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(pbrics_small) - set(pbrics_complex))],
                     molsPerRow=6, subImgSize=(200, 200),
                     maxMols=60)

In [None]:
# fragments that appear in complex molecules but not in small molecules:
# Sort the SMILES so that the fragments surviving the maxMols cut-off, and
# their positions in the grid, are the same on every run; plain set order
# changes between kernel restarts.
Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in sorted(set(pbrics_complex) - set(pbrics_small))],
                     molsPerRow=6, subImgSize=(200, 200),
                     maxMols=40)