To-Do: entire workflow
ChemDraw parse -> combinatorial expansion -> conformer generation -> ASO descriptor calculation -> post-processing/dimensionality reduction/analysis -> plot/output data

In [1]:
import molli as ml
import molli.visual
import subprocess
import os

# Fail fast: this import raises ImportError right away if openbabel is not installed
import openbabel

In [2]:
# These are the ChemDraw files that the user passes in. Here they are the example
# core and substituent files exposed as attributes of `ml.files`; point these two
# names at your own files to run the workflow on different structures.
cores = ml.files.box_cores_test_1
subs = ml.files.box_substituents_test_1

# Make output directory - this is where everything relevant gets saved

In [3]:
# Directory where every artifact produced by this notebook is written.
out_dir = './ncsa-testing-output/'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test. It also raises
# FileExistsError if the path is occupied by a regular file instead of
# silently carrying on.
os.makedirs(out_dir, exist_ok=True)

# 1. ChemDraw Parsing

In [4]:
# Parse the ChemDraw files into molli libraries: one .mlib for the cores and one
# for the substituents, both written into out_dir (existing files are overwritten).
# check=True makes a failed `molli parse` raise CalledProcessError right here
# instead of surfacing later as a confusing error when the library is loaded.
# NOTE(review): `--hadd` presumably adds hydrogens - confirm with `molli parse --help`.
subprocess.run(
    ['molli', 'parse', '--hadd', f'{cores}', '-o', f'{out_dir}BOX_cores_new_env.mlib', '--overwrite'],
    check=True,
)
subprocess.run(
    ['molli', 'parse', '--hadd', f'{subs}', '-o', f'{out_dir}BOX_subs_new_env.mlib', '--overwrite'],
    check=True,
)

100%|██████████| 5/5 [00:00<00:00, 85.34it/s]
100%|██████████| 15/15 [00:00<00:00, 296.35it/s]


CompletedProcess(args=['molli', 'parse', '--hadd', '/home/ethangm2/NCSA Development/molli/molli/files/BOX_substituents_test_1.cdxml', '-o', './ncsa-testing-output/BOX_subs_new_env.mlib', '--overwrite'], returncode=0)

In [5]:
# Open the parsed core library and report how many cores it contains.
core_lib_path = f'{out_dir}BOX_cores_new_env.mlib'
m_core = ml.MoleculeLibrary(core_lib_path)
print(len(m_core))
# Fragments can be indexed directly with the string they are labelled with in the ChemDraw file.
m_core['1']

5


Molecule(name='1', formula='C10 H12 Br2 N2 O2 Unknown2')

In [6]:
# Open the parsed substituent library, report its size, and display the
# substituent stored under the name '3'.
subs_lib_path = f'{out_dir}BOX_subs_new_env.mlib'
m_subs = ml.MoleculeLibrary(subs_lib_path)
print(len(m_subs))
m_subs['3']

15


Molecule(name='3', formula='C3 H7 Unknown1')

# 2. Combinatorial Expansion

In [7]:
# Combine the parsed cores with the parsed substituents into one library
# (5 cores x 15 substituents -> 75 structures in the recorded run).
# check=True stops the notebook here if `molli combine` exits with an error,
# rather than letting the next cell fail on a missing or stale library file.
subprocess.run(
    [
        'molli',
        'combine',
        f'{out_dir}BOX_cores_new_env.mlib',  # core library
        '-s',
        f'{out_dir}BOX_subs_new_env.mlib',  # substituent library
        '-j',
        '96',  # presumably the number of parallel jobs - scale down on hosts with fewer cores
        '-o',
        f'{out_dir}test_combine_new_env.mlib',
        '-a',
        'A1',  # NOTE(review): presumably the attachment-point label used in the ChemDraw file - confirm
        '--obopt',
        'uff',
        '-m',
        'same',
        '--overwrite',
    ],
    check=True,
)


Will create a library of size 75


100%|██████████| 75/75 [00:02<00:00, 36.65it/s]


CompletedProcess(args=['molli', 'combine', './ncsa-testing-output/BOX_cores_new_env.mlib', '-s', './ncsa-testing-output/BOX_subs_new_env.mlib', '-j', '96', '-o', './ncsa-testing-output/test_combine_new_env.mlib', '-a', 'A1', '--obopt', 'uff', '-m', 'same', '--overwrite'], returncode=0)

In [8]:
# Open the combined library and report how many structures it contains.
combined_lib_path = f'{out_dir}test_combine_new_env.mlib'
combined = ml.MoleculeLibrary(combined_lib_path)
print(len(combined))
# Full catalyst structures are indexed with the concatenated
# core_substituent_substituent string.
combined["1_3_3"]

75


Molecule(name='1_3_3', formula='C16 H26 Br2 N2 O2')

In [9]:
# Display another combined structure; the name follows the
# core_substituent_substituent pattern (core '3', substituent '6').
combined["3_6_6"]

Molecule(name='3_6_6', formula='C30 H38 Br2 N2 O2')

# 3. Conformer Generation

In [10]:
# Generate conformer ensembles for every structure in the combined library.
# check=True stops the notebook here if `molli conformers` exits with an error,
# rather than letting the next cell fail on a missing or stale library file.
subprocess.run(
    [
        'molli',
        'conformers',
        f'{out_dir}test_combine_new_env.mlib',
        '-n',
        '50',
        '-o',
        f'{out_dir}test_conformers_new_env.mlib',
        '-t',
        # !!! Number of jobs. Please scale down if the host system has fewer cores.
        # Defaults to os.cpu_count()//2. !!!
        '-j',
        '96',
        '--overwrite',
    ],
    check=True,
)

100%|██████████| 75/75 [02:12<00:00,  1.76s/it]


CompletedProcess(args=['molli', 'conformers', './ncsa-testing-output/test_combine_new_env.mlib', '-n', '50', '-o', './ncsa-testing-output/test_conformers_new_env.mlib', '-t', '-j', '96', '--overwrite'], returncode=0)

In [11]:
# Open the conformer library and report how many ensembles it contains.
conformer_lib_path = f'{out_dir}test_conformers_new_env.mlib'
clib = ml.ConformerLibrary(conformer_lib_path)
print(len(clib))

75


In [12]:
# Total number of conformers summed over every ensemble in the library.
# Many of these conformers are redundant - redundant conformers are thrown out
# during the ASO calculation.
total_conformers = sum(ensemble.n_conformers for ensemble in clib)
print(f'{total_conformers} conformers in library')

8028 conformers in library


In [13]:
# Ensembles can be indexed by position: print the first one in the library.
first_ensemble = clib[0]
print(first_ensemble)

# They can also be indexed by name; indexing the ensemble then selects a
# single conformer (here the first conformer of '1_3_3').
clib['1_3_3'][0]

ConformerEnsemble(name='1_3_3', formula='C16 H26 Br2 N2 O2', n_conformers=150)


Conformer(name='1_3_3', formula='C16 H26 Br2 N2 O2')

In [14]:
# Display the conformer at index 1 of the '1_3_3' ensemble.
clib['1_3_3'][1]

Conformer(name='1_3_3', formula='C16 H26 Br2 N2 O2')

In [15]:
# Display the conformer at index 0 of the '3_6_6' ensemble.
clib['3_6_6'][0]

Conformer(name='3_6_6', formula='C30 H38 Br2 N2 O2')

In [16]:
# Display the conformer at index 24 of the '3_6_6' ensemble.
clib['3_6_6'][24]

Conformer(name='3_6_6', formula='C30 H38 Br2 N2 O2')

# 4. ASO descriptor calculation

In [17]:
# First build the grid used for the ASO calculation from the conformer library;
# it is saved as a .npy file in out_dir.
# check=True stops the notebook here if `molli grid` exits with an error,
# rather than letting the ASO step run against a missing or stale grid file.
subprocess.run(
    [
        'molli',
        'grid',
        '--mlib',
        f'{out_dir}test_conformers_new_env.mlib',
        '-o',
        f'{out_dir}grid_new_env.npy',
    ],
    check=True,
)

(7106, 3)


100%|██████████| 75/75 [00:00<00:00, 1676.67it/s]


CompletedProcess(args=['molli', 'grid', '--mlib', './ncsa-testing-output/test_conformers_new_env.mlib', '-o', './ncsa-testing-output/grid_new_env.npy'], returncode=0)

In [18]:
# Calculate the ASO descriptor for the conformer library on the grid file,
# writing the result to an HDF5 file in out_dir.
# check=True stops the notebook here if `molli gbca aso` exits with an error,
# rather than letting the clustering step read a missing or stale descriptor file.
subprocess.run(
    [
        'molli',
        'gbca',
        'aso',
        f'{out_dir}test_conformers_new_env.mlib',
        '-g',
        f'{out_dir}grid_new_env.npy',
        '-o',
        f'{out_dir}aso_new_env.h5',
    ],
    check=True,
)
# NOTE: the tqdm progress bars printed by this command render poorly in the notebook output.

Allocating storage for descriptors
Will compute descriptor ASO using 128 cores.
Grid shape: (7106, 3)


Loading batches of conformers:   0%|          | 0/1 [00:00<?, ?it/s]
Submitting calculations:   0%|          | 0/75 [00:00<?, ?it/s][A
Submitting calculations:   0%|          | 0/75 [00:00<?, ?it/s][A
Submitting calculations:   5%|▌         | 4/75 [00:00<00:02, 27.85it/s][A
Submitting calculations:  12%|█▏        | 9/75 [00:01<00:02, 25.52it/s][A
Submitting calculations:  17%|█▋        | 13/75 [00:01<00:03, 16.67it/s][A
Submitting calculations:  20%|██        | 15/75 [00:01<00:05, 10.15it/s][A
Submitting calculations:  23%|██▎       | 17/75 [00:02<00:08,  7.08it/s][A
Submitting calculations:  25%|██▌       | 19/75 [00:02<00:07,  7.03it/s][A
Submitting calculations:  32%|███▏      | 24/75 [00:03<00:06,  7.33it/s][A
Submitting calculations:  37%|███▋      | 28/75 [00:04<00:07,  6.16it/s][A
Submitting calculations:  39%|███▊      | 29/75 [00:05<00:13,  3.31it/s][A
Submitting calculations:  44%|████▍     | 33/75 [00:05<00:09,  4.54it/s][A
Submitting calculations:  52%|█████▏   

CompletedProcess(args=['molli', 'gbca', 'aso', './ncsa-testing-output/test_conformers_new_env.mlib', '-g', './ncsa-testing-output/grid_new_env.npy', '-o', './ncsa-testing-output/aso_new_env.h5'], returncode=0)

# 5. Post-processing, dimensionality reduction, clustering analysis

In [19]:
# Post-process the ASO descriptors and run `molli cluster` with its default
# method, writing the result to a JSON file in out_dir.
# check=True stops the notebook here if `molli cluster` exits with an error,
# rather than leaving a missing or stale JSON file for later analysis.
subprocess.run(
    [
        'molli',
        'cluster',
        f'{out_dir}aso_new_env.h5',
        '-o',
        f'{out_dir}new_env_data_test.json',
        '-v',  # variance threshold before doing clustering
        '0',  # remove 0 variance columns
        '-c',  # correlation cutoff before clustering
        '0.8',  # 0.8 by default
    ],
    check=True,
)

100%|██████████| 75/75 [00:00<00:00, 4520.63it/s]


shape of data after variance threshold: (75, 3792)
total variance after variance threshold: 62.84

shape of data after removing correlated columns (R > 0.8): (75, 249)
total variance after removing correlated columns (R > 0.8): 3.42



CompletedProcess(args=['molli', 'cluster', './ncsa-testing-output/aso_new_env.h5', '-o', './ncsa-testing-output/new_env_data_test.json', '-v', '0', '-c', '0.8'], returncode=0)

In [21]:
# Same post-processing as the previous `molli cluster` call, but with the
# method set to 'pca' via `-m`; the result goes to a separate JSON file.
# check=True stops the notebook here if `molli cluster` exits with an error,
# rather than leaving a missing or stale JSON file for later analysis.
subprocess.run(
    [
        'molli',
        'cluster',
        f'{out_dir}aso_new_env.h5',
        '-o',
        f'{out_dir}new_env_data_pca.json',
        '-m',
        'pca',
        '-v',  # variance threshold before doing clustering
        '0',  # remove 0 variance columns
        '-c',  # correlation cutoff before clustering
        '0.8',  # 0.8 by default
    ],
    check=True,
)

100%|██████████| 75/75 [00:00<00:00, 4635.14it/s]


shape of data after variance threshold: (75, 3792)
total variance after variance threshold: 62.84

shape of data after removing correlated columns (R > 0.8): (75, 249)
total variance after removing correlated columns (R > 0.8): 3.42



CompletedProcess(args=['molli', 'cluster', './ncsa-testing-output/aso_new_env.h5', '-o', './ncsa-testing-output/new_env_data_pca.json', '-m', 'pca', '-v', '0', '-c', '0.8'], returncode=0)