# Workflow Example: Copper-BOX Catalyst Library

This notebook will show a full example of how you might use molli to generate a combinatorial catalyst library and extract features.

In [58]:
import molli as ml
import molli.visual
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import time
import msgpack
import attrs

# Part 1: Parsing and Combinatorial Expansion

Here's how to do combinatorial expansion programmatically in Python.

In [59]:
# Open the CDXML file that holds the BOX core structures.
cores = ml.CDXMLFile(ml.files.BOX_cores)

# Names of the cores; the viewer cell below indexes into this list.
keys = [*cores.keys()]

# Viewer cursor: starts at -1 so the first increment lands on index 0.
i = -1


In [60]:
# Re-run this cell (Ctrl+Enter) to step through the BOX cores one at a time.
# The bare expressions inside the try block are what produce the output; this
# relies on InteractiveShell.ast_node_interactivity being set to 'all' in the
# first cell.

try:
    i += 1
    # look the core up once and reuse it for both the scan and the display
    current_core = cores[keys[i]]
    # Each core has 4 attachment points: 2 labelled A1 (the pendant 4,4'
    # positions) and 2 labelled A2 (the bridging position).
    attachment_points = [
        atom
        for atom in current_core.atoms
        if atom.atype == ml.AtomType.AttachmentPoint
    ]
    attachment_points
    # show the core itself
    current_core
except IndexError:
    # stepped past the last core: say so and rewind the cursor
    "Restarting from beginning!"
    i = -1

# The cores look good, apart from some odd geometry that is ironed out later.

[Atom(element=Unknown, isotope=None, label='A1', formal_charge=0, formal_spin=0),
 Atom(element=Unknown, isotope=None, label='A1', formal_charge=0, formal_spin=0),
 Atom(element=Unknown, isotope=None, label='A2', formal_charge=0, formal_spin=0),
 Atom(element=Unknown, isotope=None, label='A2', formal_charge=0, formal_spin=0)]

Molecule(name='2_2', formula='C23 Cl2 Cu1 N2 O2 Unknown4')

In [61]:
# First expansion: attach every bridging group to both A2 sites of every core.
bridging_groups = ml.CDXMLFile(ml.files.BOX_bridge)

# accumulates one molecule per (core, bridging group) pair
cores_and_bridge_mols = []

for core_key in cores.keys():
    core_mol = cores[core_key]

    for bridge_key in bridging_groups.keys():
        # molli assigns the default label 'AP1' to attachment points that have
        # no ChemDraw-specified label, hence 'AP1' on the fragment side
        bridge_mol = bridging_groups[bridge_key]

        # fill the first A2 site
        half_bridged = ml.Molecule.join(core_mol, bridge_mol, 'A2', 'AP1')

        # fill the second A2 site and name the product "<core>_<bridge>"
        bridged_name = '_'.join((core_mol.name, bridge_mol.name))
        cores_and_bridge_mols.append(
            ml.Molecule.join(half_bridged, bridge_mol, 'A2', 'AP1', name=bridged_name)
        )

In [62]:
# number of core + bridge structures after the expansion (18 in the recorded run)
len(cores_and_bridge_mols)
# reset the viewer cursor so the next cell starts from the first structure
i = -1

18

In [63]:
# Re-run this cell (Ctrl+Enter) to step through the core + bridge structures one at a time.

try:
    # advance the cursor; it is reset to -1 in the previous cell
    i += 1
    # display the expanded structure at the cursor
    cores_and_bridge_mols[i]
except IndexError:
    # stepped past the last structure: say so and rewind the cursor
    "Restarting from beginning!"
    i = -1

Molecule(name='2_2_H', formula='C23 H2 Cl2 Cu1 N2 O2 Unknown2')

In [64]:
# Second expansion: attach every 4-position fragment to both A1 sites of every
# core + bridge structure, following the same two-step join as for the bridge.
four_positions = ml.CDXMLFile(ml.files.BOX_4_position)

# accumulates the fully substituted structures
fully_expanded_mols = []

for scaffold in cores_and_bridge_mols:

    for substituent_key in four_positions.keys():
        # the 4-position fragment to attach
        substituent = four_positions[substituent_key]

        # fill the first A1 site
        mono_substituted = ml.Molecule.join(scaffold, substituent, 'A1', 'AP1')

        # fill the second A1 site and name the product "<fragment>_<core>_<bridge>"
        ligand_name = '_'.join((substituent.name, scaffold.name))
        fully_expanded_mols.append(
            ml.Molecule.join(mono_substituted, substituent, 'A1', 'AP1', name=ligand_name)
        )

In [65]:
# number of fully expanded structures (1152 in the recorded run)
len(fully_expanded_mols)
# reset the viewer cursor before stepping through the expanded structures
i = -1

1152

In [66]:
# Re-run this cell (Ctrl+Enter) to step through the expanded BOX ligands in strides of 50.

try:
    # with the cursor freshly reset to -1, the first structure shown is index 49
    i += 50
    # display the expanded structure at the cursor
    fully_expanded_mols[i]
except IndexError:
    # stepped past the end of the list: say so and rewind the cursor
    "Restarting from beginning!"
    i = -1

Molecule(name='420_2_2_H', formula='C45 H2 Cl2 Cu1 N2 O2')

In [67]:
# Preparation for the geometry clean-up (XTB GFN2 was the intended method).
# This cell only imports the optimization helpers used by the cells below and
# calls add_implicit_hydrogens() on every expanded structure; the optimization
# itself is attempted in the following cells.
from molli.external import openbabel as mob
from joblib import delayed,Parallel
from molli.external.xtb import XTBDriver

# called for its effect on each molecule; no return value is kept
for mol in fully_expanded_mols:
    mol.add_implicit_hydrogens()


In [None]:
#### FOR ALEX 1 #### NOT RUN BECAUSE IT DOESN'T WORK FOR ME (UP TO DATE WITH DEV)

# Write the expanded structures into an on-disk molecule library that the
# pipeline can read from.
# TODO(Alex): this seems clunky -- why can't a non-serialized mlib be created
# directly from a list?
source = ml.MoleculeLibrary('my_library.mlib', readonly=False)
with source.writing():
    for mol in fully_expanded_mols:
        source[mol.name] = mol

# Working directories are derived from the current user's home directory
# instead of being hard-coded to one user's account.
molli_home = os.path.join(os.path.expanduser("~"), ".molli")
scratch_dir = os.path.join(molli_home, "scratch")
error_dir = os.path.join(molli_home, "error")


def _encode_job_output(job_output):
    """Serialize a JobOutput to msgpack bytes via its attrs field dictionary."""
    return msgpack.dumps(attrs.asdict(job_output))


def _decode_job_output(payload):
    """Rebuild a JobOutput from bytes written by _encode_job_output.

    attrs.asdict() yields a field-name -> value mapping, so the mapping is
    expanded back into keyword arguments (previously the whole dict was passed
    as a single positional argument).
    NOTE(review): assumes JobOutput's init arguments match its field names -- confirm.
    """
    return ml.pipeline.JobOutput(**msgpack.loads(payload))


def _job_output_collection(path):
    """Open a writable, non-overwriting directory-backed JobOutput collection at `path`."""
    return ml.storage.Collection[ml.pipeline.JobOutput](
        path,
        backend=ml.storage.DirCollectionBackend,
        value_encoder=_encode_job_output,
        value_decoder=_decode_job_output,
        readonly=False,
        overwrite=False,
    )


# TODO(Alex): this does not work in a freshly pulled dev checkout because
# `pipeline` has not been added to molli's __init__.
cache = _job_output_collection(scratch_dir)
error = _job_output_collection(error_dir)

# destination library for the optimized structures
target = ml.MoleculeLibrary(
    "dest.mlib",
    overwrite=True,
    readonly=False,
    comment="We did it!",
)

xtb = XTBDriver(nprocs=1)

# TODO(Alex): how is the level of theory set for xtb.optimize when it is
# dispatched through jobmap?
# NOTE(review): `cache` and `error` are constructed above but not passed here
# (both arguments are None, as in the original cell) -- confirm whether that is
# intentional.
ml.pipeline.jobmap(
    xtb.optimize,
    source,
    target,
    cache=None,
    error_cache=None,
    n_workers=16,
    batch_size=4,
    scratch_dir=scratch_dir,
)

In [None]:
#### FOR ALEX 2 #### NOT RUN BECAUSE IT IS CRAZY SLOW AND DOES NOT SEEM TO TAKE ADVANTAGE OF PARALLELIZATION

# Attempt using the older joblib-based parallelization: Open Babel UFF
# optimization of every expanded structure on a pool of 32 threads.
# Observation: this is extremely slow, and molli does not appear to benefit
# from the threading at all.

uff_pool = Parallel(n_jobs=32, verbose=50, prefer='threads')
uff_tasks = (
    delayed(mob.obabel_optimize)(mol=mol, ff="UFF")
    for mol in fully_expanded_mols
)
res = uff_pool(uff_tasks)


In [68]:
#### FOR ALEX 3 #### ONLY ONE THAT WORKS QUICKLY, BUT GIVES UNREASONABLE STRUCTURES (SEE BELOW)

# Attempt using the older joblib-based parallelization with the XTB driver.
#
# Open issues for Alex:
# - XTB GFF optimization makes the structures look very distorted; run this
#   cell and inspect the results in the viewer cell to see for yourself.
# - Argument names should be homogenized across the drivers
#   ("ff" vs. "method", "M" vs. "mol").
# - GFN2 is far slower than GFF, which is expected; Open Babel UFF would be
#   preferable but is extremely slow (see the FOR ALEX 2 cell).
# - Caching looks wrong: running with gff and then again with gfn2 still finds
#   cached results, although these are different jobs.

# Write the structures to an on-disk molecule library first.
# TODO(Alex): does this have to be serialized? It is very slow either way.
source = ml.MoleculeLibrary('my_library.mlib', readonly=False)
with source.writing():
    for mol in fully_expanded_mols:
        source[mol.name] = mol

# XTBDriver, Parallel and delayed are already imported in the cell that adds
# the implicit hydrogens, so they are not imported again here.
with source.reading():
    # optimize geometry
    xtb = XTBDriver(nprocs=4)
    # NOTE(review): "gfF" is mixed-case; xtb documents the flag as --gff/--gfnff.
    # Confirm that the driver and xtb accept this spelling.
    res = Parallel(n_jobs=32, verbose=50, prefer='threads')(
        delayed(xtb.optimize)(M=source[m_name], method="gfF")
        for m_name in source
    )

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   1 tasks      | elapsed:   34.9s
[Parallel(n_jobs=32)]: Done   2 tasks      | elapsed:   37.6s
[Parallel(n_jobs=32)]: Done   3 tasks      | elapsed:   42.3s
[Parallel(n_jobs=32)]: Done   4 tasks      | elapsed:   42.5s
[Parallel(n_jobs=32)]: Done   5 tasks      | elapsed:   45.1s
[Parallel(n_jobs=32)]: Done   6 tasks      | elapsed:   46.0s
[Parallel(n_jobs=32)]: Done   7 tasks      | elapsed:   47.7s
[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed:   53.7s
[Parallel(n_jobs=32)]: Done   9 tasks      | elapsed:   56.2s
[Parallel(n_jobs=32)]: Done  10 tasks      | elapsed:   57.3s
[Parallel(n_jobs=32)]: Done  11 tasks      | elapsed:   58.2s
[Parallel(n_jobs=32)]: Done  12 tasks      | elapsed:   58.9s
[Parallel(n_jobs=32)]: Done  13 tasks      | elapsed:  1.1min
[Parallel(n_jobs=32)]: Done  14 tasks      | elapsed:  1.2min
[Parallel(n_jobs=32)]: Done  15 tasks      | elaps

KeyboardInterrupt: 

In [56]:
# reset the viewer cursor before stepping through the optimization results
i = -1

In [57]:
# Re-run this cell (Ctrl+Enter) to step through the optimization results in strides of 50.
# Requires `res` from the optimization cell above; if that cell did not finish,
# `res` is undefined and this cell raises NameError (only IndexError is handled here).

try:
    # with the cursor freshly reset to -1, the first result shown is index 49
    i += 50
    # display the optimization result at the cursor
    res[i]
except IndexError:
    # stepped past the end of the results: say so and rewind the cursor
    "Restarting from beginning!"
    i = -1

NameError: name 'res' is not defined