In [1]:
"""File containing the most common reactions and molecules/groups."""
from rdkit import Chem  # type: ignore[import-not-found]
from rdkit.Chem import Draw  # type: ignore[import-not-found]
from rdkit.Chem import rdChemReactions  # type: ignore[import-not-found]

# Fmoc -> triazine: the reactant pattern matches any atom (map 1) carrying an
# Fmoc-type group (C(=O)OCC1c2ccccc2-c2ccccc21); the product template bonds that
# same atom to a triazine ring (c1ncncn1) instead.
FMOC_TRIAZINE_REACTION_SMARTS = "[*:1]C(=O)OCC1c2ccccc2-c2ccccc21>>c1nc([*:1])ncn1"
FMOC_TRIAZINE_REACTION = rdChemReactions.ReactionFromSmarts(FMOC_TRIAZINE_REACTION_SMARTS)

# NH2 + triazine reactions: bond a primary amine nitrogen to a triazine carbon.
# REACTION1 = the triazine has one pre-existing attachment (map 3).
# REACTION2 = the triazine has two pre-existing attachments (maps 3 and 4).
# The amine pattern [NH2;!$(N-[#66])] rejects a nitrogen bonded to element 66
# (Dy), the element that appears in DNA_SMARTS below.
NH2_TRIAZINE_REACTION1_SMARTS = "[cH:1]1nc([*:3])ncn1.[NH2;!$(N-[#66]):2]>>[c:1]([*:3])1ncnc([NH:2])n1"
NH2_TRIAZINE_REACTION2_SMARTS = "[cH:1]1nc([*:3])nc([*:4])n1.[NH2;!$(N-[#66]):2]>>[c:1]([*:3])1nc([*:4])nc([NH:2])n1"
NH2_TRIAZINE_REACTION1 = rdChemReactions.ReactionFromSmarts(NH2_TRIAZINE_REACTION1_SMARTS)
NH2_TRIAZINE_REACTION2 = rdChemReactions.ReactionFromSmarts(NH2_TRIAZINE_REACTION2_SMARTS)

# Boronate + halide reaction SMARTS: the atom bearing a halogen (map 1) is
# bonded directly to the atom bearing B(O)(O) (map 2); halogen and boronate
# are not part of the product template.
BORONATE_HALIDE_REACTION_SMARTS = "[*:1]([I,Br,Cl,F]).[*:2]B(O)(O)>>[*:1]([*:2])"
BORONATE_HALIDE_REACTION = rdChemReactions.ReactionFromSmarts(BORONATE_HALIDE_REACTION_SMARTS)

# COOH + Boc / COOH + Fmoc reactions: the protecting group on atom 1
# (tert-butyl ester pattern for Boc, fluorenylmethyl pattern for Fmoc) and the
# acid OH on the second reactant are dropped, leaving [*:1]C(=O)[*:2].
COOH_BOC_REACTION_SMARTS = "[*:1]C(=O)OC(C)(C)C.[*:2]C(=O)[O;H1]>>[*:1]C(=O)[*:2]"
COOH_FMOC_REACTION_SMARTS = "[*:1]C(=O)OCC1c2ccccc2-c2ccccc21.[*:2]C(=O)[O;H1]>>[*:1]C(=O)[*:2]"
COOH_BOC_REACTION = rdChemReactions.ReactionFromSmarts(COOH_BOC_REACTION_SMARTS)
COOH_FMOC_REACTION = rdChemReactions.ReactionFromSmarts(COOH_FMOC_REACTION_SMARTS)

# Functional-group query patterns (used with HasSubstructMatch).
# ACID requires exactly one hydrogen on the hydroxyl oxygen ([O;H1]).
ACID_SMARTS = "C(=O)[O;H1]"
ACID = Chem.MolFromSmarts(ACID_SMARTS)
ESTER_SMARTS = "*C(=O)O*"
ESTER = Chem.MolFromSmarts(ESTER_SMARTS)
BORONATE_SMARTS = "B(O)(O)"
BORONATE = Chem.MolFromSmarts(BORONATE_SMARTS)
# Any of the four listed halogens: I, Br, Cl or F.
HALOGEN_SMARTS = "[I,Br,Cl,F]"
HALOGEN = Chem.MolFromSmarts(HALOGEN_SMARTS)

# Common molecules/substructures, also compiled as query patterns.
FMOC_SMARTS = "O=COCC1c2ccccc2-c2ccccc21"
FMOC = Chem.MolFromSmarts(FMOC_SMARTS)
BOC_SMARTS = "C(=O)OC(C)(C)C"
BOC = Chem.MolFromSmarts(BOC_SMARTS)
TRIAZINE_SMARTS = "c1ncncn1"
TRIAZINE = Chem.MolFromSmarts(TRIAZINE_SMARTS)
# Trifluoroacetic-acid pattern (CF3 carbon bonded to a carboxylic acid).
FLUORIDE_ACID_SMARTS = "FC(F)(F)C(=O)[O;H1]"
FLUORIDE_ACID = Chem.MolFromSmarts(FLUORIDE_ACID_SMARTS)
# Amide carrying a [Dy] atom; judging by the name, Dy stands in for the DNA
# attachment point -- TODO confirm.
# NOTE: despite the _SMARTS suffix this string is parsed with MolFromSmiles, so
# DNA is a regular molecule rather than a query pattern. It is used as the
# replacement argument of Chem.ReplaceSubstructs in bbs_to_molecule.
DNA_SMARTS = "C(=O)N[Dy]"
DNA = Chem.MolFromSmiles(DNA_SMARTS)

In [3]:
def sanitize_molecule(molecule: Chem.Mol) -> None:
    """Run RDKit sanitization on a molecule, reporting failure as ``ValueError``.

    :param molecule: The molecule handed to ``Chem.SanitizeMol``.
    :raises ValueError: If RDKit raises ``MolSanitizeException``; the RDKit
        exception is chained as the cause and the message contains the SMILES
        of the offending molecule.
    """
    try:
        Chem.SanitizeMol(molecule)
    except Chem.MolSanitizeException as sanitize_error:
        failed_smiles = Chem.MolToSmiles(molecule)
        raise ValueError(f"Sanitization of {failed_smiles} failed") from sanitize_error

In [75]:
def bbs_to_molecule(bb1: str, bb2: str, bb3: str) -> str:
    """Combine three building blocks into the SMILES of the full molecule.

    Steps, in order:

    1. If bb1 carries an Fmoc group and the halide/boronate/acid route does not
       apply, the Fmoc group is replaced by a triazine core.
    2. The carboxylic acid of the intermediate is replaced by the ``DNA``
       fragment (``C(=O)N[Dy]``).
    3. If the intermediate contains a triazine, bb2 and then bb3 are attached
       through their NH2 groups; otherwise, if it contains a halogen, bb2 and
       bb3 are attached via ``boronate_cooh_reactions``.

    :param bb1: SMILES of building block 1
    :param bb2: SMILES of building block 2
    :param bb3: SMILES of building block 3
    :return: SMILES of the combined molecule
    :raises ValueError: If a building block SMILES cannot be parsed, if a
        reaction yields no products, if sanitization fails, or if the
        intermediate lacks the functional group a step requires.
    """
    BB1 = Chem.MolFromSmiles(bb1)
    BB2 = Chem.MolFromSmiles(bb2)
    BB3 = Chem.MolFromSmiles(bb3)

    # MolFromSmiles returns None for unparsable input; fail with the exception
    # type callers already handle instead of an AttributeError further down.
    for label, smiles, mol in (("bb1", bb1, BB1), ("bb2", bb2, BB2), ("bb3", bb3, BB3)):
        if mol is None:
            raise ValueError(f"Invalid SMILES for {label}: {smiles}")

    # The boronate route applies when bb1 has a halogen, bb2 a boronate and bb3
    # a carboxylic acid that is not part of a trifluoroacetic-acid fragment.
    uses_boronate_route = (
        BB1.HasSubstructMatch(HALOGEN)
        and BB2.HasSubstructMatch(BORONATE)
        and BB3.HasSubstructMatch(ACID)
        and not BB3.HasSubstructMatch(FLUORIDE_ACID)
    )

    result = BB1
    if BB1.HasSubstructMatch(FMOC) and not uses_boronate_route:
        # Use FMOC_TRIAZINE_REACTION to replace FMOC with triazine
        products = FMOC_TRIAZINE_REACTION.RunReactants([BB1])
        if len(products) == 0:
            # Report the SMILES string; formatting the Mol object would only
            # print its memory address.
            raise ValueError(f"No products were generated from fmoc triazine reaction - bb1: {bb1}")
        result = products[0][0]
        sanitize_molecule(result)

    if not result.HasSubstructMatch(ACID):
        raise ValueError("Can't attach DNA to first building block")
    # Replace substructs COOH with CONHDy. ReplaceSubstructs returns one
    # molecule per match; the last one is kept.
    result = Chem.ReplaceSubstructs(result, ACID, DNA)[-1]
    sanitize_molecule(result)

    if result.HasSubstructMatch(TRIAZINE):
        # Use NH2_TRIAZINE_REACTION1 to add BB2 to triazine
        products = NH2_TRIAZINE_REACTION1.RunReactants((result, BB2))
        if len(products) == 0:
            raise ValueError("No products were generated from nh2 triazine reaction 1")
        result = products[0][0]
        sanitize_molecule(result)

        # Use NH2_TRIAZINE_REACTION2 to add BB3 to triazine
        products = NH2_TRIAZINE_REACTION2.RunReactants((result, BB3))
        if len(products) == 0:
            raise ValueError("No products were generated from nh2 triazine reaction 2")
        result = products[0][0]
        sanitize_molecule(result)
    elif result.HasSubstructMatch(HALOGEN):
        result = boronate_cooh_reactions(result, BB2=BB2, BB3=BB3)
    else:
        raise ValueError("Molecule does not contain triazine or halogen group")

    return Chem.MolToSmiles(result)


def boronate_cooh_reactions(result: Chem.Mol, BB2: Chem.Mol, BB3: Chem.Mol) -> Chem.Mol:
    """Attach BB2 through the boronate/halide reaction, then BB3 through a COOH reaction.

    :param result: The molecule built so far
    :param BB2: The second building block, run as the second reactant of
        BORONATE_HALIDE_REACTION
    :param BB3: The third building block, run as the second reactant of the
        COOH reaction
    :return: The sanitized molecule after both reactions.
    :raises ValueError: If either reaction yields no products, if the coupled
        intermediate matches neither FMOC nor BOC, or if sanitization fails.
    """
    # Step 1: boronate/halide coupling of BB2 onto the current molecule.
    coupling_products = BORONATE_HALIDE_REACTION.RunReactants((result, BB2))
    if not coupling_products:
        raise ValueError("No products were generated from boronate halide reaction")
    coupled = coupling_products[0][0]
    sanitize_molecule(coupled)

    # Step 2: choose the COOH reaction from the protecting group present on the
    # coupled intermediate; FMOC is checked before BOC.
    if coupled.HasSubstructMatch(FMOC):
        cooh_reaction = COOH_FMOC_REACTION
        no_product_message = "No products generated from COOH_FMOC_REACTION"
    elif coupled.HasSubstructMatch(BOC):
        cooh_reaction = COOH_BOC_REACTION
        no_product_message = "No products were generated from COOH_BOC_REACTION"
    else:
        raise ValueError("BB1 has no BOC or FMOC group for COOH reaction")

    cooh_products = cooh_reaction.RunReactants((coupled, BB3))
    if not cooh_products:
        raise ValueError(no_product_message)
    final_molecule = cooh_products[0][0]
    sanitize_molecule(final_molecule)

    return final_molecule

In [77]:
import duckdb
# Maximum number of training rows to load (used as the SQL LIMIT below).
nrows = 100000
# Parquet locations, relative to the notebook's working directory.
train_path = "../data/raw/train.parquet"
# NOTE(review): test_path is not used by any cell visible here.
test_path = "../data/raw/test.parquet"
# Let duckdb read the first `nrows` rows straight from the parquet file and
# convert the result to a DataFrame via .df().
df = duckdb.query(f'SELECT * FROM "{train_path}" LIMIT {nrows};').df()

In [73]:
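# Optional (disabled): skip the first 1,000,000 rows of df.
# NOTE: this leaves an empty frame unless df was loaded with more than 1,000,000 rows.
# df = df.iloc[1000000:]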

In [78]:
# Check the molecule generated from each row's building blocks against the
# dataset's molecule_smiles and collect every mismatch. Each mismatch becomes
# one grid row: bb1, bb2, bb3, generated molecule, target molecule.
# NOTE: the check passes when the target is a substructure of the generated
# molecule (HasSubstructMatch); this is weaker than strict equality.
grid = []
for i, row in df.iterrows():
    try:
        molecule = bbs_to_molecule(row["buildingblock1_smiles"], row["buildingblock2_smiles"], row["buildingblock3_smiles"])
    except ValueError as e:
        print(f"Row {i} failed with error: {e}")
        continue
    molecule = Chem.MolFromSmiles(molecule)
    if molecule is None:
        print(f"Row {i} has invalid molecule")
        continue
    target_molecule = Chem.MolFromSmiles(row["molecule_smiles"])
    if not molecule.HasSubstructMatch(target_molecule):
        # Building blocks are only needed for drawing, so parse them lazily
        # instead of once per row.
        bb1 = Chem.MolFromSmiles(row["buildingblock1_smiles"])
        bb2 = Chem.MolFromSmiles(row["buildingblock2_smiles"])
        bb3 = Chem.MolFromSmiles(row["buildingblock3_smiles"])
        grid.append([bb1, bb2, bb3, molecule, target_molecule])


if len(grid) == 0:
    print("All reactions generated the correct molecule")
else:
    # Use the explicitly imported Draw module: `Chem.Draw` is only available
    # as an attribute once rdkit.Chem.Draw has been imported.
    # maxMols=500 caps the image at 100 mismatching rows (5 molecules each).
    img = Draw.MolsToGridImage([mol for row in grid for mol in row],
                               molsPerRow=5, maxMols=500, subImgSize=(300, 300))
    display(img)

All reactions generated the correct molecule


In [59]:
# Debug a single row that failed: re-run the conversion and, if it raises,
# print the error and draw bb1, bb2, bb3 and the target molecule side by side.
row = 75039  # index label of the row under investigation
row_data = df.loc[row]
row_grid = []
try:
    molecule = bbs_to_molecule(row_data["buildingblock1_smiles"], row_data["buildingblock2_smiles"], row_data["buildingblock3_smiles"])
except ValueError as e:
    print(f"Row {row} failed with error: {e}")
    bb1 = Chem.MolFromSmiles(row_data["buildingblock1_smiles"])
    bb2 = Chem.MolFromSmiles(row_data["buildingblock2_smiles"])
    bb3 = Chem.MolFromSmiles(row_data["buildingblock3_smiles"])
    target_molecule = Chem.MolFromSmiles(row_data["molecule_smiles"])
    row_grid.extend([bb1, bb2, bb3, target_molecule])
    # Use the explicitly imported Draw module: `Chem.Draw` is only available
    # as an attribute once rdkit.Chem.Draw has been imported.
    img = Draw.MolsToGridImage(row_grid, molsPerRow=4, maxMols=500, subImgSize=(300, 300))
    display(img)

# Show the raw SMILES of building block 3 for this row
row_data["buildingblock3_smiles"]


'Nc1ccsc1.O=C(O)C(=O)O'