In [4]:
from rdkit import Chem

In [15]:
# 1. Define the molecule and the substructure query

# [1*]C(=O)C[4*].[4*]CC(O)COC(=O)CCCCCN[5*]|[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2] |0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5
smiles = "[4*]CC(O)COC(=O)CCCCCN[5*]"
smarts = "[N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]"

# 2. Create molecule objects from SMILES and SMARTS
mol = Chem.MolFromSmiles(smiles)
pat = Chem.MolFromSmarts(smarts)

# 3. Find all matches of the SMARTS pattern
matches = mol.GetSubstructMatches(pat)

# 4. Extract the SMILES for each match
if matches:
    for match in matches:
        # Create a new molecule object from the matched atoms
        subgraph_mol = Chem.PathToSubmol(mol, match)

        # Get the SMILES for the new molecule object
        substructure_smiles = Chem.MolToSmiles(subgraph_mol)
        print(f"Match atom indices: {match}")
        print(f"Substructure SMILES: {substructure_smiles}")
else:
    print("No matching substructure found.")

Match atom indices: (13,)
Substructure SMILES: [5*]N


In [None]:
environs = {
    '[1*]': '[C;D3]([#0,#6,#7,#8])(=O)',
    #
    # After some discussion, the L2 definitions ("N.pl3" in the original
    # paper) have been removed and incorporated into a (almost) general
    # purpose amine definition in L5 ("N.sp3" in the paper).
    #
    # The problem is one of consistency.
    #    Based on the original definitions you should get the following
    #    fragmentations:
    #      C1CCCCC1NC(=O)C -> C1CCCCC1N[2*].[1*]C(=O)C
    #      c1ccccc1NC(=O)C -> c1ccccc1[16*].[2*]N[2*].[1*]C(=O)C
    #    This difference just didn't make sense to us. By switching to
    #    the unified definition we end up with:
    #      C1CCCCC1NC(=O)C -> C1CCCCC1[15*].[5*]N[5*].[1*]C(=O)C
    #      c1ccccc1NC(=O)C -> c1ccccc1[16*].[5*]N[5*].[1*]C(=O)C
    #
    #'L2':'[N;!R;!D1;!$(N=*)]-;!@[#0,#6]',
    # this one turned out to be too tricky to define above, so we set it off
    # in its own definition:
    #'L2a':'[N;D3;R;$(N(@[C;!$(C=*)])@[C;!$(C=*)])]',
    '[3*]': '[O;D2]-;!@[#0,#6,#1]',
    '[4*]': '[C;!D1;!$(C=*)]-;!@[#6]',
    #'L5':'[N;!D1;!$(N*!-*);!$(N=*);!$(N-[!C;!#0])]-[#0,C]',
    '[5*]': '[N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]',
    '[6*]': '[C;D3;!R](=O)-;!@[#0,#6,#7,#8]',
    '[7*]': '[C;D2,D3]-[#6]',
    '[8*]': '[C;!R;!D1]-;!@[#6]',
    '[8*]': '[C;!R;!D1;!$(C!-*)]',
    '[9*]': '[n;+0;$(n(:[c,n,o,s]):[c,n,o,s])]',
    '[10*]': '[N;R;$(N(@C(=O))@[C,N,O,S])]',
    '[11*]': '[S;D2](-;!@[#0,#6])',
    '[12*]': '[S;D4]([#6,#0])(=O)(=O)',
    '[13*]': '[C;$(C(-;@[C,N,O,S])-;@[N,O,S])]',
    '[14*]': '[c;$(c(:[c,n,o,s]):[n,o,s])]',
    '[15*]': '[C;$(C(-;@C)-;@C)]',
    '[16*]': '[c;$(c(:c):c)]',
}

reactionDefs = (
    # L1
    ('1','3','-'),
    ('1','5','-'),
    ('1','10','-'),
    # L3
    ('3','4','-'),
    ('3','13','-'),
    ('3','14','-'),
    ('3','15','-'),
    ('3','16','-'),
    # L4
    ('4','5','-'),
    ('4','11','-'),
    # L5
    ('5','12','-'),
    ('5','14','-'),
    ('5','16','-'),
    ('5','13','-'),
    ('5','15','-'),
    # L6
    ('6','13','-'),
    ('6','14','-'),
    ('6','15','-'),
    ('6','16','-'),
    # L7
    ('7a','7b','='),
    # L8
    ('8','9','-'),
    ('8','10','-'),
    ('8','13','-'),
    ('8','14','-'),
    ('8','15','-'),
    ('8','16','-'),
    # L9
    ('9','13','-'),# not in original paper
    ('9','14','-'),# not in original paper
    ('9','15','-'),
    ('9','16','-'),
    # L10
    ('10','13','-'),
    ('10','14','-'),
    ('10','15','-'),
    ('10','16','-'),
    ('11','13','-'),
    ('11','14','-'),
    ('11','15','-'),
    ('11','16','-'),
    # L13
    ('13','14','-'),
    ('13','15','-'),
    ('13','16','-'),
    # L14
    ('14','14','-'),# not in original paper
    ('14','15','-'),
    ('14','16','-'),
    # L15
    ('15','16','-'),
    # L16
    ('16','16','-'), # not in original paper
)