In [18]:
import pandas as pd
from io import StringIO
from rdkit import Chem
from collections import Counter

A couple of utility functions

In [16]:
def clear_substructure_match(mol):
    """Reset the substructure highlight list stored on a molecule

    Overwrites the ``__sssAtoms`` attribute of the molecule with an empty
    list, discarding whatever was stored there before.
    :param mol: input molecule (modified in place)
    :return: None
    """
    # Outside a class body the double-underscore name is not mangled, so the
    # literal attribute name is the one that gets written.
    setattr(mol, "__sssAtoms", [])
    
def clear_atom_labels(mol):
    """Blank out the label of every atom
    :param mol: input molecule
    :return: None
    """
    # NOTE(review): `uru` is not imported in any cell shown in this notebook;
    # presumably it is `useful_rdkit_utils` -- confirm and add the import.
    empty_labels = ["" for _ in range(mol.GetNumAtoms())]
    # One empty string per atom is handed to the labelling helper.
    uru.label_atoms(mol, empty_labels)

Read the input data from an Excel sheet

In [2]:
# Load the molecule table from the Excel workbook; later cells read SMILES
# strings from its `smile` column.
df = pd.read_excel("dhm_at_tm-1_11_22.xlsx")

In [4]:
# Preview the first few SMILES strings.
df["smile"].head()

0              Clc1cc(Cl)cc(Cl)c1
1              Brc1cc(Br)cc(Br)c1
2    ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl
3                  Brc1ccc(Br)cc1
4                  Clc1ccc(Cl)cc1
Name: smile, dtype: object

Define a set of SMARTS rules for matching atom types

In [11]:
# Atom-type rules, one per line as "<name>,<SMARTS>".
# Order matters: assign_atom_types lets a later rule overwrite an earlier one
# when both match the same atom.
rules = """xCH3,[C;$([CH3]A)]
xCH2,[C;$([CH2])]
rCH,[C;$([CR1H1])]
rCH2,[C;$([CR1H2])]
CH#,[C;$([CH1]#*)]
CH2=,[C;$([CH2]=*)]
CH=,[C;$([CH1]([A^3])=A)]
aCH,[c;$([cH1])]
aC,[c;$([cH0])]
yCH,[C;$([CH]([*^2]))]
yCH2,[C;$([CH2]([*^2]))]
yyCH2,[C;$([CH2]([*^2])([*^2]))]
yyCH,[C;$([CH]([A^2])([A^2]))]
yCH3,[C;$([CH3]a)]
aaaC,[c;$(c(c)(c)(c))]
bip,[c;$([cr6]-[cr6])]
"""
rules_csv = StringIO(rules)
rules_df = pd.read_csv(rules_csv,names=["Name","SMARTS"])
# Compile each SMARTS string once so the query molecules can be reused for
# every input molecule.
rules_df['pat'] = rules_df.SMARTS.apply(Chem.MolFromSmarts)
# Chem.MolFromSmarts returns None for a SMARTS it cannot parse. Fail here,
# naming the offending rules, instead of erroring later during matching.
unparsed_rules = rules_df.loc[rules_df['pat'].isna(), 'Name'].tolist()
if unparsed_rules:
    raise ValueError(f"Could not parse SMARTS for rule(s): {unparsed_rules}")
rules_df

Unnamed: 0,Name,SMARTS,pat
0,xCH3,[C;$([CH3]A)],<rdkit.Chem.rdchem.Mol object at 0x12989c4c0>
1,xCH2,[C;$([CH2])],<rdkit.Chem.rdchem.Mol object at 0x1297ef760>
2,rCH,[C;$([CR1H1])],<rdkit.Chem.rdchem.Mol object at 0x1297ef940>
3,rCH2,[C;$([CR1H2])],<rdkit.Chem.rdchem.Mol object at 0x1298c8dc0>
4,CH#,[C;$([CH1]#*)],<rdkit.Chem.rdchem.Mol object at 0x1298c8c40>
5,CH2=,[C;$([CH2]=*)],<rdkit.Chem.rdchem.Mol object at 0x1298c8ee0>
6,CH=,[C;$([CH1]([A^3])=A)],<rdkit.Chem.rdchem.Mol object at 0x10eaa2dc0>
7,aCH,[c;$([cH1])],<rdkit.Chem.rdchem.Mol object at 0x12ba1d1c0>
8,aC,[c;$([cH0])],<rdkit.Chem.rdchem.Mol object at 0x12ba1d100>
9,yCH,[C;$([CH]([*^2]))],<rdkit.Chem.rdchem.Mol object at 0x12ba1d220>


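Define a standard set of atom types: map each rule name to its position in the rules table, which fixes the column order of the results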

In [26]:
# Rule names in table order, plus a lookup from each name to its index label
# in rules_df.
name_list = rules_df["Name"]
name_dict = {rule_name: position for position, rule_name in zip(name_list.index, name_list)}

Define a function to report atom types

In [87]:
def assign_atom_types(mol, name_dict):
    """Count how many atoms of a molecule fall under each atom type

    Every pattern in the module-level ``rules_df`` is matched against the
    molecule in table order; when several rules hit the same atom, the last
    one wins.  Atoms matched by no rule are not counted.
    :param mol: input molecule
    :param name_dict: mapping from rule name to its slot in the returned list
    :return: list of per-type atom counts, of length len(name_dict)
    """
    atom_types = [""] * mol.GetNumAtoms()
    for rule_name, _smarts, query in rules_df.values:
        for hit in mol.GetSubstructMatches(query):
            # Only the first atom index of each match is used.
            atom_types[hit[0]] = rule_name
    clear_substructure_match(mol)
    counts = [0] * len(name_dict)
    for rule_name, n_atoms in Counter(atom_types).items():
        # The empty string marks atoms that no rule matched; skip it.
        if len(rule_name) > 0:
            counts[name_dict[rule_name]] = n_atoms
    return counts

Batch-process the molecules loaded from the Excel file (the loop below is capped at the first 100 rows) and write the results to a CSV file.

In [102]:
# Compute the atom-type counts for each input SMILES, one list per molecule.
type_list = []
# NOTE(review): the slice caps processing at the first 100 rows -- confirm
# this limit is intended.
for position, smiles in enumerate(df.smile.values[:100]):
    # Chem.MolFromSmiles returns None when it cannot parse its input, and an
    # empty spreadsheet cell arrives as a non-string value.  Stop with a
    # message naming the row instead of failing inside assign_atom_types.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(f"Row {position}: cannot parse SMILES {smiles!r}")
    type_list.append(assign_atom_types(mol,name_dict))

In [106]:
# One row per processed molecule, one column per atom type.  Passing the
# `name_list` Series as `columns` is why the column index is titled "Name".
res_df = pd.DataFrame(type_list,columns=name_list)
res_df

Name,xCH3,xCH2,rCH,rCH2,CH#,CH2=,CH=,aCH,aC,yCH,yCH2,yyCH2,yyCH,yCH3,aaaC,bip
0,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0
2,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,4,2,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,4,2,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,6,4,0,0,0,0,0,0,2
96,0,0,0,0,0,0,0,2,10,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,3,7,0,0,0,0,0,0,2
98,0,0,0,0,0,0,0,5,5,0,0,0,0,0,0,2


In [107]:
# Write the atom-type counts to disk.  No `index=` argument is given, so the
# row index is written as the first column (pandas default).
res_df.to_csv("results.csv")