In [1]:
import pandas as pd
from io import StringIO
from rdkit import Chem
from collections import Counter

A couple of utility functions

In [2]:
def clear_substructure_match(mol):
    """Clear substructure highlights

    Overwrites the ``__sssAtoms`` attribute of the molecule with an empty list.

    :param mol: input molecule (modified in place)
    :return: None
    """
    # Outside a class body there is no name mangling, so the attribute
    # name is literally "__sssAtoms".
    setattr(mol, "__sssAtoms", [])
    
def clear_atom_labels(mol):
    """Clear atom labels

    Sets the ``atomNote`` property of every atom in the molecule to an empty
    string.

    :param mol: input molecule (modified in place)
    :return: None
    """
    # The previous implementation called ``uru.label_atoms``, but no ``uru``
    # import exists in the notebook's import cell, so the call raised
    # NameError on a fresh kernel.
    # NOTE(review): assumes ``uru`` was ``useful_rdkit_utils`` and that its
    # ``label_atoms`` stores labels in the ``atomNote`` property - confirm.
    for atom in mol.GetAtoms():
        atom.SetProp("atomNote", "")

Read the input data from an Excel sheet

In [4]:
# Input workbook, fetched directly from the az_colab repository on GitHub
DATA_URL = "https://github.com/PatWalters/az_colab/raw/main/dhv_at_tb-1_11_12.xlsx"
df = pd.read_excel(DATA_URL)

In [5]:
# Preview the first few entries of the SMILES column
df["smile"].head()

0                        C(C(C)C)O
1        C1=CC(=C(C(=C1OC)Cl)Br)Cl
2        C1=CC(=C(C(=C1OC)Cl)Cl)Br
3    C1(=C(C(=C(C=C1OC)Cl)Cl)Cl)Br
4    C1(=C(C=C(C(=C1OC)Cl)Cl)Br)Cl
Name: smile, dtype: object

Define a set of SMARTS rules for matching atom types

In [6]:
# One rule per line: <atom type name>,<SMARTS pattern>.
# Order matters: assign_atom_types walks the rules top to bottom and a later
# match overwrites an earlier one on the same atom, so general rules come
# first and more specific ones after.
rules = """xCH3,[C;$([CH3]A)]
xCH2,[C;$([CH2])]
rCH,[C;$([CR1H1])]
rCH2,[C;$([CR1H2])]
CH#,[C;$([CH1]#*)]
CH2=,[C;$([CH2]=*)]
CH=,[C;$([CH1]([A^3])=A)]
aCH,[c;$([cH1])]
aC,[c;$([cH0])]
yCH,[C;$([CH]([*^2]))]
yCH2,[C;$([CH2]([*^2]))]
yyCH2,[C;$([CH2]([*^2])([*^2]))]
yyCH,[C;$([CH]([A^2])([A^2]))]
yCH3,[C;$([CH3]a)]
aaaC,[c;$(c(c)(c)(c))]
bip,[c;$([cr6]-[cr6])]
"""
rules_csv = StringIO(rules)
# No ``comment=`` argument is passed, so the '#' in "CH#" is kept as data.
rules_df = pd.read_csv(rules_csv,names=["Name","SMARTS"])
rules_df['pat'] = rules_df.SMARTS.apply(Chem.MolFromSmarts)
# Chem.MolFromSmarts returns None for a pattern it cannot parse instead of
# raising; fail here, naming the rule, rather than later during matching.
bad_rules = rules_df.loc[rules_df["pat"].isna(), "Name"].tolist()
if bad_rules:
    raise ValueError(f"Could not parse SMARTS for rule(s): {bad_rules}")
rules_df

Unnamed: 0,Name,SMARTS,pat
0,xCH3,[C;$([CH3]A)],<rdkit.Chem.rdchem.Mol object at 0x1254b72e0>
1,xCH2,[C;$([CH2])],<rdkit.Chem.rdchem.Mol object at 0x1254b79a0>
2,rCH,[C;$([CR1H1])],<rdkit.Chem.rdchem.Mol object at 0x1254b7160>
3,rCH2,[C;$([CR1H2])],<rdkit.Chem.rdchem.Mol object at 0x1254b7ee0>
4,CH#,[C;$([CH1]#*)],<rdkit.Chem.rdchem.Mol object at 0x1254b70a0>
5,CH2=,[C;$([CH2]=*)],<rdkit.Chem.rdchem.Mol object at 0x1254b7f40>
6,CH=,[C;$([CH1]([A^3])=A)],<rdkit.Chem.rdchem.Mol object at 0x1254b7100>
7,aCH,[c;$([cH1])],<rdkit.Chem.rdchem.Mol object at 0x1254b7220>
8,aC,[c;$([cH0])],<rdkit.Chem.rdchem.Mol object at 0x1254b7e20>
9,yCH,[C;$([CH]([*^2]))],<rdkit.Chem.rdchem.Mol object at 0x1254b7d00>


Define the standard ordering of atom types (a map from each rule name to its column index)

In [7]:
# Rule names in table order, plus a lookup from each name to its row index
# in rules_df (used as the position of that type in the count vectors).
name_list = rules_df["Name"]
name_dict = {rule_name: position for position, rule_name in name_list.items()}

Define a function to report atom types

In [8]:
def assign_atom_types(mol, name_dict):
    """Count the atoms of a molecule that fall into each atom type

    Every pattern in the module-level ``rules_df`` is matched against the
    molecule in table order. When several rules hit the same atom, the rule
    that comes last in the table wins. Atoms hit by no rule are not counted.

    :param mol: input molecule
    :param name_dict: dictionary mapping each rule name to its position in the returned list
    :return: list of atom counts, one entry per key of name_dict
    """
    atom_labels = [""] * mol.GetNumAtoms()
    for rule_name, _smarts, query in rules_df.values:
        for hit in mol.GetSubstructMatches(query):
            # only the first atom of each match is labelled
            atom_labels[hit[0]] = rule_name
    # discard the match highlighting left on the molecule
    clear_substructure_match(mol)
    counts = [0] * len(name_dict)
    for label in atom_labels:
        # an empty label means no rule matched this atom
        if len(label):
            counts[name_dict[label]] += 1
    return counts

Make a test dataframe with the first 100 rows 

In [35]:
# Positional slice: the first 100 rows of the input table
test_df = df.iloc[:100]

Batch process molecules from the Excel file

In [36]:
# One list of atom-type counts per input row, in the same order as test_df
type_list = []
for smiles in test_df.smile.values:
    mol = Chem.MolFromSmiles(smiles)
    # Chem.MolFromSmiles returns None for a SMILES it cannot parse; without
    # this check the failure shows up inside assign_atom_types as an
    # AttributeError that does not say which input was bad.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    type_list.append(assign_atom_types(mol,name_dict))

Put all the data together

In [37]:
# Build the count table on test_df's own index. pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index would silently misalign the counts
# whenever test_df is not indexed 0..n-1 (e.g. a filtered subset).
count_df = pd.DataFrame(type_list, columns=name_list, index=test_df.index)
# Identifier columns, with the SMILES column renamed for the output table
name_df = test_df[["smile", "Name"]].rename(columns={"smile": "SMILES"})
res_df = pd.concat([name_df, count_df], axis=1)
res_df

Unnamed: 0,SMILES,Name,xCH3,xCH2,rCH,rCH2,CH#,CH2=,CH=,aCH,aC,yCH,yCH2,yyCH2,yyCH,yCH3,aaaC,bip
0,C(C(C)C)O,"2,2-dimethylethanol",2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,C1=CC(=C(C(=C1OC)Cl)Br)Cl,"3-bromo-2,4-dichloroanisole",1,0,0,0,0,0,0,2,4,0,0,0,0,0,0,0
2,C1=CC(=C(C(=C1OC)Cl)Cl)Br,"4-bromo-2,3-dichloroanisole",1,0,0,0,0,0,0,2,4,0,0,0,0,0,0,0
3,C1(=C(C(=C(C=C1OC)Cl)Cl)Cl)Br,"2-bromo-3,4,5-trichloroanisole",1,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0
4,C1(=C(C=C(C(=C1OC)Cl)Cl)Br)Cl,"3-bromo-2,5,6-trichloroanisole",1,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,C(OCc1ccccc1)c2ccccc2,dibenzyl ether,0,0,0,0,0,0,0,10,2,0,2,0,0,0,0,0
96,BrCCc1ccccc1,2-bromoethyl benzene,0,1,0,0,0,0,0,5,1,0,1,0,0,0,0,0
97,CCCc1ccccc1,n-Propylbenzene,1,1,0,0,0,0,0,5,1,0,1,0,0,0,0,0
98,CCNc1ccccc1,N-Ethylbenzenamine,1,0,0,0,0,0,0,5,1,0,1,0,0,0,0,0


Write the data to a CSV file

In [38]:
# The row index is written as the first (unnamed) column, pandas' default
res_df.to_csv(path_or_buf="results.csv")