In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
# Turn off RDKit's 'rdApp.*' log channels so per-molecule messages do not flood the cell output.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Load the raw compound table; the CSV is decoded as GB2312.
data = pd.read_csv('GABAA_data.csv', encoding='gb2312')

In [3]:
# Show the loaded raw table (rich DataFrame display).
data

Unnamed: 0,name,smiles,source,class
0,"1-[(3-{6-chloroimidazo[1,2-a]pyridin-2-yl}benz...",Clc1ccc2nc(cn2c1)-c1cccc(c1)S(=O)(=O)N1CCCCCC1,Topscience,GABAA agonist
1,"1-[(3-{8-methylimidazo[1,2-a]pyridin-2-yl}benz...",Cc1cccn2cc(nc12)-c1cccc(c1)S(=O)(=O)N1CCCCCC1,Topscience,GABAA agonist
2,"3-(4-chlorophenyl)-6-(3,4,5-trimethoxyphenyl)-...",COc1cc(cc(OC)c1OC)-c1nn2c(nnc2s1)-c1ccc(Cl)cc1,Topscience,GABAA agonist
3,"2-[2-(4-chlorophenyl)imidazo[1,2-a]pyridin-3-y...",OC(=O)Cc1c(nc2ccccn12)-c1ccc(Cl)cc1,Topscience,GABAA agonist
4,"2-[10-(5-chloro-2-methylphenyl)-3,4,6,8,10,11-...",Cc1ccc(Cl)cc1-n1ncc2c1ncn1c(nnc21)-c1ccccc1O,Topscience,GABAA agonist
...,...,...,...,...
1246,Tetrodotoxin,C(C1(C2C3C(N=C(NC34C(C1OC(C4O)(O2)O)O)N)O)O)O,Literature,GABAA antagonist
1247,Pentetrazol,C1CCC2=NN=NN2CC1,Literature,GABAA antagonist
1248,Salicylidene Salicylhydrazide,C1=CC=C(C(=C1)/C=N/NC(=O)C2=CC=CC=C2O)O,Literature,GABAA antagonist
1249,RU5135,C[C@]12CC[C@H](C[C@H]1CC[C@@H]3[C@@H]2C(=O)C[C...,Literature,GABAA antagonist


# 标准化

In [4]:
# Pull out the SMILES column; these are the strings that get standardized.
gabaa_smiles = data.loc[:, 'smiles']

In [5]:
def standardize_smi(smiles, basicClean=True, clearCharge=True, clearFrag=True,
                    canonTautomer=True, isomeric=False):
    """Standardize one SMILES string with RDKit and return the resulting SMILES.

    The input is parsed and then passed through the enabled steps in this
    order: cleanup, neutralization, tautomer canonicalization. Errors never
    propagate to the caller: the problem is printed together with the
    offending input and ``None`` is returned, so a batch loop keeps running.

    Parameters
    ----------
    smiles : str
        SMILES string to standardize. Non-string values (e.g. ``None`` or the
        NaN pandas produces for an empty CSV cell) are reported and give
        ``None``.
    basicClean : bool, default True
        Run ``rdMolStandardize.Cleanup`` on the parsed molecule.
    clearCharge : bool, default True
        Neutralize the molecule with ``rdMolStandardize.Uncharger``.
    clearFrag : bool, default True
        Accepted for backward compatibility only. The fragment-parent step it
        used to control is disabled, so this flag currently has no effect.
    canonTautomer : bool, default True
        Replace the molecule with the canonical tautomer chosen by
        ``rdMolStandardize.TautomerEnumerator``.
    isomeric : bool, default False
        Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``: ``True``
        keeps stereochemistry in the output, ``False`` removes it.

    Returns
    -------
    str or None
        The standardized SMILES, or ``None`` if the input could not be
        parsed or any standardization step failed.
    """
    # Reject non-string input up front with a readable message instead of
    # letting it surface as an opaque argument error from the RDKit bindings.
    if not isinstance(smiles, str):
        print('SMILES must be a string, got %s:' % type(smiles).__name__, smiles)
        return None

    try:
        clean_mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising, so turn that into an explicit, descriptive error here.
        if clean_mol is None:
            raise ValueError('could not parse SMILES:')

        # Basic cleanup: hydrogens, metal atoms, normalization of the molecule.
        if basicClean:
            clean_mol = rdMolStandardize.Cleanup(clean_mol)

        # Keeping only the main fragment (rdMolStandardize.FragmentParent) is
        # disabled, which is why `clearFrag` is not consulted.

        # Try to neutralize the molecule.
        if clearCharge:
            uncharger = rdMolStandardize.Uncharger()
            clean_mol = uncharger.uncharge(clean_mol)

        # Handle tautomers; this step may not be perfect in some cases.
        if canonTautomer:
            te = rdMolStandardize.TautomerEnumerator()
            clean_mol = te.Canonicalize(clean_mol)

        # Write the molecule back out as SMILES; `isomeric` decides whether
        # stereo information is kept (True) or removed (False).
        stan_smiles = Chem.MolToSmiles(clean_mol, isomericSmiles=isomeric)
    except Exception as e:
        # Best-effort by design: report the failure and let the caller go on.
        print(e, smiles)
        return None
    return stan_smiles

In [6]:
# Standardize every SMILES in order; entries that fail come back as None.
stan_smiles_list = [standardize_smi(smiles) for smiles in gabaa_smiles]

In [7]:
# Keep the descriptive columns of the raw table and swap in the standardized SMILES.
stan_df = (
    data[['name', 'smiles', 'source', 'class']]
    .assign(smiles=stan_smiles_list)
)

In [8]:
# Encode the activity label as a binary target: agonist -> 1, antagonist -> 0.
class_codes = {'GABAA agonist': 1, 'GABAA antagonist': 0}
stan_df['class'] = stan_df['class'].replace(class_codes)

In [9]:
# Tally the rows per encoded class and report both totals.
class_column = stan_df['class']
counts = class_column.value_counts()

n_ones = counts[1]
print('Number of 1s:', n_ones)
n_zeros = counts[0]
print('Number of 0s:', n_zeros)

Number of 1s: 609
Number of 0s: 642


In [10]:
# Show the standardized table, now carrying the numerically encoded class column.
stan_df

Unnamed: 0,name,smiles,source,class
0,"1-[(3-{6-chloroimidazo[1,2-a]pyridin-2-yl}benz...",O=S(=O)(c1cccc(-c2cn3cc(Cl)ccc3n2)c1)N1CCCCCC1,Topscience,1
1,"1-[(3-{8-methylimidazo[1,2-a]pyridin-2-yl}benz...",Cc1cccn2cc(-c3cccc(S(=O)(=O)N4CCCCCC4)c3)nc12,Topscience,1
2,"3-(4-chlorophenyl)-6-(3,4,5-trimethoxyphenyl)-...",COc1cc(-c2nn3c(-c4ccc(Cl)cc4)nnc3s2)cc(OC)c1OC,Topscience,1
3,"2-[2-(4-chlorophenyl)imidazo[1,2-a]pyridin-3-y...",O=C(O)Cc1c(-c2ccc(Cl)cc2)nc2ccccn12,Topscience,1
4,"2-[10-(5-chloro-2-methylphenyl)-3,4,6,8,10,11-...",Cc1ccc(Cl)cc1-n1ncc2c1ncn1c(-c3ccccc3O)nnc21,Topscience,1
...,...,...,...,...
1246,Tetrodotoxin,NC1=NC(O)C2C3OC4(O)OC(C(O)C2(N1)C4O)C3(O)CO,Literature,0
1247,Pentetrazol,C1CCc2nnnn2CC1,Literature,0
1248,Salicylidene Salicylhydrazide,O=C(NN=Cc1ccccc1O)c1ccccc1O,Literature,0
1249,RU5135,CC12CC(=O)C3C(CCC4CC(O)CCC43C)C1CC(=N)N2,Literature,0


In [12]:
# Persist the standardized table; the DataFrame index is written as the first column.
stan_df.to_csv(path_or_buf='stan_data.csv')