In [1]:
import random
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import MolStandardize, AllChem

In [2]:
df_amino = pd.read_csv('../amino_name.csv')

In [3]:
properties = ['IsomericSMILES', 'XLogP']
ls = [[], [], []]
for name in df_amino.name.tolist():
    dic = pcp.get_properties(properties, name, 'name')[0]
    for i, prop in enumerate(dic.keys()):
        ls[i].append(dic[prop])

In [4]:
df = pd.DataFrame()
for i, k in enumerate(dic.keys()):
    df[k] = ls[i]
df['name'] = df_amino.name
df

Unnamed: 0,CID,IsomericSMILES,XLogP,name
0,5950,C[C@@H](C(=O)O)N,-3.0,alanine
1,6322,C(C[C@@H](C(=O)O)N)CN=C(N)N,-4.2,arginine
2,6267,C([C@@H](C(=O)O)N)C(=O)N,-3.4,asparagine
3,5960,C([C@@H](C(=O)O)N)C(=O)O,-2.8,aspartic acid
4,5862,C([C@@H](C(=O)O)N)S,-2.5,cysteine
5,5961,C(CC(=O)N)[C@@H](C(=O)O)N,-3.1,glutamine
6,33032,C(CC(=O)O)[C@@H](C(=O)O)N,-3.7,glutamic acid
7,750,C(C(=O)O)N,-3.2,glycine
8,6274,C1=C(NC=N1)C[C@@H](C(=O)O)N,-3.2,histidine
9,6306,CC[C@H](C)[C@@H](C(=O)O)N,-1.7,isoleucine


In [5]:
mols = [Chem.MolFromSmiles(mol) for mol in df.IsomericSMILES]
normalizer = MolStandardize.normalize.Normalizer()
norm_mols = [normalizer.normalize(mol) for mol in mols]

lfc = MolStandardize.fragment.LargestFragmentChooser()
# 脱塩
mol_desalts = [lfc.choose(mol) for mol in norm_mols]

# 中性化
uc = MolStandardize.charge.Uncharger()
mol_neus = [uc.uncharge(mol) for mol in mol_desalts]

In [6]:
mol_hs = [Chem.AddHs(mol) for mol in mol_neus]

In [7]:
def MMFFOpt(mol_hs: list) -> tuple:
    solid_fail_index_list = []
    opt_fail_index_list = []
    for i, m in enumerate(mol_hs):
        num = AllChem.EmbedMolecule(m, AllChem.ETKDG())
        if num != 0:
            solid_fail_index_list.append(i)
            print("3D Fail {}".format(i))
            continue
        num = AllChem.MMFFOptimizeMolecule(m)
        if num != 0:
            opt_fail_index_list.append(i)
            print("Opt Fail {}".format(i))
    return solid_fail_index_list, opt_fail_index_list

In [8]:
not_solid_index, not_opt_index = MMFFOpt(mol_hs)

In [9]:
len(not_solid_index), len(not_solid_index)

(0, 0)

In [17]:
w = Chem.SDWriter('../amino.sdf')
for m in mol_hs: 
    w.write(m)
w.close()

In [11]:
idx_list = set()

while len(idx_list) < len(df)//2:
    idx_list.add(random.randint(0, len(df)))

In [12]:
idx_list

{6, 7, 9, 11, 13, 14, 15, 16, 18, 20}

In [13]:
df['split'] = ['train' if i in idx_list else 'test' for i in range(len(df))]

In [22]:
df['XLogP'] = df['XLogP'].apply(lambda x: 1 if x > -2.85 else 0)

In [28]:
df.rename(columns={'XLogP': 'y'}, inplace=True)

In [29]:
df.to_csv('../amino.csv', index=False, encoding='utf-8')