In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import typing
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from typing import Union, List, Tuple
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import GraphDescriptors
from tqdm import tqdm

def count_atoms(smils: Chem.Mol, atomic_value: int) -> int:
    """Count the atoms of one element in a molecule.

    Args:
        smils: Molecule object (not a SMILES string, despite the name); only
            its ``GetAtoms()`` method is used here.
        atomic_value: Atomic number to look for (e.g. 6 for carbon).

    Returns:
        Number of atoms whose ``GetAtomicNum()`` equals ``atomic_value``.
    """
    return sum(
        1 for member in smils.GetAtoms() if member.GetAtomicNum() == atomic_value
    )

# Load the raw table and work on a copy, so the columns added below do not
# silently modify `ion_database` through a shared reference.
ion_database = pd.read_csv("data.csv")
only_name = ion_database.copy()
print(only_name)
#only_name.drop_duplicates(subset=['Name'],keep='first',inplace=True)

# Parse every SMILES string into an RDKit molecule object.
only_name["RDKit_Mol_Class"] = only_name["SMILES"].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse; stop here with
# a readable message instead of failing later with an AttributeError on None.
unparsed_smiles = only_name.loc[only_name["RDKit_Mol_Class"].isna(), "SMILES"]
if not unparsed_smiles.empty:
    raise ValueError(
        f"{len(unparsed_smiles)} SMILES string(s) could not be parsed, "
        f"e.g. {unparsed_smiles.tolist()[:5]}"
    )

#Count the number of atoms
# Element symbol -> atomic number, kept in one mapping so a symbol can never
# be paired with the wrong number. Insertion order fixes the column order.
ATOMIC_NUMBERS = {"C": 6, "N": 7, "O": 8, "P": 15, "Si": 14, "S": 16, "F": 9}
for atom, value in tqdm(ATOMIC_NUMBERS.items(), total=len(ATOMIC_NUMBERS)):
    only_name[f"{atom}_count"] = only_name["RDKit_Mol_Class"].apply(count_atoms, atomic_value=value)

#Optionally drop molecules that contain certain elements (currently disabled)
#only_name_conf = only_name.copy()
#for atom in tqdm([x.strip() for x in "Cl, Br, I, P, B".split(",")]):
#    only_name_conf.drop(only_name_conf[only_name_conf[f"{atom}_count"]>0].index, inplace=True)
#    only_name_conf.drop(columns=f"{atom}_count", inplace=True)

#only_name_conf.to_csv('new_data1.csv', encoding='utf8')
# NOTE(review): the "RDKit_Mol_Class" column holds molecule objects, so the CSV
# will contain their text representation rather than chemical data; confirm
# whether that column should be exported at all.
only_name.to_csv('new_data1.csv', encoding='utf8')


    SMILES
0     [Mn]
1     [Mn]
2     [Mn]
3     [Mn]
4     [Mn]
..     ...
264    [C]
265    [C]
266    [C]
267    [C]
268    [C]

[269 rows x 1 columns]


7it [00:00, 194.80it/s]


In [2]:
from rdkit.Chem import QED
# Separate working frame: the descriptor columns added in later cells go here
# rather than onto `only_name`.
only_name_conf = only_name.copy(deep=True)

In [3]:


#QED molecular descriptors
def count_qeds(smils):
    """Return the QED property set of a molecule as a plain tuple.

    Args:
        smils: Object handed straight to ``QED.properties`` (the caller in
            this notebook passes entries of the "RDKit_Mol_Class" column).

    Returns:
        An 8-tuple ``(MW, ALOGP, HBA, HBD, PSA, ROTB, AROM, ALERTS)``.

    Raises:
        ValueError: If ``QED.properties`` does not yield exactly eight values.
    """
    mw, alogp, hba, hbd, psa, rotb, arom, alerts = QED.properties(smils)
    return mw, alogp, hba, hbd, psa, rotb, arom, alerts

#only_name["QED"] = database.apply(count_qeds)
# One tuple of QED properties per molecule, stored in a single "QED" column.
only_name_conf["QED"] = only_name_conf["RDKit_Mol_Class"].apply(count_qeds)

# Overwrite rather than append: with mode='a' every re-run of this cell added
# another header line and a duplicate copy of all rows to the same file.
only_name_conf.to_csv("new_data2.csv", index=False)


from rdkit.Chem import rdMolDescriptors

#Count electron number
def count_electron(smils):
    """Compute electron counts and polar surface area for a molecule.

    Args:
        smils: Object passed unchanged to the RDKit descriptor functions (the
            caller in this notebook passes "RDKit_Mol_Class" entries).

    Returns:
        A 3-tuple ``(re, ve, TPSA)``: number of radical electrons, number of
        valence electrons, and topological polar surface area.
    """
    return (
        Descriptors.NumRadicalElectrons(smils),   # free-radical electrons
        Descriptors.NumValenceElectrons(smils),   # valence electrons
        rdMolDescriptors.CalcTPSA(smils),         # topological polar surface area
    )

# NOTE: despite its name, the "re" column holds the whole (re, ve, TPSA) tuple
# returned by count_electron, not just the radical-electron count.
only_name_conf["re"] = only_name_conf["RDKit_Mol_Class"].apply(count_electron)

# Overwrite rather than append: with mode='a' every re-run of this cell added
# another header line and a duplicate copy of all rows to the same file.
only_name_conf.to_csv("new_data3.csv", index=False)



In [4]:
from rdkit.ML.Descriptors import MoleculeDescriptors
# from rdkit import DataStructs
#from rdkit.Chem import MACCSkeys
# Names of all RDKit descriptors (first element of each Descriptors._descList entry).
descriptors_names = [entry[0] for entry in Descriptors._descList]
myname = 'new_data5.csv'
import csv

# Start the output file with a single header row of descriptor names.
# newline='' is what the csv module expects for files it writes to; without it
# an extra blank line appears after every row on Windows.
with open(myname, 'w', newline='') as f:
    data = csv.writer(f)
    data.writerow(descriptors_names)

# Calculator that evaluates every descriptor in `descriptors_names` for a molecule.
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)


def count_descript(smils):
    """Evaluate all configured descriptors for one molecule.

    Args:
        smils: Object passed unchanged to ``calculator.CalcDescriptors`` (the
            caller in this notebook passes "RDKit_Mol_Class" entries).

    Returns:
        A one-row ``pandas.DataFrame`` with one positional (integer-labelled)
        column per value returned by the module-level ``calculator``.
    """
    # Build the single-row frame directly: DataFrame.append, used previously,
    # no longer exists in pandas 2.x.
    return pd.DataFrame([calculator.CalcDescriptors(smils)])

#only_name_conf["my"]=only_name_conf["RDKit_Mol_Class"].apply(count_descript)
#only_name_conf.to_csv(myname,mode='a',index =False)
# Build one table with a row per molecule and a column per descriptor.
# Applying a DataFrame-returning function through Series.apply instead gives a
# Series of DataFrame objects, which to_csv can only export as text reprs.
my = pd.DataFrame(
    [calculator.CalcDescriptors(mol) for mol in only_name_conf["RDKit_Mol_Class"]],
    columns=descriptors_names,
)
# The file already starts with the descriptor-name header written when it was
# created earlier in this cell, so append the data rows without a second header.
my.to_csv(myname, mode='a', index=False, header=False)

In [38]:
# Disabled code: everything between the triple quotes is a string literal and
# is never executed. It repeats the header-writing, calculator setup and
# count_descript definition from the In [4] cell, plus a commented-out sketch
# for joining an index column onto the descriptor table.
'''
# from rdkit import DataStructs
#from rdkit.Chem import MACCSkeys
with open(myname,'w')as f:
    data=csv.writer(f)
    data.writerow(descriptors_names[:len(descriptors_names)])

calculator = MoleculeDescriptors.MolecularDescriptorCalculator( descriptors_names )


def count_descript(smils):
    smilsChromoChem = smils
    descriptors = pd.DataFrame()
    #descriptors = descriptors.append([calculator.CalcDescriptors(Chem.MolFromSmiles('CC[NH+]1CN(C)C=C1'))])
    descriptors = descriptors.append([calculator.CalcDescriptors(smilsChromoChem)])
    return descriptors

#index_list = list(map(str,list(range(len(database_smiles["SMILES"])))))
#y = pd.DataFrame(index_list)
#y.index = database_smiles["SMILES"]
#y.columns = ["index"]
#dataset = pd.concat([y, descriptors], axis=1)
'''


In [6]:
import csv
from rdkit.ML.Descriptors import MoleculeDescriptors

database_smiles = pd.read_csv("new_data_smiles.csv")

# Names of all RDKit descriptors (first element of each Descriptors._descList entry).
descriptors_names = [entry[0] for entry in Descriptors._descList]
# Build the calculator in this cell, as the In [7] and In [11] cells do,
# instead of relying on a `calculator` variable left behind by an earlier cell.
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)
myname = 'new_data5.csv'

# Parse the SMILES strings, then evaluate every descriptor for every molecule.
database_smiles["RDKit_Mol_Class"] = database_smiles["SMILES"].apply(Chem.MolFromSmiles)
descriptors = pd.DataFrame(
    [calculator.CalcDescriptors(mol) for mol in database_smiles["RDKit_Mol_Class"]]
)
descriptors.columns = descriptors_names
# Rows are labelled by the SMILES string they were computed from.
descriptors.index = database_smiles["SMILES"]
# Default write mode: this replaces any existing new_data5.csv.
descriptors.to_csv(myname)

In [7]:
import csv
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np

database_smiles = pd.read_csv("new_data_smiles.csv")

# Names of all RDKit descriptors (first element of each Descriptors._descList entry).
descriptors_names = [entry[0] for entry in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)

# Parse the SMILES strings, then evaluate every descriptor for every molecule.
database_smiles["RDKit_Mol_Class"] = database_smiles["SMILES"].apply(Chem.MolFromSmiles)
descriptors = pd.DataFrame(
    [calculator.CalcDescriptors(mol) for mol in database_smiles["RDKit_Mol_Class"]]
)
descriptors.columns = descriptors_names
descriptors.index = database_smiles["SMILES"]

# Positions of the columns whose value is 0 for every molecule; argwhere
# returns them as a (k, 1) array.
idx = np.argwhere(np.all(descriptors.to_numpy() == 0, axis=0))
# Drop those columns in one step by position. This replaces a loop that
# recovered each index by eval()-ing the printed form of a one-element array.
descriptors = descriptors.drop(columns=descriptors.columns[idx.ravel()])

descriptors.to_csv('new_data6.csv')

In [11]:
import csv
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np

database_smiles = pd.read_csv("new_data_smiles.csv")

# Names of all RDKit descriptors (first element of each Descriptors._descList entry).
descriptors_names = [entry[0] for entry in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)

# Parse the SMILES strings, then evaluate every descriptor for every molecule.
database_smiles["RDKit_Mol_Class"] = database_smiles["SMILES"].apply(Chem.MolFromSmiles)
descriptors = pd.DataFrame(
    [calculator.CalcDescriptors(mol) for mol in database_smiles["RDKit_Mol_Class"]]
)
descriptors.columns = descriptors_names
descriptors.index = database_smiles["SMILES"]

# Positions of the columns whose value is 1 for every molecule; argwhere
# returns them as a (k, 1) array.
idx = np.argwhere(np.all(descriptors.to_numpy() == 1, axis=0))
# Drop those columns in one step by position. This replaces a loop that
# recovered each index by eval()-ing the printed form of a one-element array.
# NOTE(review): the table is recomputed from scratch here, so all-zero columns
# are still present in this output; confirm that is intended.
descriptors = descriptors.drop(columns=descriptors.columns[idx.ravel()])

descriptors.to_csv('new_data7.csv')