In [1]:
!pip install rdkit-pypi


Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected pa

In [2]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
import numpy as np
from mordred import Calculator, descriptors

In [4]:
# Load the compound table from CSV into `df`.
# NOTE(review): the path is absolute and looks Colab-specific — presumably the file has to be
# uploaded to /content before this cell runs; confirm.
df = pd.read_csv("/content/supernatural_filtered.csv")
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(999, 2)

In [5]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,smiles,id
0,Cc1cc(OC2OC(CO)C(O)C(O)C2O)c3C4=C(CCC4)C(=O)Oc3c1,SN0000001
1,COC(=O)CC(c1ccccc1)c2c(O)cc(O)c3C(=O)C=C(Oc23)...,SN0000002
2,COC(CO)CC1OC2C(NC(=O)C(O)C3(CC(=C)C(C)C(C)O3)O...,SN0000003
3,CC(C)=CC(=O)OC1C(=C)C2CCC3(O)C4(C)CCCC(C)(C4CC...,SN0000004
4,COc1ccc(CCNC(=O)C(C)N(c2ccccc2)[S](C)(=O)=O)cc1,SN0000005


In [6]:
# Canonicalise SMILES so that the same structure is always written as the same string.
def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for every input SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to canonicalise (for example a pandas Series).

    Returns
    -------
    list of str
        Canonical SMILES, in the same order and of the same length as the input.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the strings. ``Chem.MolFromSmiles``
        returns ``None`` in that case; without this check the failure would
        surface later as an opaque argument error inside ``Chem.MolToSmiles``.
    """
    canonical = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail loudly and say which entry is bad, so it can be fixed or filtered upstream.
            raise ValueError(f"Invalid SMILES at position {position}: {smi!r}")
        canonical.append(Chem.MolToSmiles(mol))
    return canonical

In [8]:
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole


In [10]:
# Canonicalise every SMILES in the table, then show how many strings came back.
Canon_SMILES = canonical_smiles(df['smiles'])
len(Canon_SMILES)

999

In [12]:
# Overwrite the `smiles` column in place with the canonical strings.
# NOTE(review): this mutates `df`, so the SMILES as originally read are no longer
# available in the frame after this cell runs.
df['smiles'] = Canon_SMILES
# Preview the updated frame.
df.head()

Unnamed: 0,smiles,id
0,Cc1cc(OC2OC(CO)C(O)C(O)C2O)c2c3c(c(=O)oc2c1)CCC3,SN0000001
1,COC(=O)CC(c1ccccc1)c1c(O)cc(O)c2c(=O)cc(-c3ccc...,SN0000002
2,C=C1CC(OC)(C(O)C(=O)NC2OCOC3C2OC(CC(CO)OC)C(C)...,SN0000003
3,C=C1C2CCC3(O)C4(C)CCCC(C)(C(=O)O)C4CCC3(C2)C1O...,SN0000004
4,COc1ccc(CCNC(=O)C(C)N(c2ccccc2)S(C)(=O)=O)cc1,SN0000005


In [13]:
# Collect the canonical SMILES of rows that repeat an earlier row and show how many there are.
# Nothing is removed from `df` here; this cell only counts.
duplicates_smiles = df.loc[df['smiles'].duplicated(), 'smiles'].values
len(duplicates_smiles)

0

In [14]:
def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    Mol_descriptors : list
        One entry per input molecule, in input order; each entry is whatever
        ``calc.CalcDescriptors`` returns for that molecule.
    desc_names : sequence of str
        Descriptor names as reported by the calculator, in the same order as
        the values inside each entry.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    Mol_descriptors = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; report which entry is bad
            # instead of crashing inside AddHs with an opaque argument error.
            raise ValueError(f"Invalid SMILES at position {position}: {smi!r}")
        # Descriptors are calculated on the molecule with explicit hydrogens added.
        mol = Chem.AddHs(mol)
        # Named `values` so the `descriptors` module imported from mordred is not shadowed.
        values = calc.CalcDescriptors(mol)
        Mol_descriptors.append(values)
    return Mol_descriptors, desc_names

# Compute the descriptor values and the matching descriptor names for every SMILES in `df`.
Mol_descriptors,desc_names = RDkit_descriptors(df['smiles'])

In [15]:
# Build the descriptor table: one row per molecule (same order as `df['smiles']`),
# one column per descriptor name.
# NOTE(review): the `id` and `smiles` columns are not carried over, so rows can only be
# matched back to `df` by position.
df_descriptors = pd.DataFrame(Mol_descriptors,columns=desc_names)
# Display the resulting frame.
df_descriptors


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.084338,-4.487184,13.084338,1.340220,0.539139,378.377,356.201,378.131468,146,0,...,0,0,0,0,0,0,0,0,0,0
1,13.872055,-4.291161,13.872055,0.925865,0.462888,416.429,396.269,416.125988,156,0,...,0,0,0,0,0,0,0,0,0,0
2,15.180764,-6.114271,15.180764,1.975490,0.375554,517.616,474.272,517.288697,208,0,...,0,0,0,0,0,0,0,0,0,0
3,14.237963,-6.193831,14.237963,2.260778,0.397652,416.558,380.270,416.256274,166,0,...,0,0,0,0,0,0,0,0,0,0
4,13.889533,-6.578433,13.889533,1.229849,0.766403,376.478,352.286,376.145678,140,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,13.531828,-4.551287,13.531828,1.705218,0.575945,264.321,244.161,264.136159,104,0,...,0,0,0,0,0,0,0,0,0,0
995,12.078290,-5.754140,12.078290,1.645100,0.064319,751.127,680.567,750.489311,296,0,...,0,0,0,0,0,0,0,0,0,0
996,16.101432,-6.156204,16.101432,1.404882,0.349787,523.626,486.330,523.257003,204,0,...,0,0,0,0,0,0,0,0,0,0
997,15.587262,-5.610125,15.587262,0.718349,0.206184,815.038,772.702,814.205930,288,0,...,0,0,0,0,0,2,0,0,0,0


In [16]:
# Mount Google Drive and write the descriptor table there as a CSV file.
from google.colab import drive
drive.mount('/content/drive')
# index=False keeps the DataFrame's row index out of the file, so only descriptor columns are written.
df_descriptors.to_csv('/content/drive/My Drive/moldescrp.csv', index=False)

Mounted at /content/drive


In [18]:
# Read the exported CSV back and check that the round trip kept every row and column
# before printing it for a visual check.
df_verify = pd.read_csv('/content/drive/My Drive/moldescrp.csv')
assert df_verify.shape == df_descriptors.shape, (
    f"exported CSV has shape {df_verify.shape}, expected {df_descriptors.shape}"
)
print(df_verify)

     MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0         13.084338       -4.487184          13.084338           1.340220   
1         13.872055       -4.291161          13.872055           0.925865   
2         15.180764       -6.114271          15.180764           1.975490   
3         14.237963       -6.193831          14.237963           2.260778   
4         13.889533       -6.578433          13.889533           1.229849   
..              ...             ...                ...                ...   
994       13.531828       -4.551287          13.531828           1.705218   
995       12.078290       -5.754140          12.078290           1.645100   
996       16.101432       -6.156204          16.101432           1.404882   
997       15.587262       -5.610125          15.587262           0.718349   
998       16.990889       -5.714573          16.990889           1.727442   

          qed    MolWt  HeavyAtomMolWt  ExactMolWt  NumValenceElectrons  \
