# Basic imports

In [1]:
import pandas as pd
import numpy as np

# Loading datasets

In [2]:
cns_smiles_df=pd.read_csv('cns_smiles.txt',sep='\t')
non_cns_smiles_df=pd.read_csv('non_cns_smiles.txt',sep='\t')

In [3]:
cns_smiles_df.head()

Unnamed: 0,Name,Smiles
0,ACETAZOLAMIDE,CC(=O)Nc1nnc(s1)S(=O)(=O)N
1,ACETOPHENAZINE,CC(=O)c1ccc2c(c1)N(c3ccccc3S2)CCCN4CCN(CC4)CCO
2,ALFENTANIL,CCC(=O)N(c1ccccc1)C2(CCN(CC2)CCn3c(=O)n(nn3)CC...
3,ALPRAZOLAM,Cc1nnc2n1-c3ccc(cc3C(=NC2)c4ccccc4)Cl
4,AMANTADINE,C1C2CC3CC1CC(C2)(C3)N


In [4]:
non_cns_smiles_df.head()

Unnamed: 0,Name,Smiles
0,ABACAVIR_SULFATE,Nc1nc(NC2CC2)c3ncn(C4CC(CO)C=C4)c3n1
1,ACARBOSE,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...
2,ACEBUTOLOL,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
3,ACECAINIDE,CCN(CC)CCNC(=O)c1ccc(NC(=O)C)cc1
4,ACECLOFENAC,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl


# Empty dataframe

In [5]:
labels=["m_name","n_atoms_without_Hydrogen","n_atoms_with_Hydrogen","m_weight","m_avg_weigth","m_weigth_without_Hydrogen","n_valence_electrons",
        "n_aliphatic_carbocycles","n_aliphatic_heterocycles","n_aliphatic_rings","n_amide_bonds",
        "n_aromatic_carbocycles","n_aromatic_heterocycles","n_aromatic_rings",
        "n_saturated_carbocycles","n_saturated_heterocycles","n_saturated_rings","n_HBA","n_HBD",
        "n_hetero_atoms","n_hetero_cycles","n_rings","n_strict_rotable_bonds","n_non_strict_rotable_bonds",
         "n_primary_carbon_atoms","n_HOH","n_O",
        "n_briged_head_atoms","n_atoms_stereo_centers","n_atoms_unspecified_stereo_centers","n_spiro_atoms",
        "m_logp","m_mr","fraction_CSP3"]

# rdkit imports

In [6]:
from IPython.display import SVG
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors



# Getting all the features

#### n means number, m means molecule

In [7]:
def getFeatures(m):
    
    rslt=[] # rslt list initialization
        
    def getCarbonAtoms(m): #sub method  
        patt=Chem.MolFromSmarts("[#6]")
        n_primary_carbon_atoms = len(m.GetSubstructMatches(patt))
    
        return n_primary_carbon_atoms    
    
    if m is not None:
        n_atoms_without_Hydrogen=m.GetNumAtoms()
        m2=Chem.AddHs(m)
        n_atoms_with_Hydrogen=m2.GetNumAtoms()

        m_weigth=Chem.Descriptors.ExactMolWt(m)
        m_avg_weigth=Chem.Descriptors.HeavyAtomMolWt(m)
        m_weigth_without_Hydrogen=Chem.Descriptors.MolWt(m)

        n_valence_electrons=Chem.Descriptors.NumValenceElectrons(m)

        n_aliphatic_carbocycles=Chem.rdMolDescriptors.CalcNumAliphaticCarbocycles(m)
        n_aliphatic_heterocycles=Chem.rdMolDescriptors.CalcNumAliphaticHeterocycles(m)
        n_aliphatic_rings=Chem.rdMolDescriptors.CalcNumAliphaticRings(m)    
        n_amide_bonds=Chem.rdMolDescriptors.CalcNumAmideBonds(m)

        n_aromatic_carbocycles=Chem.rdMolDescriptors.CalcNumAromaticCarbocycles(m)
        n_aromatic_heterocycles=Chem.rdMolDescriptors.CalcNumAromaticHeterocycles(m)
        n_aromatic_rings=Chem.rdMolDescriptors.CalcNumAromaticRings(m)

        n_saturated_carbocycles=Chem.rdMolDescriptors.CalcNumSaturatedCarbocycles(m)
        n_saturated_heterocycles=Chem.rdMolDescriptors.CalcNumSaturatedHeterocycles(m)
        n_saturated_rings=Chem.rdMolDescriptors.CalcNumSaturatedRings(m)

        n_HBA=Chem.rdMolDescriptors.CalcNumHBA(m)
        n_HBD=Chem.rdMolDescriptors.CalcNumHBD(m)

        n_hetero_atoms=Chem.rdMolDescriptors.CalcNumHeteroatoms(m)
        n_hetero_cycles=Chem.rdMolDescriptors.CalcNumHeterocycles(m)
        n_rings=Chem.rdMolDescriptors.CalcNumRings(m)
        n_strict_rotable_bonds=Chem.rdMolDescriptors.CalcNumRotatableBonds(m,True)
        n_non_strict_rotable_bonds=Chem.rdMolDescriptors.CalcNumRotatableBonds(m,False)
        
        n_primary_carbon_atoms=getCarbonAtoms(m)
        n_HOH=Chem.Descriptors.NHOHCount(m)
        n_O=Chem.Descriptors.NOCount(m)
        n_briged_head_atoms=Chem.rdMolDescriptors.CalcNumBridgeheadAtoms(m,None)
        n_atoms_stereo_centers=Chem.rdMolDescriptors.CalcNumAtomStereoCenters(m)
        n_atoms_unspecified_stereo_centers=Chem.rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(m)
        n_spiro_atoms=Chem.rdMolDescriptors.CalcNumSpiroAtoms(m)
        
        m_logp=Chem.Descriptors.MolLogP(m)
        m_mr=Chem.Descriptors.MolMR(m)
        fraction_CSP3=Chem.rdMolDescriptors.CalcFractionCSP3(m)
        
    rslt=[n_atoms_without_Hydrogen,n_atoms_with_Hydrogen,m_weigth,m_avg_weigth,m_weigth_without_Hydrogen, n_valence_electrons,           
           n_aliphatic_carbocycles,n_aliphatic_heterocycles,n_aliphatic_rings,n_amide_bonds,
          n_aromatic_carbocycles,n_aromatic_heterocycles,n_aromatic_rings,
          n_saturated_carbocycles,n_saturated_heterocycles,n_saturated_rings,n_HBA,n_HBD,
         n_hetero_atoms,n_hetero_cycles,n_rings,n_strict_rotable_bonds,n_non_strict_rotable_bonds,
         n_primary_carbon_atoms,n_HOH,n_O,
          n_briged_head_atoms,n_atoms_stereo_centers,n_atoms_unspecified_stereo_centers,n_spiro_atoms,
          m_logp,m_mr,fraction_CSP3]
    
    return rslt 

In [8]:
def getDataList(df):
    data_list=[]
    
    for m_name,m_smile in zip(df["Name"].values,df["Smiles"].values):
        m = Chem.MolFromSmiles(m_smile)                        
        data=getFeatures(m)
        data.insert(0,m_name)
        data_list.append(data)
        
    return data_list

### Data as a list

In [9]:
cns_data=getDataList(cns_smiles_df)
non_cns_data=getDataList(non_cns_smiles_df)

### This is how data is storaged in our list

In [10]:
sub_lst=cns_data[0]
for l,i in zip(labels,sub_lst):
    print("{}: {}".format(l,i))

m_name: ACETAZOLAMIDE
n_atoms_without_Hydrogen: 13
n_atoms_with_Hydrogen: 19
m_weight: 221.988132052
m_avg_weigth: 216.203
m_weigth_without_Hydrogen: 222.251
n_valence_electrons: 72
n_aliphatic_carbocycles: 0
n_aliphatic_heterocycles: 0
n_aliphatic_rings: 0
n_amide_bonds: 1
n_aromatic_carbocycles: 0
n_aromatic_heterocycles: 1
n_aromatic_rings: 1
n_saturated_carbocycles: 0
n_saturated_heterocycles: 0
n_saturated_rings: 0
n_HBA: 6
n_HBD: 2
n_hetero_atoms: 9
n_hetero_cycles: 1
n_rings: 1
n_strict_rotable_bonds: 2
n_non_strict_rotable_bonds: 3
n_primary_carbon_atoms: 4
n_HOH: 3
n_O: 7
n_briged_head_atoms: 0
n_atoms_stereo_centers: 0
n_atoms_unspecified_stereo_centers: 0
n_spiro_atoms: 0
m_logp: -0.8561000000000003
m_mr: 45.5859
fraction_CSP3: 0.25


### Creating new dataframe from scratch

In [11]:
def createDataFrameFromScracht(data,labels):
    df=pd.DataFrame(columns=labels)
    for row in data:
        new_row={}
        for cnt,x in enumerate(row):
            new_row[labels[cnt]]=x
        df=df.append(new_row,ignore_index=True)
    return df

In [12]:
cns_df=createDataFrameFromScracht(cns_data,labels)
non_cns_df=createDataFrameFromScracht(non_cns_data,labels)

# From dataframe to CSV

In [13]:
cns_df.to_csv('cns_molecules.csv', sep="\t", index = False, header=True)
non_cns_df.to_csv('non_cns_molecules.csv', sep="\t", index = False, header=True)