In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import Descriptors

In [2]:
# Load the sample molecule table from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="sampleMolecules.csv")

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)

Unnamed: 0,molregNoMin,molregNoCount,chemblSmiles,standardMin,standardMax,standardMean,publishedMain,publishedMax,publishedMean,canonicalSmiles,negativeCharged,positiveCharged,elementsSet,numberOfRings,organicChemistryElementsOnly,smilesLength,encodedSmiles,encodedSmilesPostprocessed
0,1522827,2,CNC(=S)N\N=C(\c1ccc(OC)c(OC)c1)/c2cccc(C)n2,6250.0,6250.0,6250.0,6.25,6.25,6.25,CNC(=S)NN=C(c1ccc(OC)c(OC)c1)c1cccc(C)n1,0.0,0.0,"['S', 'N', 'C', 'O']",2.0,True,40,CNC(=S)NN=C(c1ccc(OC)c(OC)c1)c1cccc(C)n1,CNC(=S)NN=C(c1ccc(OC)c(OC)c1)c1cccc(C)n1Q
1,612901,2,Cn1cc(C2=C(C(=O)NC2=O)c3cccc(NCC(O)CO)c3)c4cc(...,0.4,260.0,130.2,0.4,260.0,130.2,Cn1cc(C2=C(c3cccc(NCC(O)CO)c3)C(=O)NC2=O)c2cc(...,0.0,0.0,"['Cl', 'N', 'C', 'O']",4.0,True,54,Cn1cc(C2=C(c3cccc(NCC(O)CO)c3)C(=O)NC2=O)c2cc(...,Cn1cc(C2=C(c3cccc(NCC(O)CO)c3)C(=O)NC2=O)c2cc(...
2,171018,1,CC(C)c1cccc(C(C)C)c1N2Sc3ncccc3C2=O,12000.0,12000.0,12000.0,12.0,12.0,12.0,CC(C)c1cccc(C(C)C)c1-n1sc2ncccc2c1=O,0.0,0.0,"['S', 'N', 'C', 'O']",3.0,True,36,CC(C)c1cccc(C(C)C)c1-n1sc2ncccc2c1=O,CC(C)c1cccc(C(C)C)c1-n1sc2ncccc2c1=OQ
3,11414,4,O[C@@H](CNCCc1ccc(NS(=O)(=O)c2ccc(I)cc2)cc1)CO...,120.0,170.0,145.0,120.0,170.0,145.0,O=S(=O)(Nc1ccc(CCNCC(O)COc2ccc(O)cc2)cc1)c1ccc...,0.0,0.0,"['N', 'S', 'C', 'O', 'I']",3.0,True,52,O=S(=O)(Nc1ccc(CCNCC(O)COc2ccc(O)cc2)cc1)c1ccc...,O=S(=O)(Nc1ccc(CCNCC(O)COc2ccc(O)cc2)cc1)c1ccc...
4,135826,1,C\C(=C/C\C(=C\CCC1=CC(=O)OC1O)\C)\CCC2=C(C)CCC...,990000.0,990000.0,990000.0,990.0,990.0,990.0,CC(=CCCC1=CC(=O)OC1O)CC=C(C)CCC1=C(C)CCCC1(C)C,0.0,0.0,"['C', 'O']",2.0,True,46,CC(=CCCC1=CC(=O)OC1O)CC=C(C)CCC1=C(C)CCCC1(C)C,CC(=CCCC1=CC(=O)OC1O)CC=C(C)CCC1=C(C)CCCC1(C)CQ


In [4]:
# `data` is already the DataFrame returned by pd.read_csv; this wraps it in a
# second DataFrame object.
# NOTE(review): `df` is not referenced in the cells visible here -- confirm it
# is still needed before keeping this cell.
df = pd.DataFrame(data=data)

* The content of interest is in the column `canonicalSmiles`.
* Let's extract it into a separate array:

In [5]:
# Pull the canonical SMILES column out of the table as a plain array of values.
smilesCodes = data.loc[:, 'canonicalSmiles'].values

In [6]:
# Sanity check: show the first extracted SMILES string.
smilesCodes[0]

'CNC(=S)NN=C(c1ccc(OC)c(OC)c1)c1cccc(C)n1'

* Define the function that computes the descriptors for a single SMILES code:

In [7]:
def calculateDescriptors(smilesCode):
    """Compute basic RDKit descriptors for a single molecule.

    Parameters
    ----------
    smilesCode : str
        SMILES representation of the molecule.

    Returns
    -------
    tuple
        ``(molecular weight, number of valence electrons)`` as produced by
        ``Descriptors.MolWt`` and ``Descriptors.NumValenceElectrons``.

    Raises
    ------
    TypeError
        If ``smilesCode`` is not a string (for example a NaN coming from an
        empty CSV cell).
    ValueError
        If RDKit cannot parse ``smilesCode`` into a molecule.
    """
    # Reject non-string input up front so the error names the offending value
    # instead of surfacing as an opaque argument error from the RDKit bindings.
    if not isinstance(smilesCode, str):
        raise TypeError(
            "smilesCode must be a string, got %s: %r"
            % (type(smilesCode).__name__, smilesCode)
        )

    m = Chem.MolFromSmiles(smilesCode)
    # MolFromSmiles reports a parse failure by returning None, not by raising;
    # without this check the descriptor calls below fail with an unrelated error.
    if m is None:
        raise ValueError("Could not parse SMILES code: %r" % (smilesCode,))

    return Descriptors.MolWt(m), Descriptors.NumValenceElectrons(m)

* Example application of the function defined above:

In [8]:
# Try the function on the first SMILES code; the cell displays the returned tuple.
calculateDescriptors(smilesCodes[0])

(344.44000000000005, 126)

* It works properly: it returns the molecular weight and the number of valence electrons for the given SMILES code.

* Let's apply the function to all the SMILES codes:

In [9]:
# Build one row per molecule: the SMILES code followed by the two descriptor
# values returned by calculateDescriptors (molecular weight, valence electrons).
results = [
    [smiles, *calculateDescriptors(smiles)]
    for smiles in smilesCodes
]

* We have collected all the results in the list `results`.

In [10]:
# Turn the list of result rows into a table (one row per molecule).
resultsDF = pd.DataFrame(data=results)

In [11]:
# Label the three columns in the same order the rows were built.
# NOTE(review): 'moleculaWeight' and 'numberOfValEceltrons' look like typos;
# they are left unchanged here because other code may refer to these labels.
resultsDF.columns = [
    'smilesCode',
    'moleculaWeight',
    'numberOfValEceltrons',
]

* And the final results are shown here:

In [12]:
# Display the assembled results table.
resultsDF

Unnamed: 0,smilesCode,moleculaWeight,numberOfValEceltrons
0,CNC(=S)NN=C(c1ccc(OC)c(OC)c1)c1cccc(C)n1,344.44,126
1,Cn1cc(C2=C(c3cccc(NCC(O)CO)c3)C(=O)NC2=O)c2cc(...,425.872,154
2,CC(C)c1cccc(C(C)C)c1-n1sc2ncccc2c1=O,312.438,114
3,O=S(=O)(Nc1ccc(CCNCC(O)COc2ccc(O)cc2)cc1)c1ccc...,568.433,170
4,CC(=CCCC1=CC(=O)OC1O)CC=C(C)CCC1=C(C)CCCC1(C)C,372.549,150
5,CNC(=O)C1(C)CCC2(C)CCC3(C)C4=CC=C5C(=CC(=O)C(O...,463.662,184
6,C#Cc1cncc(-c2ccsc2)c1,185.251,62
7,C=C(C)C(O)C(O)CC(C)C1=C2C=CC3C4(C)CCC(=O)C(C)(...,468.678,188
8,CNCCCOc1c(Br)cc(-c2c[nH]c(=O)c(Cc3c[nH]c4ccccc...,546.263,160
