In [3]:
%%time
import pandas as pd

# Read both splits in one pass; each CSV is expected to carry a 'smiles' column
# that the later cells consume.
train, test = (pd.read_csv(csv_path) for csv_path in ('o3f.train.csv', 'o3f.test.csv'))
print(len(train), len(test))

907213 226803
CPU times: user 5.49 s, sys: 860 ms, total: 6.35 s
Wall time: 7.37 s


In [4]:
%%time
from multiprocessing import pool
from rdkit.Chem import AllChem as Chem

# Parse the first 4096 training SMILES into RDKit molecules using four worker
# threads; the cell is timed to gauge per-molecule parsing cost.
head_smiles = train['smiles'][:4096]
with pool.ThreadPool(4) as workers:
    mols = workers.map(Chem.MolFromSmiles, head_smiles)
print(mols[:3])

[<rdkit.Chem.rdchem.Mol object at 0x7f1d464f0d00>, <rdkit.Chem.rdchem.Mol object at 0x7f1d464f0cb0>, <rdkit.Chem.rdchem.Mol object at 0x7f1d464f0da0>]
CPU times: user 950 ms, sys: 28.9 ms, total: 978 ms
Wall time: 1.04 s


In [6]:
# Scratch cell: a one-entry dict (same value as dict(a=2)); safe to delete.
{'a': 2}

{'a': 2}

In [15]:
# Every name in the Chem module namespace that contains 'Calc', in definition order.
[name for name in vars(Chem) if 'Calc' in name]

['CalcRMS',
 'CalcCrippenDescriptors',
 'CalcLabuteASA',
 'CalcTPSA',
 'CalcExactMolWt',
 'CalcMolFormula',
 'CalcNumLipinskiHBD',
 'CalcNumLipinskiHBA',
 'CalcNumHBD',
 'CalcNumHBA',
 'CalcNumRotatableBonds',
 'CalcNumRings',
 'CalcNumAromaticRings',
 'CalcNumSaturatedRings',
 'CalcNumHeterocycles',
 'CalcNumAromaticHeterocycles',
 'CalcNumAromaticCarbocycles',
 'CalcNumSaturatedHeterocycles',
 'CalcNumSaturatedCarbocycles',
 'CalcNumAliphaticRings',
 'CalcNumAliphaticHeterocycles',
 'CalcNumAliphaticCarbocycles',
 'CalcNumHeteroatoms',
 'CalcNumAmideBonds',
 'CalcFractionCSP3',
 'CalcChiNv',
 'CalcChi0v',
 'CalcChi1v',
 'CalcChi2v',
 'CalcChi3v',
 'CalcChi4v',
 'CalcChiNn',
 'CalcChi0n',
 'CalcChi1n',
 'CalcChi2n',
 'CalcChi3n',
 'CalcChi4n',
 'CalcHallKierAlpha',
 'CalcKappa1',
 'CalcKappa2',
 'CalcKappa3',
 'CalcNumSpiroAtoms',
 'CalcNumBridgeheadAtoms',
 'CalcNumAtomStereoCenters',
 'CalcNumUnspecifiedAtomStereoCenters',
 'CalcCoulombMat',
 'CalcEEMcharges',
 'CalcWHIM',
 'CalcGET

In [13]:
# SMILES string stored under row label 0 of the training table, kept as a
# handy single example for experimenting.
s0 = train.loc[0, 'smiles']
s0

'C[C@]1(CS(=O)(=O)[C@@](C)(C2CC2)C(N)=N1)c1cc(Nc2ncnc3cc(Br)cnc23)ncc1F'

In [None]:
%%time

# Names of the `Calc*` functions that ft_smiles looks up on `Chem` and
# evaluates for every molecule. One name per line so additions diff cleanly.
fns = [
    # mass and hydrogen-bond donor/acceptor counts
    'CalcExactMolWt',
    'CalcNumLipinskiHBD',
    'CalcNumLipinskiHBA',
    'CalcNumHBD',
    'CalcNumHBA',
    # rotatable bonds and ring counts
    'CalcNumRotatableBonds',
    'CalcNumRings',
    'CalcNumAromaticRings',
    'CalcNumSaturatedRings',
    'CalcNumHeterocycles',
    'CalcNumAromaticHeterocycles',
    'CalcNumAromaticCarbocycles',
    'CalcNumSaturatedHeterocycles',
    'CalcNumSaturatedCarbocycles',
    'CalcNumAliphaticRings',
    'CalcNumAliphaticHeterocycles',
    'CalcNumAliphaticCarbocycles',
    # heteroatoms, amide bonds, sp3 fraction
    'CalcNumHeteroatoms',
    'CalcNumAmideBonds',
    'CalcFractionCSP3',
    # Chi*v descriptors
    'CalcChi1v',
    'CalcChi2v',
    'CalcChi3v',
    'CalcChi4v',
]

def ft_smiles(smiles):
    """Compute the descriptors listed in ``fns`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES text of a single molecule.

    Returns
    -------
    dict
        Maps each name in ``fns`` to the value returned by the ``Chem``
        function of that name, evaluated on the molecule after ``Chem.AddHs``.
        A descriptor that raises is left out of the dict. An empty dict is
        returned when ``smiles`` is not a string (e.g. NaN from a blank CSV
        cell) or cannot be parsed, so the row becomes all-NaN once the dicts
        are collected into a DataFrame instead of aborting the whole run.
    """
    if not isinstance(smiles, str):
        return {}

    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles reports an unparsable SMILES by returning None;
        # passing that on to AddHs would raise and kill the surrounding apply.
        return {}

    mh = Chem.AddHs(m)
    d = {}
    for name in fns:
        try:
            d[name] = getattr(Chem, name)(mh)
        except Exception:
            # Best effort: skip a descriptor that is missing or fails for this
            # molecule. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt working during the long-running apply.
            continue
    return d

def _props_frame(frame):
    """Build the descriptor table for every SMILES in ``frame``.

    Applies ``ft_smiles`` to ``frame['smiles']``, expands the resulting dicts
    into columns and appends the source SMILES as a ``'smiles'`` column.
    """
    records = frame['smiles'].apply(ft_smiles).to_list()
    # Build on the source index so the 'smiles' assignment below lines up
    # row-for-row even if ``frame`` was filtered or shuffled beforehand; a
    # fresh RangeIndex would silently misalign in that case.
    props = pd.DataFrame(records, index=frame.index)
    props['smiles'] = frame['smiles']
    return props


# Same pipeline for both splits; the (smaller) test split is written first.
test_props = _props_frame(test)
test_props.to_csv('test_props.csv', index=False)

train_props = _props_frame(train)
train_props.to_csv('train_props.csv', index=False)