In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/training_smiles.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
1,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0
2,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0
3,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0
4,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0
5,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0


In [4]:
len(df)

156258

In [5]:
test = pd.read_csv('../data/test_smiles.csv', index_col=0)

In [6]:
len(test)

52086

In [7]:
test.head()

Unnamed: 0_level_0,SMILES
INDEX,Unnamed: 1_level_1
156259,COCCCNc1ncnc2c1cnn2-c1ccc(C)cc1C
156260,Cc1cccc(Nc2nnc(SCC(=O)NCc3cccs3)s2)c1C
156261,O=C1/C(=C/c2cccnc2)CC/C1=C\c1cccnc1
156262,CC(C)(C)NC(=O)COC(=O)c1ccc(NC(=O)CC#N)cc1
156263,O=C(/C=C/c1ccco1)Nc1ccc(Cl)c(S(=O)(=O)N2CCOCC2)c1


In [75]:
df['ACTIVE'].value_counts()

0.0    154528
1.0      1730
Name: ACTIVE, dtype: int64

In [8]:
from rdkit import Chem
import rdkit.Chem.rdMolDescriptors as d

In [30]:
def MolFromSmiles(smile):
    try:
        return Chem.MolFromSmiles(smile)
    except:
        return np.nan

def GetNumAtoms(mol) -> int:
    try:
        return mol.GetNumAtoms()
    except:
        return np.nan

def ExactMolWt(mol) ->float:
    try:
        return d.CalcExactMolWt(mol)
    except:
        return np.nan

def CalcAsphericity(mol) -> float:
    try:
        return d.CalcAsphericity(mol)
    except:
        return np.nan

def CalcChi0n(mol) -> float:
    try:
        return d.CalcChi0n(mol)
    except:
        return np.nan

def CalcChi0v(mol) -> float:
    try:
        return d.CalcChi0v(mol)
    except:
        return np.nan

def CalcChi1n(mol) -> float:
    try:
        return d.CalcChi1n(mol)
    except:
        return np.nan

def CalcChi1v(mol) -> float:
    try:
        return d.CalcChi1v(mol)
    except:
        return np.nan

def CalcChi2n(mol) -> float:
    try:
        return d.CalcChi2n(mol)
    except:
        return np.nan

def CalcChi2v(mol) -> float:
    try:
        return d.CalcChi2v(mol)
    except:
        return np.nan

def CalcChi3n(mol) -> float:
    try:
        return d.CalcChi3n(mol)
    except:
        return np.nan

def CalcChi3v(mol) -> float:
    try:
        return d.CalcChi3v(mol)
    except:
        return np.nan


def CalcChi4n(mol) -> float:
    try:
        return d.CalcChi4n(mol)
    except:
        return np.nan


def CalcChi4v(mol) -> float:
    try:
        return d.CalcChi4v(mol)
    except:
        return np.nan


def CalcEccentricity(mol) -> float:
    try:
        return d.CalcEccentricity(mol)
    except:
        return np.nan


def CalcFractionCSP3(mol) -> float:
    try:
        return d.CalcFractionCSP3(mol)
    except:
        return np.nan


def CalcHallKierAlpha(mol) -> float:
    try:
        return d.CalcHallKierAlpha(mol)
    except:
        return np.nan

def CalcInertialShapeFactor(mol) -> float:
    try:
        return d.CalcInertialShapeFactor(mol)
    except:
        return np.nan

def CalcKappa1(mol) -> float:
    try:
        return d.CalcKappa1(mol)
    except:
        return np.nan

def CalcKappa2(mol) -> float:
    try:
        return d.CalcKappa2(mol)
    except:
        return np.nan

def CalcKappa3(mol) -> float:
    try:
        return d.CalcKappa3(mol)
    except:
        return np.nan

def CalcLabuteASA(mol) -> float:
    try:
        return d.CalcLabuteASA(mol)
    except:
        return np.nan


def CalcMORSE(mol) -> float:
    try:
        return d.CalcMORSE(mol)
    except:
        return np.nan

def CalcMolFormula(mol) -> str:
    try:
        return d.CalcMolFormula(mol)
    except:
        return np.nan

def CalcNPR1(mol) -> float:
    try:
        return d.CalcNPR1(mol)
    except:
        return np.nan

def CalcNPR2(mol) -> float:
    try:
        return d.CalcNPR2(mol)
    except:
        return np.nan


def CalcNumAliphaticCarbocycles(mol) -> int:
    try:
        return d.CalcNumAliphaticCarbocycles(mol)
    except:
        return np.nan

def CalcNumAliphaticHeterocycles(mol) -> int:
    try:
        return d.CalcNumAliphaticHeterocycles(mol)
    except:
        return np.nan

def CalcNumAliphaticRings(mol) -> int:
    try:
        return d.CalcNumAliphaticRings(mol)
    except:
        return np.nan

def CalcNumAmideBonds(mol) -> int:
    try:
        return d.CalcNumAmideBonds(mol)
    except:
        return np.nan

def CalcNumAromaticCarbocycles(mol) -> int:
    try:
        return d.CalcNumAromaticCarbocycles(mol)
    except:
        return np.nan

def CalcNumAromaticHeterocycles(mol) -> int:
    try:
        return d.CalcNumAromaticHeterocycles(mol)
    except: 
        return np.nan

def CalcNumAromaticRings(mol) -> int:
    try:
        return d.CalcNumAromaticRings(mol)
    except:
        return np.nan
def CalcNumAtomStereoCenters(mol) -> int:
    try:
        return d.CalcNumAtomStereoCenters(mol)
    except:
        return np.nan

def CalcNumAtoms(mol) -> int:
    try:
        return d.CalcNumAtoms(mol)
    except:
        return np.nan

def CalcNumBridgeheadAtoms(mol) -> int:
    try:
        return d.CalcNumBridgeheadAtoms(mol)
    except:
        return np.nan

def CalcNumHBA(mol) -> int:
    try:
        return d.CalcNumHBA(mol)
    except:
        return np.nan

def CalcNumHBD(mol) -> int:
    try:
        return d.CalcNumHBD(mol)
    except:
        return np.nan


def CalcNumHeavyAtoms(mol) -> int:
    try:
        return d.CalcNumHeavyAtoms(mol)
    except:
        return np.nan

def CalcNumHeteroatoms(mol) -> int:
    try:
        return d.CalcNumHeteroatoms(mol)
    except:
        return np.nan


def CalcNumHeterocycles(mol) -> int:
    try:
        return d.CalcNumHeterocycles(mol)
    except:
        return np.nan


def CalcNumLipinskiHBA(mol) -> int:
    try:
        return d.CalcNumLipinskiHBA(mol)
    except:
        return np.nan

def CalcNumLipinskiHBD(mol) -> int:
    try:
        return d.CalcNumLipinskiHBD(mol)
    except:
        return np.nan


def CalcNumRings(mol) -> int:
    try:
        return d.CalcNumRings(mol)
    except:
        return np.nan

def CalcNumRotatableBonds(mol) -> int:
    try:
        return d.CalcNumRotatableBonds(mol)
    except:
        return np.nan


def CalcNumSaturatedCarbocycles(mol) -> int:
    try:
        return d.CalcNumSaturatedCarbocycles(mol)
    except:
        return np.nan

def CalcNumSaturatedHeterocycles(mol) -> int:
    try:
        return d.CalcNumSaturatedHeterocycles(mol)
    except:
        return np.nan

def CalcNumSaturatedRings(mol) -> int:
    try:
        return d.CalcNumSaturatedRings(mol)
    except:
        return np.nan


def CalcNumSpiroAtoms(mol) -> int:
    try:
        return d.CalcNumSpiroAtoms(mol)
    except:
        return np.nan

def CalcNumUnspecifiedAtomStereoCenters(mol) -> int:
    try:
        return d.CalcNumUnspecifiedAtomStereoCenters(mol)
    except:
        return np.nan


def CalcPBF(mol) -> float:
    try:
        return d.CalcPBF(mol)
    except:
        return np.nan


def CalcPMI1(mol) -> float:
    try:
        return d.CalcPMI1(mol)
    except:
        return np.nan

def CalcPMI2(mol) -> float:
    try:
        return d.CalcPMI2(mol)
    except:
        return np.nan

def CalcPMI3(mol) -> float:
    try:
        return d.CalcPMI3(mol)
    except:
        return np.nan


def CalcPhi(mol) -> float:
    try:
        return d.CalcPhi(mol)
    except:
        return np.nan

def CalcRDF(mol) -> float:
    try:
        return d.CalcRDF(mol)
    except:
        return np.nan

def CalcRadiusOfGyration(mol) -> float:
    try:
        return d.CalcRadiusOfGyration(mol)
    except:
        return np.nan

def CalcSpherocityIndex(mol) -> float:
    try:
        return d.CalcSpherocityIndex(mol)
    except:
        return np.nan

def CalcTPSA(mol) -> float:
    try:
        return d.CalcTPSA(mol)
    except:
        return np.nan

def CalcWHIM(mol) -> float:
    try:
        return d.CalcWHIM(mol)
    except:
        return np.nan

In [78]:
basic_features = [ 
    GetNumAtoms,
    ExactMolWt,
    CalcAsphericity,
    CalcChi0n,
    CalcChi0v,
    CalcChi1n,
    CalcChi1v,
    CalcChi2n,
    CalcChi2v,
    CalcChi3n,
    CalcChi3v,
    CalcChi4n,
    CalcChi4v,
    CalcEccentricity,
    CalcFractionCSP3,
    CalcHallKierAlpha,
    CalcInertialShapeFactor,
    CalcKappa1,
    CalcKappa2,
    CalcKappa3,
    CalcLabuteASA,
    CalcMORSE,
    CalcMolFormula,
    CalcNPR1,
    CalcNPR2,
    CalcNumAliphaticCarbocycles,
    CalcNumAliphaticHeterocycles,
    CalcNumAliphaticRings,
    CalcNumAmideBonds,
    CalcNumAromaticCarbocycles,
    CalcNumAromaticHeterocycles,
    CalcNumAromaticRings,
    CalcNumAtomStereoCenters,
    CalcNumAtoms,
    CalcNumBridgeheadAtoms,
    CalcNumHBA,
    CalcNumHBD,
    CalcNumHeavyAtoms,
    CalcNumHeteroatoms,
    CalcNumHeterocycles,
    CalcNumLipinskiHBA,
    CalcNumLipinskiHBD,
    CalcNumRings,
    CalcNumRotatableBonds,
    CalcNumSaturatedCarbocycles,
    CalcNumSaturatedHeterocycles,
    CalcNumSaturatedRings,
    CalcNumSpiroAtoms,
    CalcNumUnspecifiedAtomStereoCenters,
    CalcPBF,
    CalcPMI1,
    CalcPMI2,
    CalcPMI3,
    CalcPhi,
    CalcRDF,
    CalcRadiusOfGyration,
    CalcSpherocityIndex,
    CalcTPSA,
    CalcWHIM,
]

In [81]:
df_head = df.head().copy()

In [80]:
for feature in basic_features:
    print(feature.__name__)
    df_head[feature.__name__] = df_head['MolFromSmiles'].apply(feature)

GetNumAtoms


KeyError: 'MolFromSmiles'

In [38]:
MolFromSmiles.__name__

'MolFromSmiles'

In [31]:
df[MolFromSmiles.__name__] = df['SMILES'].apply(MolFromSmiles)



In [163]:
def BCUT2D(mol):
    try:
        return d.BCUT2D(mol[0])
    except:
        return [np.nan]*8

In [170]:
def CalcAUTOCORR2D(mol):
    try:
        return d.CalcAUTOCORR2D(mol[0])
    except:
        return [np.nan]*192

In [173]:
def CalcAUTOCORR3D(mol):
    try:
        return d.CalcAUTOCORR3D(mol[0])
    except:
        return np.nan

In [176]:
def CalcCoulombMat(mol):
    try:
        return d.CalcCoulombMat(mol[0])
    except:
        return np.nan

In [184]:
def CalcEEMcharges(mol):
    try:
        return d.CalcEEMcharges(mol[0])
    except:
        return np.nan

In [160]:
(df['MolFromSmiles'].to_frame())

Unnamed: 0_level_0,MolFromSmiles
INDEX,Unnamed: 1_level_1
1,<rdkit.Chem.rdchem.Mol object at 0x0000024BAAB...
2,<rdkit.Chem.rdchem.Mol object at 0x0000024BAAB...
3,<rdkit.Chem.rdchem.Mol object at 0x0000024C654...
4,<rdkit.Chem.rdchem.Mol object at 0x0000024C654...
5,<rdkit.Chem.rdchem.Mol object at 0x0000024C654...
...,...
156254,<rdkit.Chem.rdchem.Mol object at 0x0000024C646...
156255,<rdkit.Chem.rdchem.Mol object at 0x0000024C646...
156256,<rdkit.Chem.rdchem.Mol object at 0x0000024C646...
156257,<rdkit.Chem.rdchem.Mol object at 0x0000024C646...


In [164]:
df_BCUT2D = df['MolFromSmiles'].to_frame().apply(BCUT2D, axis = 1, result_type = 'expand')

In [165]:
df_BCUT2D.rename(lambda x: 'BCUT2D_' + str(x), axis = 1, inplace = True)

In [167]:
df_BCUT2D

Unnamed: 0_level_0,BCUT2D_0,BCUT2D_1,BCUT2D_2,BCUT2D_3,BCUT2D_4,BCUT2D_5,BCUT2D_6,BCUT2D_7
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16.140365,10.177342,2.317179,-2.222381,2.265224,-2.317194,6.233673,-0.113244
2,32.133540,10.069004,2.203357,-2.083512,2.366658,-2.027544,7.209717,0.103803
3,16.535517,10.075537,2.112934,-2.056819,2.081497,-2.162756,5.974898,-0.142005
4,32.233427,10.209864,2.246205,-2.253552,2.213854,-2.406944,7.887337,-0.117826
5,16.507765,10.022730,2.128110,-2.201005,2.319613,-2.098469,5.485279,0.354058
...,...,...,...,...,...,...,...,...
156254,79.918731,10.219467,2.194567,-1.966409,2.223925,-2.163551,9.103617,-0.390168
156255,32.233273,10.106073,2.241609,-2.303629,2.322790,-2.309934,7.924955,-0.111400
156256,16.138608,10.170560,2.219764,-2.346908,2.190050,-2.466472,5.353690,0.248383
156257,16.460501,10.062854,2.205942,-2.006917,2.242657,-2.183620,6.005441,0.095388


In [171]:
df_AUTO2D = df['MolFromSmiles'].to_frame().apply(CalcAUTOCORR2D, axis = 1, result_type = 'expand')

In [172]:
df_AUTO2D.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,182,183,184,185,186,187,188,189,190,191
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.164,3.562,3.538,3.479,3.15,3.044,2.793,1.946,2.985,3.329,...,0.968,0.0,0.601,0.728,0.834,1.059,1.592,1.258,0.936,1.339
2,3.693,4.071,4.216,4.073,3.942,3.926,3.845,3.713,3.455,3.806,...,1.269,1.097,0.851,0.854,0.891,1.414,1.138,1.103,1.159,0.894
3,3.476,3.796,3.827,3.665,3.54,3.39,3.43,3.287,3.318,3.575,...,1.086,1.071,0.737,1.006,1.056,1.021,1.025,1.475,1.167,1.044
4,3.69,3.937,3.992,3.814,3.679,3.47,3.398,3.334,3.267,3.535,...,0.913,1.024,0.695,0.685,0.825,1.359,1.62,0.805,1.213,0.95
5,3.205,3.434,3.524,3.444,3.38,3.322,3.171,2.871,3.062,3.292,...,0.976,0.986,1.109,0.788,0.677,1.321,0.942,0.935,0.969,1.071


In [174]:
df_AUTO3D = df['MolFromSmiles'].to_frame().apply(CalcAUTOCORR3D, axis = 1, result_type = 'expand')

In [175]:
df_AUTO3D.head()

INDEX
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
dtype: float64

In [None]:
df_AUTO2D.rename(columns = lambda x : 'AUTO2D_' + str(x))

TypeError: rename() got an unexpected keyword argument 'columns'

In [185]:
df_CalcEEMcharges = df['MolFromSmiles'].to_frame().apply(CalcEEMcharges, axis = 1, result_type = 'expand')

In [186]:
df_CalcEEMcharges

INDEX
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
          ..
156254   NaN
156255   NaN
156256   NaN
156257   NaN
156258   NaN
Length: 156258, dtype: float64

In [None]:
df_AUT02D.head()

NameError: name 'df_AUT02D' is not defined

In [182]:
df_CoulombMat = df['MolFromSmiles'].to_frame().apply(CalcCoulombMat, axis = 1, result_type = 'expand')

In [181]:
df_CoulombMat

MolFromSmiles   NaN
dtype: float64

In [178]:
df_CoulombMat.head()

INDEX
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
dtype: float64

In [None]:
df_BCUT2D.rename(lambda x: 'BCUT2D_' + str(x), axis = 1, inplace = True)

In [None]:
df.head()

Unnamed: 0_level_0,SMILES,ACTIVE,MolFromSmiles,GetNumAtoms
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0,<rdkit.Chem.rdchem.Mol object at 0x0000024A44C...,19
2,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0,<rdkit.Chem.rdchem.Mol object at 0x0000024A452...,30
3,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x0000024A452...,27
4,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x0000024A452...,26
5,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0,<rdkit.Chem.rdchem.Mol object at 0x0000024A452...,21


In [11]:
m = Chem.MolFromSmiles('Cc1ccccc1')

In [187]:
d.CalcEEMcharges(m)

RuntimeError: Pre-condition Violation
	molecule has no conformers
	Violation occurred on line 235 in file Code\GraphMol\Descriptors\EEM.cpp
	Failed Expression: mol.getNumConformers() >= 1
	RDKIT: 2022.09.1
	BOOST: 1_78


In [None]:
d.BCUT2D(m)

[13.782105281259431,
 10.2446262174219,
 1.7152546995077484,
 -1.8223238719751815,
 1.9162789955764012,
 -1.6221555927559288,
 5.11182720852277,
 1.475756642372446]

In [None]:
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])

In [None]:
df

Unnamed: 0,A,B
0,4,9
1,4,9
2,4,9


In [None]:
df.apply(lambda x: [1, 2], axis=1, result_type='expand')

Unnamed: 0,0,1
0,1,2
1,1,2
2,1,2


In [2]:
import pandas as pd

In [8]:
f = pd.read_csv('../data/train_folds.csv')

In [None]:
for

In [13]:
f.columns

Index(['SMILES', 'ACTIVE', 'GetNumAtoms', 'ExactMolWt', 'CalcChi0n',
       'CalcChi0v', 'CalcChi1n', 'CalcChi1v', 'CalcChi2n', 'CalcChi2v',
       ...
       'MorganFingerprint_115', 'MorganFingerprint_116',
       'MorganFingerprint_117', 'MorganFingerprint_118',
       'MorganFingerprint_119', 'MorganFingerprint_120',
       'MorganFingerprint_121', 'MorganFingerprint_122',
       'MorganFingerprint_123', 'kfold'],
      dtype='object', length=444)

In [34]:
f = pd.read_csv('../data/test_folds.csv')
f.drop(['SMILES', 'CalcMolFormula'], axis = 1, inplace = True)

In [38]:
f.to_csv('../data/test_folds.csv')

In [18]:
len(f['GetNumAtoms'].value_counts())

82

In [22]:
len(f['CalcMolFormula'].value_counts())

44896

In [23]:
for col in f.columns:
    if f[col].dtype == 'int64' and len(f[col].value_counts()) < 200:
        f[col] =f[col].astype('category')

In [27]:
from sklearn.preprocessing import OneHotEncoder

In [28]:
enc = OneHotEncoder(handle_unknown='ignore')

In [29]:
enc.fit(f.drop(['SMILES', 'CalcMolFormula', 'ACTIVE'], axis = 1), )

In [30]:
enc.categories_

[array([  2,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
         16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
         55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,
         68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  81,
         83,  84,  86, 101], dtype=int64),
 array([  33.02146372,   41.98485723,   59.03711378, ..., 1290.00324883,
        1319.83972539, 1447.43019964]),
 array([ 1.02456386,  1.37796447,  1.72474487, ..., 45.96353042,
        47.09366709, 54.32452698]),
 array([ 1.02456386,  1.72474487,  2.13389342, ..., 47.09366709,
        47.93988583, 55.83638487]),
 array([ 0.        ,  0.25819889,  0.61237244, ..., 27.71577954,
        28.44167499, 31.62123162]),
 array([ 0.        ,  0.25819889,  0.61237244, ..., 31.94442578,
        32.33583731, 32.37716056]),
 a

In [33]:
enc.transform(f.drop(['SMILES', 'CalcMolFormula', 'ACTIVE'], axis = 1)).toarray()


MemoryError: Unable to allocate 4.72 TiB for an array with shape (156258, 4149540) and data type float64

In [26]:
for col in f.columns:
    print(col, f[col].dtype)

SMILES object
ACTIVE float64
GetNumAtoms category
ExactMolWt float64
CalcChi0n float64
CalcChi0v float64
CalcChi1n float64
CalcChi1v float64
CalcChi2n float64
CalcChi2v float64
CalcChi3n float64
CalcChi3v float64
CalcChi4n float64
CalcChi4v float64
CalcHallKierAlpha float64
CalcKappa1 float64
CalcKappa2 float64
CalcKappa3 float64
CalcLabuteASA float64
CalcMolFormula object
CalcNumAmideBonds category
CalcNumAtomStereoCenters category
CalcNumAtoms category
CalcNumBridgeheadAtoms category
CalcNumHBA category
CalcNumHBD category
CalcNumHeavyAtoms category
CalcNumHeteroatoms category
CalcNumHeterocycles category
CalcNumLipinskiHBA category
CalcNumLipinskiHBD category
CalcNumRings category
CalcNumRotatableBonds category
CalcNumSpiroAtoms category
CalcNumUnspecifiedAtomStereoCenters category
CalcPhi float64
CalcTPSA float64
BCUT2D_0 float64
BCUT2D_1 float64
BCUT2D_2 float64
BCUT2D_3 float64
BCUT2D_4 float64
BCUT2D_5 float64
BCUT2D_6 float64
BCUT2D_7 float64
0 float64
1 float64
2 float64
3 flo

In [25]:
for col in f.columns:
    print(col, f[col].isnull().sum())

SMILES 0
ACTIVE 0
GetNumAtoms 0
ExactMolWt 0
CalcChi0n 0
CalcChi0v 0
CalcChi1n 0
CalcChi1v 0
CalcChi2n 0
CalcChi2v 0
CalcChi3n 0
CalcChi3v 0
CalcChi4n 0
CalcChi4v 0
CalcHallKierAlpha 0
CalcKappa1 0
CalcKappa2 0
CalcKappa3 0
CalcLabuteASA 0
CalcMolFormula 0
CalcNumAmideBonds 0
CalcNumAtomStereoCenters 0
CalcNumAtoms 0
CalcNumBridgeheadAtoms 0
CalcNumHBA 0
CalcNumHBD 0
CalcNumHeavyAtoms 0
CalcNumHeteroatoms 0
CalcNumHeterocycles 0
CalcNumLipinskiHBA 0
CalcNumLipinskiHBD 0
CalcNumRings 0
CalcNumRotatableBonds 0
CalcNumSpiroAtoms 0
CalcNumUnspecifiedAtomStereoCenters 0
CalcPhi 0
CalcTPSA 0
BCUT2D_0 36
BCUT2D_1 36
BCUT2D_2 36
BCUT2D_3 36
BCUT2D_4 36
BCUT2D_5 36
BCUT2D_6 36
BCUT2D_7 36
0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
6

In [7]:
f.head(-5)

Unnamed: 0,SMILES,ACTIVE,ACIVE,GetNumAtoms,ExactMolWt,CalcAsphericity,CalcChi0n,CalcChi0v,CalcChi1n,CalcChi1v,...,MorganFingerprint_114,MorganFingerprint_115,MorganFingerprint_116,MorganFingerprint_117,MorganFingerprint_118,MorganFingerprint_119,MorganFingerprint_120,MorganFingerprint_121,MorganFingerprint_122,MorganFingerprint_123
0,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0,,19,256.132411,,10.973884,10.973884,6.323651,6.323651,...,0,0,0,0,0,0,1,1,1,0
1,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0,,30,420.114378,,16.761958,17.578454,9.155463,9.971960,...,0,0,0,0,0,0,0,1,1,0
2,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0,,27,364.142307,,15.027436,15.027436,8.822423,8.822423,...,1,0,0,0,0,0,0,1,1,1
3,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0,,26,381.172227,,15.023244,15.839741,9.257566,10.697629,...,0,0,0,0,0,0,0,1,1,0
4,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0,,21,283.157229,,12.504213,12.504213,6.925557,6.925557,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208334,COc1ccc(N2C(=O)/C(=C\c3ccco3)N=C2SCC(=O)NC(C)(...,,,30,447.101955,,17.062192,18.634617,9.134542,10.498105,...,1,1,0,1,0,0,0,1,1,0
208335,CCN1CCN(CCn2c(=S)[nH]c3ccccc3c2=O)CC1,,,22,318.151432,,12.917286,13.733782,7.853513,8.261762,...,0,1,0,0,0,0,0,0,1,0
208336,Nc1cc(C(=O)OCC(=O)NC(c2ccccc2)c2ccccc2)ccc1Cl,,,28,394.108420,,14.970070,15.725999,8.657685,9.035649,...,1,0,0,0,0,0,0,1,1,0
208337,COCC(=O)Nc1cc(C)c2c(=O)oc3ccccc3c2n1,,,22,298.095357,,12.174065,12.174065,6.718888,6.718888,...,0,0,0,0,0,0,0,0,1,0


In [None]:
assert(len(f) == len(df))

AssertionError: 

In [None]:
f = pd.concat([f, df_BCUT2D], axis = 1)

In [None]:
f.head()

Unnamed: 0,INDEX,SMILES,ACTIVE,ACIVE,MolFromSmiles,GetNumAtoms,ExactMolWt,CalcAsphericity,CalcChi0n,CalcChi0v,...,CalcTPSA,CalcWHIM,BCUT2D_0,BCUT2D_1,BCUT2D_2,BCUT2D_3,BCUT2D_4,BCUT2D_5,BCUT2D_6,BCUT2D_7
0,1,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0,,<rdkit.Chem.rdchem.Mol object at 0x000002BEB02...,19,256.132411,,10.973884,10.973884,...,75.01,,,,,,,,,
1,2,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0,,<rdkit.Chem.rdchem.Mol object at 0x000002BEB02...,30,420.114378,,16.761958,17.578454,...,83.67,,16.140365,10.177342,2.317179,-2.222381,2.265224,-2.317194,6.233673,-0.113244
2,3,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0,,<rdkit.Chem.rdchem.Mol object at 0x000002BEB02...,27,364.142307,,15.027436,15.027436,...,89.12,,32.13354,10.069004,2.203357,-2.083512,2.366658,-2.027544,7.209717,0.103803
3,4,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0,,<rdkit.Chem.rdchem.Mol object at 0x000002BEB02...,26,381.172227,,15.023244,15.839741,...,78.95,,16.535517,10.075537,2.112934,-2.056819,2.081497,-2.162756,5.974898,-0.142005
4,5,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0,,<rdkit.Chem.rdchem.Mol object at 0x000002BEB02...,21,283.157229,,12.504213,12.504213,...,30.49,,32.233427,10.209864,2.246205,-2.253552,2.213854,-2.406944,7.887337,-0.117826


In [9]:
import rdkit.Chem.Fragments as f


In [22]:
f.fr_Ar_N(m)

0

In [23]:
from rdkit.Chem import AllChem

In [24]:
fp = AllChem.GetMorganFingerprintAsBitVect(m,2,nBits=124)

In [50]:
def GetMorganFingerprint(mol):
    try:
        return np.array(AllChem.GetMorganFingerprintAsBitVect(mol[0],2,nBits=124))
    except:
        return [np.nan]*124

In [28]:
np.array(fp)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [36]:
df['MolFromSmiles']

INDEX
1         <rdkit.Chem.rdchem.Mol object at 0x0000018C9E3...
2         <rdkit.Chem.rdchem.Mol object at 0x0000018C9E3...
3         <rdkit.Chem.rdchem.Mol object at 0x0000018C9E3...
4         <rdkit.Chem.rdchem.Mol object at 0x0000018C9E3...
5         <rdkit.Chem.rdchem.Mol object at 0x0000018C9E3...
                                ...                        
156254    <rdkit.Chem.rdchem.Mol object at 0x0000018D3D1...
156255    <rdkit.Chem.rdchem.Mol object at 0x0000018D3D1...
156256    <rdkit.Chem.rdchem.Mol object at 0x0000018D3D1...
156257    <rdkit.Chem.rdchem.Mol object at 0x0000018D3D1...
156258    <rdkit.Chem.rdchem.Mol object at 0x0000018D3D1...
Name: MolFromSmiles, Length: 156258, dtype: object

In [52]:
df_MorganFingerprint = df['MolFromSmiles'].to_frame().apply(GetMorganFingerprint, axis = 1, result_type = 'expand')

In [None]:
df['MolFromSmiles'].to_fram

In [51]:
fp = AllChem.GetMorganFingerprintAsBitVect(df['MolFromSmiles'].iloc[1],2,nBits=124)

In [46]:
np.array(fp)

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [53]:
df_MorganFingerprint

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,114,115,116,117,118,119,120,121,122,123
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,1,0
2,0,0,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
3,0,1,0,1,1,1,1,1,0,0,...,1,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
5,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156254,0,0,1,0,0,1,0,1,1,1,...,1,1,1,0,0,0,1,0,1,0
156255,1,0,1,0,1,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
156256,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
156257,1,0,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [54]:
df_MorganFingerprint.rename(columns = lambda x : 'MorganFingerprint_' + str(x), inplace = True)

In [55]:
df_MorganFingerprint

Unnamed: 0_level_0,MorganFingerprint_0,MorganFingerprint_1,MorganFingerprint_2,MorganFingerprint_3,MorganFingerprint_4,MorganFingerprint_5,MorganFingerprint_6,MorganFingerprint_7,MorganFingerprint_8,MorganFingerprint_9,...,MorganFingerprint_114,MorganFingerprint_115,MorganFingerprint_116,MorganFingerprint_117,MorganFingerprint_118,MorganFingerprint_119,MorganFingerprint_120,MorganFingerprint_121,MorganFingerprint_122,MorganFingerprint_123
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,1,0
2,0,0,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
3,0,1,0,1,1,1,1,1,0,0,...,1,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
5,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156254,0,0,1,0,0,1,0,1,1,1,...,1,1,1,0,0,0,1,0,1,0
156255,1,0,1,0,1,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
156256,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
156257,1,0,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/train_folds.csv')

In [5]:
len(df.columns)

443

In [10]:
from fastai.tabular.all import *
import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm import tqdm


from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest


def create_imputation(dataframe):
    df = dataframe.copy()
    imputation = {}
    for column in df.columns:
        # Numerical Columns replace with mean
        if column not in ["CLASS", "ID"] and (df[column].dtype == 'float64' or df[column].dtype == 'int64'):
            # in rare case of all values being NaN, replace with 0
            if len(pd.unique(df[column].dropna())) == 0:
                df.loc[:, column] = df[column].fillna(0) 
            

            imputation[column] = df[column].mean()
            df.loc[:, column] = df[column].fillna(df[column].mean())

        
        # Categorical Columns replace with mode
        elif column not in ["SMILES", "ACTIVE", "ID"] and (df[column].dtype == 'object' or df[column].dtype == 'category'):
            # in rare case of all values being NaN, replace with 
            if len(pd.unique(df[column].dropna())) == 0:
                if df[column].dtype == 'object':
                    df.loc[:, column] = df[column].fillna("")
                elif df[column].dtype == 'category':
                    df.loc[:, column] = df[column].fillna(df[column].cat.categories[0])
            
            imputation[column] = df[column].mode()[0]
            df.loc[:, column] = df[column].fillna(df[column].mode()[0])
            
    return df, imputation

# Input to apply_imputation:
# df         - a dataframe
# imputation - a mapping (dictionary) from column name to value that should replace missing values
#
# Output from apply_imputation:
# df - a new dataframe, where each missing value has been replaced according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Consider using fillna
def apply_imputation(dataframe, imputation):
    df = dataframe.copy()
    for column in df.columns:
        if column not in ["CLASS", "ID"] and imputation.get(column) is not None:
                df.loc[:, column] = df[column].fillna(imputation[column])
    return df


  from .autonotebook import tqdm as notebook_tqdm


In [25]:
df = pd.read_csv('../data/train_folds.csv', index_col = 0)

selection = SelectKBest(f_classif, k=200)

# Train data

X_train = df[df.kfold != 0].reset_index().copy()
X_train = X_train.drop(['kfold'], axis=1)
X_train['ACTIVE'] = X_train['ACTIVE'].astype('category')
X_train, imputation = create_imputation(X_train)

y_train = X_train['ACTIVE']
x_train = X_train.drop(['ACTIVE'], axis = 1)

selection.fit(x_train, y_train)
x_train = selection.transform(x_train)
X_train = pd.DataFrame(x_train, columns = [str(i) for i in range(x_train.shape[1])])
X_train['ACTIVE'] = y_train

# valid data



X_valid = df[df.kfold == 0].reset_index().copy()
X_valid = X_valid.drop(['kfold'], axis=1)
X_valid = apply_imputation(X_valid, imputation)
X_valid['ACTIVE'] = X_valid['ACTIVE'].astype('category')


y_valid = X_valid['ACTIVE']
x_valid = X_valid.drop(['ACTIVE'], axis = 1)


selection.transform(x_valid)
X_valid = pd.DataFrame(x_valid, columns = [str(i) for i in range(x_valid.shape[1])])
X_valid['ACTIVE'] = y_valid



In [27]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,ACTIVE
0,28.0,414.055339,14.808264,16.380689,8.153276,9.516839,5.908177,7.287112,3.893732,5.105228,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,35.0,495.110021,19.049852,19.866348,10.557865,11.374361,7.568361,8.577460,5.438234,6.421176,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,27.0,381.151098,15.566401,16.382898,9.235960,10.052457,6.889444,8.114189,5.183302,6.321774,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
3,36.0,488.228437,21.026829,21.026829,11.123477,11.123477,7.941228,7.941228,5.195582,5.195582,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
4,26.0,383.140259,15.711109,16.527606,8.680975,9.497472,6.659349,7.679970,3.680630,4.867917,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125001,30.0,415.110296,16.019153,16.835649,9.319017,10.802180,6.918534,8.783510,4.810824,6.476902,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
125002,35.0,479.194402,20.166394,20.166394,11.841969,11.841969,9.148556,9.148556,6.846842,6.846842,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
125003,23.0,330.103813,13.060436,13.876933,7.196765,8.182364,4.653565,5.617614,3.116457,4.106505,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
125004,27.0,371.180919,15.074748,15.074748,9.011983,9.011983,6.562153,6.562153,4.510827,4.510827,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [1]:
import pandas as pd

In [5]:
df  = pd.read_csv('../data/features.csv')

In [14]:
df.ACTIVE.value_counts().to_frame()

Unnamed: 0,ACTIVE
0.0,154528
1.0,1730


In [None]:
df = df.