# Use RDKit to Make More Features

In [None]:
%%bash -e
# Fetch the xyz2mol helper script once; the download is skipped when the file
# already exists in the working directory.
# NOTE(review): this pulls from the `master` branch, so the downloaded API may
# drift from the names this notebook imports — consider pinning a commit.
if ! [[ -f ./xyz2mol.py ]]; then
  wget https://raw.githubusercontent.com/jensengroup/xyz2mol/master/xyz2mol.py
fi

In [2]:
# ideas/code from https://www.kaggle.com/sunhwan/using-rdkit-for-atomic-feature-and-visualization
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import preprocessing
import gc
from multiprocessing import Pool
from tqdm import tqdm
from glob import glob

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit.Chem import Draw
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions #Only needed if modifying defaults
DrawingOptions.bondLineWidth=1.8
from rdkit.Chem.rdmolops import SanitizeFlags

# https://github.com/jensengroup/xyz2mol
from xyz2mol import xyz2mol, xyz2AC, AC2mol, read_xyz_file
import pickle

gc.collect()

30

In [4]:
# Root input directory: holds train.csv, test.csv, structures.csv and the
# per-molecule `structures/<molecule_name>.xyz` files read later on.
PATH = Path('../input')

# The full training set is loaded here; the commented-out `[::20]` slice would
# keep only every 20th row (5%) for a quicker run.
train = pd.read_csv(PATH/'train.csv')# [::20]
test = pd.read_csv(PATH/'test.csv')

In [5]:
# Split the coupling type string (e.g. '1JHC') into its two element symbols:
# character 2 becomes `atom1`, character 3 becomes `atom2`.
for frame in (train, test):
    for position, column in ((2, 'atom1'), (3, 'atom2')):
        frame[column] = frame['type'].map(lambda text, p=position: str(text)[p])

In [6]:
# Label-encode each of the four characters of the coupling type on its own.
# The encoder is re-fitted on the training column for every position and then
# applied as-is to the matching test column.
lbl = preprocessing.LabelEncoder()
for i in range(4):
    column = 'type' + str(i)
    pick_char = lambda value, pos=i: str(value)[pos]
    train[column] = lbl.fit_transform(train['type'].map(pick_char))
    test[column] = lbl.transform(test['type'].map(pick_char))

In [7]:
# Attach the coordinates (x0, y0, z0) of the first atom of every pair.
structures = pd.read_csv(PATH/'structures.csv').rename(
    columns={
        'atom_index': 'atom_index_0',
        'x': 'x0',
        'y': 'y0',
        'z': 'z0',
        'atom': 'atom1',
    }
)
train = train.merge(structures, how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
test = test.merge(structures, how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
del structures

In [8]:
# Attach the coordinates (x1, y1, z1) of the second atom of every pair.
structures = pd.read_csv(PATH/'structures.csv').rename(
    columns={
        'atom_index': 'atom_index_1',
        'x': 'x1',
        'y': 'y1',
        'z': 'z1',
        'atom': 'atom2',
    }
)
train = train.merge(structures, how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
test = test.merge(structures, how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
del structures

In [9]:
# Run a garbage-collection pass now that the temporary `structures` frames
# have been deleted.
gc.collect()

2545

In [10]:
def feature_atom(atom):
    """Describe a single atom as a flat dict of simple features.

    Args:
        atom: object exposing the accessors used below (``GetNeighbors``,
            ``GetDegree``, ``GetHybridization``, ``IsInRing``,
            ``IsInRingSize``), e.g. an atom taken from an RDKit molecule.

    Returns:
        dict with, in this order: ``degree``, ``hybridization`` (as int),
        ``inring``, ``inring3`` .. ``inring8`` (0/1 flags), and the neighbour
        counts ``nb_h``, ``nb_o``, ``nb_c``, ``nb_n`` plus ``nb_na`` for every
        neighbour that is none of H/O/C/N. Callers rely on the key order to
        derive column order.
    """
    neighbor_symbols = [nbr.GetSymbol() for nbr in atom.GetNeighbors()]
    counts = {element: neighbor_symbols.count(element) for element in ('H', 'O', 'C', 'N')}

    prop = {
        'degree': atom.GetDegree(),
        'hybridization': int(atom.GetHybridization()),
        'inring': int(atom.IsInRing()),
    }
    # Ring membership flags for ring sizes 3 through 8.
    for ring_size in range(3, 9):
        prop[f'inring{ring_size}'] = int(atom.IsInRingSize(ring_size))

    prop['nb_h'] = counts['H']
    prop['nb_o'] = counts['O']
    prop['nb_c'] = counts['C']
    prop['nb_n'] = counts['N']
    # Whatever is left over is a neighbour of some other element.
    prop['nb_na'] = len(neighbor_symbols) - sum(counts.values())
    return prop

In [11]:
def _pick_neighbor_idx(atom, molecule_name, atom_index):
    """Return the index of one bonded neighbour of `atom`, or a manual override.

    The last atom reported by ``atom.GetNeighbors()`` is used. When the atom
    has no neighbours, the module-level ``nblist`` table is consulted for
    ``nblist[molecule_name][atom_index]``.

    Raises:
        LookupError: the atom has no neighbours and no override is listed.
    """
    # The previous inline filter compared an int index with an Atom object, so
    # it never excluded anything; taking the last neighbour is equivalent.
    neighbor_ids = [nbr.GetIdx() for nbr in atom.GetNeighbors()]
    if neighbor_ids:
        return neighbor_ids[-1]

    overrides = nblist.get(molecule_name, {})
    if atom_index in overrides:
        return overrides[atom_index]

    raise LookupError(
        f'no bonded neighbor found for atom {atom_index} of {molecule_name} '
        f'and no manual override in nblist'
    )


def _features(args):
    """Compute the per-pair features for one ``(index, row)`` item.

    Args:
        args: tuple ``(idx, row)`` as produced by ``DataFrame.iterrows()``;
            ``row`` must provide ``molecule_name``, ``atom_index_0`` and
            ``atom_index_1``. ``idx`` is ignored.

    Returns:
        dict holding the three key fields followed by: ``a1_*`` features of
        atom_1, ``a0_nb_*`` features and ``x/y/z_a0_nb`` coordinates of a
        neighbour of atom_0, then ``a1_nb_*`` features and ``x/y/z_a1_nb``
        coordinates of a neighbour of atom_1. Key order is stable; callers use
        it as the column order.

    Raises:
        LookupError: an atom has no neighbour and ``nblist`` has no override
            (previously this surfaced as an unrelated unbound-variable error).
    """
    _idx, row = args
    molecule_name = row.molecule_name
    atom_index_0 = int(row.atom_index_0)
    atom_index_1 = int(row.atom_index_1)

    prop = {'molecule_name': molecule_name,
            'atom_index_0': atom_index_0,
            'atom_index_1': atom_index_1}

    # The molecule is rebuilt (or loaded from cache) per row instead of being
    # shared, which keeps memory low under multiprocessing.Pool.
    m = MolFromXYZ(PATH/f'structures/{molecule_name}.xyz')
    a0 = m.GetAtomWithIdx(atom_index_0)  # atom_0 is always hydrogen
    a1 = m.GetAtomWithIdx(atom_index_1)

    for k, v in feature_atom(a1).items():
        prop['a1_'+k] = v

    c = m.GetConformer()

    # neighbor of atom_0
    a0_nb_idx = _pick_neighbor_idx(a0, molecule_name, atom_index_0)
    for k, v in feature_atom(m.GetAtomWithIdx(a0_nb_idx)).items():
        prop['a0_nb_'+k] = v
    a0_nb_pos = c.GetAtomPosition(a0_nb_idx)
    prop['x_a0_nb'] = a0_nb_pos[0]
    prop['y_a0_nb'] = a0_nb_pos[1]
    prop['z_a0_nb'] = a0_nb_pos[2]

    # neighbor of atom_1
    # NOTE(review): nothing prevents the chosen neighbour from being atom_0
    # itself when the two atoms are bonded; kept as-is so feature values do
    # not change.
    a1_nb_idx = _pick_neighbor_idx(a1, molecule_name, atom_index_1)
    for k, v in feature_atom(m.GetAtomWithIdx(a1_nb_idx)).items():
        prop['a1_nb_'+k] = v
    a1_nb_pos = c.GetAtomPosition(a1_nb_idx)
    prop['x_a1_nb'] = a1_nb_pos[0]
    prop['y_a1_nb'] = a1_nb_pos[1]
    prop['z_a1_nb'] = a1_nb_pos[2]
    return prop

def features(df, n_cpu=42):
    """Add the `_features` columns to every row of `df` using a process pool.

    Args:
        df: DataFrame with ``molecule_name``, ``atom_index_0`` and
            ``atom_index_1`` columns.
        n_cpu: number of worker processes; defaults to the previously
            hard-coded value of 42.

    Returns:
        `df` left-merged with the computed features on the three key columns.
        An empty `df` is returned unchanged (no feature columns are added,
        because there is no row to derive them from).
    """
    key_columns = ['molecule_name', 'atom_index_0', 'atom_index_1']
    if len(df) == 0:
        return df

    # Probe one row in-process, before any worker is started, to learn the
    # feature names and fix their order.
    keys = list(_features((0, df.iloc[0])).keys())

    pairs = df[key_columns]
    rows = []
    with Pool(n_cpu) as p:
        with tqdm(total=len(pairs)) as pbar:
            for res in p.imap_unordered(_features, pairs.iterrows()):
                # Plain lists are cheaper to collect than dicts; results
                # arrive in arbitrary order and are re-aligned by the merge.
                rows.append([res[k] for k in keys])
                pbar.update()

    prop = pd.DataFrame.from_records(rows, columns=keys)
    return pd.merge(df, prop, how='left', on=key_columns)

In [12]:
# Extract some simple atomic features for atom_index_0 and atom_index_1.

# Drop any `mols` variable left over from a previous run, then collect garbage.
if 'mols' in locals(): del mols
import gc
gc.collect()

# Manual bond fixes, keyed as {molecule_name: {atom_index: neighbor_index}}.
# `_features` falls back to this table when its neighbor lookup fails.
# dsgdb9nsd_059827: the hydrogen is far apart from the other atoms, so
# atom 13 is mapped to neighbor 3 by hand.
nblist = {
    'dsgdb9nsd_059827': {
        13: 3
    }
}


In [13]:
# Directory in which MolFromXYZ looks for cached `<molecule>.pkl` files.
CACHEDIR = Path('./')

def chiral_stereo_check(mol):
    """Sanitize `mol` without the property check, detect bond stereo, return it.

    The same `mol` object that was passed in is returned.
    """
    # Leave out only SANITIZE_PROPERTIES to avoid a sanitization error,
    # e.g. on dsgdb9nsd_037900.xyz.
    sanitize_ops = SanitizeFlags.SANITIZE_ALL - SanitizeFlags.SANITIZE_PROPERTIES
    Chem.SanitizeMol(mol, sanitize_ops)
    Chem.DetectBondStereochemistry(mol, -1)
    # Stereochemistry is ignored for now; these calls stay disabled:
    #   Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    #   Chem.AssignAtomChiralTagsFromStructure(mol, -1)
    return mol

def xyz2mol(atomicNumList,charge,xyz_coordinates,charged_fragments,quick):
    """Build a molecule from atomic numbers and coordinates.

    This definition replaces the `xyz2mol` imported from the xyz2mol module so
    that the local `chiral_stereo_check` is applied to the result.

    Returns:
        The molecule produced by `AC2mol` after `chiral_stereo_check`.
    """
    ac, base_mol = xyz2AC(atomicNumList, xyz_coordinates)
    bonded_mol = AC2mol(base_mol, ac, atomicNumList, charge, charged_fragments, quick)
    return chiral_stereo_check(bonded_mol)

def MolFromXYZ(filename):
    """Return the molecule for an .xyz file, using a pickle cache when present.

    Args:
        filename: ``pathlib.Path`` of the .xyz file; its stem names the cache
            entry ``CACHEDIR/<stem>.pkl``.

    Returns:
        The cached molecule if the cache file exists, otherwise the molecule
        built with `read_xyz_file` and `xyz2mol`.

    Raises:
        Exception: any error from reading or converting the file is re-raised
            after the file name has been printed (previously the error was
            swallowed and an unrelated unbound-variable error followed).
    """
    charged_fragments = True
    quick = True
    cache_filename = CACHEDIR/f'{filename.stem}.pkl'
    if cache_filename.exists():
        # NOTE: unpickling can execute arbitrary code; only keep files you
        # produced yourself in CACHEDIR.
        with open(cache_filename, 'rb') as cache_file:
            return pickle.load(cache_file)

    try:
        atomicNumList, charge, xyz_coordinates = read_xyz_file(filename)
        mol = xyz2mol(atomicNumList, charge, xyz_coordinates, charged_fragments, quick)
        # Writing the cache is disabled so the committed kernel works: it ran
        # fine interactively but failed on commit.
        #pickle.dump(mol, open(cache_filename, 'wb'))
    except Exception:
        # Report which file failed, then surface the real error.
        print(filename)
        raise
    return mol

# Create Train Features by Type and Save as Parquet

In [14]:
# Compute the features separately for each coupling type of the training set
# and store every group in its own parquet file.
for coupling_type, chunk in train.groupby('type'):
    print(f'Running for {coupling_type} with shape {chunk.shape}')
    chunk = features(chunk)
    chunk.to_parquet(f'../data/FE020/OnlyNew/FE020-train-{coupling_type}.parquet')

Running for 1JHC with shape (709416, 18)


100%|██████████| 709416/709416 [03:49<00:00, 3084.96it/s]


Running for 1JHN with shape (43363, 18)


100%|██████████| 43363/43363 [00:14<00:00, 2906.22it/s]


Running for 2JHC with shape (1140674, 18)


100%|██████████| 1140674/1140674 [06:12<00:00, 3062.72it/s]


Running for 2JHH with shape (378036, 18)


100%|██████████| 378036/378036 [01:58<00:00, 3181.88it/s]


Running for 2JHN with shape (119253, 18)


100%|██████████| 119253/119253 [00:39<00:00, 3042.82it/s]


Running for 3JHC with shape (1510379, 18)


100%|██████████| 1510379/1510379 [08:07<00:00, 3097.29it/s]


Running for 3JHH with shape (590611, 18)


100%|██████████| 590611/590611 [03:05<00:00, 3186.34it/s]


Running for 3JHN with shape (166415, 18)


100%|██████████| 166415/166415 [00:54<00:00, 3033.52it/s]


In [15]:
# The training frame has been written out per type above; drop it and run a
# garbage-collection pass before the test set is processed.
del train
gc.collect()

7

# Create Test Features

In [16]:
# Compute the features separately for each coupling type of the test set and
# store every group in its own parquet file.
for coupling_type, chunk in test.groupby('type'):
    print(f'Running for {coupling_type} with shape {chunk.shape}')
    chunk = features(chunk)
    chunk.to_parquet(f'../data/FE020/OnlyNew/FE020-test-{coupling_type}.parquet')

Running for 1JHC with shape (380609, 17)


100%|██████████| 380609/380609 [02:02<00:00, 3118.00it/s]


Running for 1JHN with shape (24195, 17)


100%|██████████| 24195/24195 [00:08<00:00, 3023.05it/s]


Running for 2JHC with shape (613138, 17)


100%|██████████| 613138/613138 [03:18<00:00, 3083.54it/s]


Running for 2JHH with shape (203126, 17)


100%|██████████| 203126/203126 [01:05<00:00, 3118.02it/s]


Running for 2JHN with shape (64424, 17)


100%|██████████| 64424/64424 [00:19<00:00, 3301.16it/s]


Running for 3JHC with shape (811999, 17)


100%|██████████| 811999/811999 [04:25<00:00, 3061.97it/s]


Running for 3JHH with shape (317435, 17)


100%|██████████| 317435/317435 [01:41<00:00, 3133.07it/s]


Running for 3JHN with shape (90616, 17)


100%|██████████| 90616/90616 [00:29<00:00, 3057.03it/s]


# Join with FE019

In [1]:
import os

In [2]:
os.listdir('../data/FE020/OnlyNew/')

['FE020-train-3JHN.parquet',
 'FE020-test-1JHC.parquet',
 'FE020-train-2JHH.parquet',
 'FE020-test-2JHC.parquet',
 'FE020-test-2JHN.parquet',
 'FE020-test-3JHH.parquet',
 'FE020-train-1JHN.parquet',
 'FE020-test-2JHH.parquet',
 'FE020-train-2JHN.parquet',
 'FE020-train-3JHH.parquet',
 'FE020-test-3JHN.parquet',
 'FE020-test-1JHN.parquet',
 'FE020-test-3JHC.parquet',
 'FE020-train-2JHC.parquet',
 'FE020-train-1JHC.parquet',
 'FE020-train-3JHC.parquet']

In [3]:
import os
import pandas as pd
# Join each FE020 "OnlyNew" file with its FE019 counterpart column-wise and
# write the combined frame to ../data/FE020/.
#for file in tqdm(os.listdir('../data/FE020/OnlyNew/')):
for file in ['FE020-train-3JHC.parquet']:

    print(f'Runing for {file}')
    df = pd.read_parquet('../data/FE020/OnlyNew/'+file)
    df2 = pd.read_parquet('../data/FE019/'+file.replace('FE020','FE019'))

    df2 = df2.drop(['mulliken_charge_0',
                     'mulliken_charge_1',
                     'mulliken_charge_closest_0',
                     'mulliken_charge_closest_1',
                     'mulliken_charge_2nd_closest_0',
                     'mulliken_charge_2nd_closest_1'], axis=1)
    df2 = df2.drop([x for x in df2.columns
                    if '_atomic_mass' in x or '_atomic_number' in x], axis=1)

    # pd.concat(axis=1) aligns rows on index labels, not on position, so both
    # frames get a fresh positional index after being sorted by id.
    df2 = df2.sort_values('id').reset_index(drop=True)
    df = df.sort_values('id').reset_index(drop=True)

    if df.shape[0] != df2.shape[0]:
        raise ValueError(
            f'ERROR - wrong dimensions for {file}: {df.shape[0]} vs {df2.shape[0]} rows')
    if not (df['id'].values == df2['id'].values).all():
        raise ValueError(f'ERROR - ids of the two frames do not match for {file}')
    print('Dimensions are good')
    length_before_merge = df.shape[0]

    # Columns already present in the FE019 frame; the target only exists in
    # the training files.
    duplicated_columns = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type']
    if 'train' in file:
        duplicated_columns.append('scalar_coupling_constant')
    elif 'test' not in file:
        raise ValueError(f'Cannot tell whether {file} is a train or test file')

    df_combined = pd.concat([df2, df.drop(duplicated_columns, axis=1)],
                            sort=False,
                            axis=1)
    if df_combined.shape[0] != length_before_merge:
        # Do not write a file whose row count changed during the join.
        raise ValueError('Error- wrong shape of merge')
    print('Good shape after merge')
    df_combined.to_parquet('../data/FE020/' + file)

Runing for FE020-train-3JHC.parquet
Dimensions are good
Good shape after merge


In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('../data/FE020/FE020-test-1JHC.parquet')

In [4]:
df.tail()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,exact_mass_x,atom0_valence,atom0_spin_multiplicity,exact_mass_y,...,a1_nb_inring7,a1_nb_inring8,a1_nb_nb_h,a1_nb_nb_o,a1_nb_nb_c,a1_nb_nb_n,a1_nb_nb_na,x_a1_nb,y_a1_nb,z_a1_nb
380604,7163650,dsgdb9nsd_133885,11,2,1JHC,,1.007825,1,0,12.0,...,0,0,0,0,1,0,0,-1.454004,-0.967309,1.459246
380605,7163659,dsgdb9nsd_133885,12,3,1JHC,,1.007825,1,0,12.0,...,0,0,0,0,1,0,0,0.277779,-2.697872,0.19577
380606,7163668,dsgdb9nsd_133885,13,4,1JHC,,1.007825,1,0,12.0,...,0,0,0,0,1,0,0,2.515854,-1.151784,0.527369
380607,7163679,dsgdb9nsd_133885,14,7,1JHC,,1.007825,1,0,12.0,...,0,0,0,0,1,0,0,0.013699,1.199431,-1.680192
380608,7163688,dsgdb9nsd_133885,15,8,1JHC,,1.007825,1,0,12.0,...,0,0,0,0,1,0,0,1.260745,-1.246754,-1.906767


In [6]:
df.select_dtypes('object')

Unnamed: 0,molecule_name,type,molecule_name.1,atom1,atom2
0,dsgdb9nsd_000004,1JHC,dsgdb9nsd_000004,H,C
1,dsgdb9nsd_000004,1JHC,dsgdb9nsd_000004,H,C
2,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
3,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
4,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
5,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
6,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
7,dsgdb9nsd_000015,1JHC,dsgdb9nsd_000015,H,C
8,dsgdb9nsd_000016,1JHC,dsgdb9nsd_000016,H,C
9,dsgdb9nsd_000016,1JHC,dsgdb9nsd_000016,H,C
