# FE009 - More OpenBabel Features
- Add more neighbors
- Add weight of neightbor atoms

In [1]:
import pandas as pd
import matplotlib.pylab as plt

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
%%time
angle_details = pd.read_csv('../data/openbabel/angle_details.csv')
atom_details = pd.read_csv('../data/openbabel/atom_details.csv')
bond_details = pd.read_csv('../data/openbabel/bond_details.csv')
closest = pd.read_csv('../data/openbabel/closest.csv')
distances = pd.read_csv('../data/openbabel/distances.csv')
molecule_details = pd.read_csv('../data/openbabel/molecule_details.csv')
torsion_details = pd.read_csv('../data/openbabel/torsion_details.csv')

CPU times: user 45.1 s, sys: 3.1 s, total: 48.2 s
Wall time: 52.7 s


In [4]:
tt = pd.concat([train, test], axis=0, sort=False)

In [5]:
tt.shape

(7163689, 6)

In [6]:
tt.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


# Atom Details

In [7]:
atom_details.head()

Unnamed: 0,molecule_name,atom_idx,atomic_mass,atomic_number,exact_mass,valence,spin_multiplicity
0,dsgdb9nsd_033805,0,14.0067,7,14.003074,1,0
1,dsgdb9nsd_033805,1,12.0107,6,12.0,2,0
2,dsgdb9nsd_033805,2,12.0107,6,12.0,4,0
3,dsgdb9nsd_033805,3,12.0107,6,12.0,4,0
4,dsgdb9nsd_033805,4,12.0107,6,12.0,4,0


In [8]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','atom_index_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'atom0_atomic_number',
                     'atomic_mass':'atom0_atomic_mass',
                     'valence':'atom0_valence',
                     'spin_multiplicity':'atom0_spin_multiplicity'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','atom_index_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'atom1_atomic_number',
                     'atomic_mass':'atom1_atomic_mass',
                     'valence':'atom1_valence',
                     'spin_multiplicity':'atom1_spin_multiplicity'})
tt.shape

CPU times: user 5.12 s, sys: 3.6 s, total: 8.73 s
Wall time: 8.74 s


# Angles

In [9]:
%%time
tt = pd.merge(tt,
         angle_details.groupby(['molecule_name','left_atom_idx','middle_atom_idx']).mean() \
              .reset_index(drop=False),
         left_on=['molecule_name','atom_index_0','atom_index_1'],
         right_on=['molecule_name','left_atom_idx','middle_atom_idx'],
         how='left') \
    .drop(['left_atom_idx','middle_atom_idx','right_atom_idx'], axis=1) \
    .rename(columns={'angle':'left_middle_average_angle'})

CPU times: user 7.71 s, sys: 3.84 s, total: 11.5 s
Wall time: 11.6 s


In [10]:
%%time
tt = pd.merge(tt,
         angle_details.groupby(['molecule_name','right_atom_idx','middle_atom_idx']).mean() \
              .reset_index(drop=False),
         left_on=['molecule_name','atom_index_0','atom_index_1'],
         right_on=['molecule_name','right_atom_idx','middle_atom_idx'],
         how='left') \
    .drop(['left_atom_idx','middle_atom_idx','right_atom_idx'], axis=1) \
    .rename(columns={'angle':'right_middle_average_angle'})

CPU times: user 7.79 s, sys: 3.96 s, total: 11.7 s
Wall time: 11.8 s


In [11]:
%%time
tt = pd.merge(tt,
         distances,
         left_on=['molecule_name','atom_index_0','atom_index_1'],
         right_on=['molecule_name','left_atom_idx','right_atom_idx'],
         how='left')\
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt.shape

CPU times: user 12 s, sys: 4.37 s, total: 16.4 s
Wall time: 16.4 s


# Closest Neighbors

In [12]:
# Fix bug with the indexes being off by one
if closest['closest'].min() == 1:
    closest['atom_index'] = closest['atom_index'] - 1
    for col in closest.columns:
        if 'close' in col:
            closest[col] = closest[col] - 1

In [13]:
%%time
tt = pd.merge(tt,
         closest[['molecule_name','atom_index','closest','2nd_closest','3rd_closest','4th_closest',
                  '5th_closest', '6th_closest','7th_closest','8th_closest','9th_closest','10th_closest']],
        left_on=['molecule_name','atom_index_0'],
        right_on=['molecule_name','atom_index'],
        how='left') \
    .drop(['atom_index'], axis=1) \
    .rename(columns={'closest':'closest_to_0',
                     '2nd_closest':'2nd_closest_to_0',
                     '3rd_closest':'3rd_closest_to_0',
                     '4th_closest':'4th_closest_to_0',
                     '5th_closest':'5th_closest_to_0',
                     '6th_closest':'6th_closest_to_0',
                     '7th_closest':'7th_closest_to_0',
                     '8th_closest':'8th_closest_to_0',
                     '9th_closest':'9th_closest_to_0',
                     '10th_closest':'10th_closest_to_0'})

CPU times: user 3.68 s, sys: 3.68 s, total: 7.36 s
Wall time: 7.38 s


In [14]:
%%time
tt = pd.merge(tt,
         closest[['molecule_name','atom_index','closest','2nd_closest','3rd_closest','4th_closest',
                  '5th_closest', '6th_closest','7th_closest','8th_closest','9th_closest','10th_closest']],
        left_on=['molecule_name','atom_index_1'],
        right_on=['molecule_name','atom_index'],
        how='left') \
    .drop(['atom_index'], axis=1) \
    .rename(columns={'closest':'closest_to_1',
                     '2nd_closest':'2nd_closest_to_1',
                     '3rd_closest':'3rd_closest_to_1',
                     '4th_closest':'4th_closest_to_1',
                     '5th_closest':'5th_closest_to_1',
                     '6th_closest':'6th_closest_to_1',
                     '7th_closest':'7th_closest_to_1',
                     '8th_closest':'8th_closest_to_1',
                     '9th_closest':'9th_closest_to_1',
                     '10th_closest':'10th_closest_to_1'})

CPU times: user 4.71 s, sys: 5.55 s, total: 10.3 s
Wall time: 10.3 s


In [15]:
tt.shape

(7163689, 40)

In [16]:
tt['is_closest_pair'] = tt['closest_to_0'] == tt['atom_index_1']
tt.shape

(7163689, 41)

# Distance to neighbors

## To atom0

In [17]:
%%time
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)

CPU times: user 13.7 s, sys: 7.56 s, total: 21.3 s
Wall time: 21.3 s


In [18]:
%%time
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','2nd_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_2nd_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)

CPU times: user 14.2 s, sys: 7.74 s, total: 22 s
Wall time: 22 s


In [19]:
%%time
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','3rd_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_3rd_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','4th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_4th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','5th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_5th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','6th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_6th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','7th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_7th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)

CPU times: user 1min 45s, sys: 53.7 s, total: 2min 38s
Wall time: 2min 38s


In [20]:
%%time
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','8th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_8th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','9th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_9th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0','10th_closest_to_0'],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_10th_closest_to_0'),
             how='left') \
    .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt.shape

CPU times: user 1min 21s, sys: 43.8 s, total: 2min 5s
Wall time: 2min 5s


## To atom1

In [21]:
tt.shape

(7163689, 61)

In [22]:
tt.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom0_atomic_mass,atom0_atomic_number,exact_mass_x,atom0_valence,atom0_spin_multiplicity,atom1_atomic_mass,atom1_atomic_number,exact_mass_y,atom1_valence,atom1_spin_multiplicity,left_middle_average_angle,right_middle_average_angle,distance,is_bond,closest_to_0,2nd_closest_to_0,3rd_closest_to_0,4th_closest_to_0,5th_closest_to_0,6th_closest_to_0,7th_closest_to_0,8th_closest_to_0,9th_closest_to_0,10th_closest_to_0,closest_to_1,2nd_closest_to_1,3rd_closest_to_1,4th_closest_to_1,5th_closest_to_1,6th_closest_to_1,7th_closest_to_1,8th_closest_to_1,9th_closest_to_1,10th_closest_to_1,is_closest_pair,distance_closest_to_0,is_bond_closest_to_0,distance_2nd_closest_to_0,is_bond_2nd_closest_to_0,distance_3rd_closest_to_0,is_bond_3rd_closest_to_0,distance_4th_closest_to_0,is_bond_4th_closest_to_0,distance_5th_closest_to_0,is_bond_5th_closest_to_0,distance_6th_closest_to_0,is_bond_6th_closest_to_0,distance_7th_closest_to_0,is_bond_7th_closest_to_0,distance_8th_closest_to_0,is_bond_8th_closest_to_0,distance_9th_closest_to_0,is_bond_9th_closest_to_0,distance_10th_closest_to_0,is_bond_10th_closest_to_0
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1.00794,1,1.007825,1,0,12.0107,6,12.0,4,0,,,1.091953,True,0,2,3.0,4.0,,,,,,,3,4,2.0,1.0,,,,,,,True,1.091953,True,1.78312,False,1.783147,False,1.783157,False,,,,,,,,,,,,
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,1.00794,1,1.007825,1,0,1.00794,1,1.007825,1,0,,,1.78312,False,0,2,3.0,4.0,,,,,,,0,1,4.0,3.0,,,,,,,False,1.091953,True,1.78312,False,1.783147,False,1.783157,False,,,,,,,,,,,,
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,1.00794,1,1.007825,1,0,1.00794,1,1.007825,1,0,,,1.783147,False,0,2,3.0,4.0,,,,,,,0,1,4.0,2.0,,,,,,,False,1.091953,True,1.78312,False,1.783147,False,1.783157,False,,,,,,,,,,,,
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,1.00794,1,1.007825,1,0,1.00794,1,1.007825,1,0,,,1.783157,False,0,2,3.0,4.0,,,,,,,0,3,2.0,1.0,,,,,,,False,1.091953,True,1.78312,False,1.783147,False,1.783157,False,,,,,,,,,,,,
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.00794,1,1.007825,1,0,12.0107,6,12.0,4,0,,,1.091952,True,0,1,4.0,3.0,,,,,,,3,4,2.0,1.0,,,,,,,True,1.091952,True,1.78312,False,1.783148,False,1.783158,False,,,,,,,,,,,,


In [23]:
%%time
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', 'closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '2nd_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_2nd_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '3rd_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_3rd_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '4th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_4th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)

CPU times: user 2min 2s, sys: 1min 7s, total: 3min 10s
Wall time: 3min 10s


In [24]:
%%time
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '5th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_5th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '6th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_6th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '7th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_7th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)

CPU times: user 2min, sys: 1min 1s, total: 3min 1s
Wall time: 3min 2s


In [25]:
%%time
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '8th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_8th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '9th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_9th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt = pd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_1', '10th_closest_to_1'],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_10th_closest_to_1'),
              how='left') \
    .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt.shape

CPU times: user 2min 28s, sys: 1min 11s, total: 3min 39s
Wall time: 3min 39s


# Atom Details of Neighbors

In [26]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'closest_to_0_atomic_number',
                     'atomic_mass':'closest_to_0_atomic_mass',
                     'valence':'closest_to_0_valence',
                     'spin_multiplicity':'closest_to_0_spin_multiplicity',
                     'exact_mass': 'closest_to_0_exact_mass'})
tt.shape

CPU times: user 45.6 s, sys: 26.6 s, total: 1min 12s
Wall time: 1min 12s


In [27]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','2nd_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'2nd_closest_to_0_atomic_number',
                     'atomic_mass':'2nd_closest_to_0_atomic_mass',
                     'valence':'2nd_closest_to_0_valence',
                     'spin_multiplicity':'2nd_closest_to_0_spin_multiplicity',
                     'exact_mass': '2nd_closest_to_0_exact_mass'})
tt.shape

CPU times: user 47.3 s, sys: 30.1 s, total: 1min 17s
Wall time: 1min 17s


In [28]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','3rd_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'3rd_closest_to_0_atomic_number',
                     'atomic_mass':'3rd_closest_to_0_atomic_mass',
                     'valence':'3rd_closest_to_0_valence',
                     'spin_multiplicity':'3rd_closest_to_0_spin_multiplicity',
                     'exact_mass': '3rd_closest_to_0_exact_mass'})
tt.shape

CPU times: user 50.2 s, sys: 34.8 s, total: 1min 24s
Wall time: 1min 25s


In [29]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','4th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'4th_closest_to_0_atomic_number',
                     'atomic_mass':'4th_closest_to_0_atomic_mass',
                     'valence':'4th_closest_to_0_valence',
                     'spin_multiplicity':'4th_closest_to_0_spin_multiplicity',
                     'exact_mass': '4th_closest_to_0_exact_mass'})
tt.shape

CPU times: user 52.1 s, sys: 39.6 s, total: 1min 31s
Wall time: 1min 31s


In [30]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','5th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'5th_closest_to_0_atomic_number',
                     'atomic_mass':'5th_closest_to_0_atomic_mass',
                     'valence':'5th_closest_to_0_valence',
                     'spin_multiplicity':'5th_closest_to_0_spin_multiplicity',
                     'exact_mass': '5th_closest_to_0_exact_mass'})

CPU times: user 54.9 s, sys: 43.9 s, total: 1min 38s
Wall time: 1min 38s


In [31]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','6th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'6th_closest_to_0_atomic_number',
                     'atomic_mass':'6th_closest_to_0_atomic_mass',
                     'valence':'6th_closest_to_0_valence',
                     'spin_multiplicity':'6th_closest_to_0_spin_multiplicity',
                     'exact_mass': '6th_closest_to_0_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','7th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'7th_closest_to_0_atomic_number',
                     'atomic_mass':'7th_closest_to_0_atomic_mass',
                     'valence':'7th_closest_to_0_valence',
                     'spin_multiplicity':'7th_closest_to_0_spin_multiplicity',
                     'exact_mass': '7th_closest_to_0_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','8th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'8th_closest_to_0_atomic_number',
                     'atomic_mass':'8th_closest_to_0_atomic_mass',
                     'valence':'8th_closest_to_0_valence',
                     'spin_multiplicity':'8th_closest_to_0_spin_multiplicity',
                     'exact_mass': '8th_closest_to_0_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','9th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'9th_closest_to_0_atomic_number',
                     'atomic_mass':'9th_closest_to_0_atomic_mass',
                     'valence':'9th_closest_to_0_valence',
                     'spin_multiplicity':'9th_closest_to_0_spin_multiplicity',
                     'exact_mass': '9th_closest_to_0_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','10th_closest_to_0'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'10th_closest_to_0_atomic_number',
                     'atomic_mass':'10th_closest_to_0_atomic_mass',
                     'valence':'10th_closest_to_0_valence',
                     'spin_multiplicity':'10th_closest_to_0_spin_multiplicity',
                     'exact_mass': '10th_closest_to_0_exact_mass'})

CPU times: user 4min 51s, sys: 4min 22s, total: 9min 13s
Wall time: 9min 14s


In [32]:
%%time
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'closest_to_1_atomic_number',
                     'atomic_mass':'closest_to_1_atomic_mass',
                     'valence':'closest_to_1_valence',
                     'spin_multiplicity':'closest_to_1_spin_multiplicity',
                     'exact_mass': 'closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','2nd_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'2nd_closest_to_1_atomic_number',
                     'atomic_mass':'2nd_closest_to_1_atomic_mass',
                     'valence':'2nd_closest_to_1_valence',
                     'spin_multiplicity':'2nd_closest_to_1_spin_multiplicity',
                     'exact_mass': '2nd_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','3rd_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'3rd_closest_to_1_atomic_number',
                     'atomic_mass':'3rd_closest_to_1_atomic_mass',
                     'valence':'3rd_closest_to_1_valence',
                     'spin_multiplicity':'3rd_closest_to_1_spin_multiplicity',
                     'exact_mass': '3rd_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','4th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'4th_closest_to_1_atomic_number',
                     'atomic_mass':'4th_closest_to_1_atomic_mass',
                     'valence':'4th_closest_to_1_valence',
                     'spin_multiplicity':'4th_closest_to_1_spin_multiplicity',
                     'exact_mass': '4th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','5th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'5th_closest_to_1_atomic_number',
                     'atomic_mass':'5th_closest_to_1_atomic_mass',
                     'valence':'5th_closest_to_1_valence',
                     'spin_multiplicity':'5th_closest_to_1_spin_multiplicity',
                     'exact_mass': '5th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','6th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'6th_closest_to_1_atomic_number',
                     'atomic_mass':'6th_closest_to_1_atomic_mass',
                     'valence':'6th_closest_to_1_valence',
                     'spin_multiplicity':'6th_closest_to_1_spin_multiplicity',
                     'exact_mass': '6th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','7th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'7th_closest_to_1_atomic_number',
                     'atomic_mass':'7th_closest_to_1_atomic_mass',
                     'valence':'7th_closest_to_1_valence',
                     'spin_multiplicity':'7th_closest_to_1_spin_multiplicity',
                     'exact_mass': '7th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','8th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'8th_closest_to_1_atomic_number',
                     'atomic_mass':'8th_closest_to_1_atomic_mass',
                     'valence':'8th_closest_to_1_valence',
                     'spin_multiplicity':'8th_closest_to_1_spin_multiplicity',
                     'exact_mass': '8th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','9th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'9th_closest_to_1_atomic_number',
                     'atomic_mass':'9th_closest_to_1_atomic_mass',
                     'valence':'9th_closest_to_1_valence',
                     'spin_multiplicity':'9th_closest_to_1_spin_multiplicity',
                     'exact_mass': '9th_closest_to_1_exact_mass'})
tt = pd.merge(tt,
         atom_details,
         left_on=['molecule_name','10th_closest_to_1'],
         right_on=['molecule_name','atom_idx'],
         how='left') \
    .drop(['atom_idx'], axis=1) \
    .rename(columns={'atomic_number':'10th_closest_to_1_atomic_number',
                     'atomic_mass':'10th_closest_to_1_atomic_mass',
                     'valence':'10th_closest_to_1_valence',
                     'spin_multiplicity':'10th_closest_to_1_spin_multiplicity',
                     'exact_mass': '10th_closest_to_1_exact_mass'})

CPU times: user 12min 14s, sys: 13min 10s, total: 25min 24s
Wall time: 25min 27s


## Torsion Details

In [33]:
%%time
tt = pd.merge(tt,
        torsion_details.groupby(['molecule_name','2left_atom_idx','left_atom_idx'])['torsion_angle'] \
                  .agg(['mean','min','max','count'])\
                  .reset_index(),
        left_on=['molecule_name','atom_index_0','atom_index_1'],
         right_on=['molecule_name','2left_atom_idx','left_atom_idx'],
         how='left') \
    .drop(['2left_atom_idx','left_atom_idx'], axis=1) \
    .rename(columns={'mean': 'tor_ang_2leftleft_mean',
                     'min': 'tor_ang_2leftleft_min',
                    'max': 'tor_ang_2leftleft_max',
                    'count': 'tor_ang_2leftleft_count'})
tt.shape

CPU times: user 1min 29s, sys: 1min 42s, total: 3min 11s
Wall time: 3min 12s


In [34]:
%%time
tt = pd.merge(tt,
         torsion_details.groupby(['molecule_name','2left_atom_idx','right_atom_idx'])['torsion_angle'] \
                  .agg(['mean','min','max','count'])\
                  .reset_index(),
         left_on=['molecule_name','atom_index_0','atom_index_1'],
         right_on=['molecule_name','2left_atom_idx','right_atom_idx'],
         how='left') \
    .drop(['2left_atom_idx','right_atom_idx'], axis=1) \
    .rename(columns={'mean': 'tor_ang_2leftright_mean',
                     'min': 'tor_ang_2leftright_min',
                     'max': 'tor_ang_2leftright_max',
                     'count': 'tor_ang_2leftright_count'})
tt.shape

CPU times: user 1min 30s, sys: 1min 49s, total: 3min 20s
Wall time: 3min 20s


In [35]:
molecule_details.head()

Unnamed: 0,molecule_name,mol_wt,num_atoms,num_bonds,num_residues
0,dsgdb9nsd_033805,117.14788,16,17,0
1,dsgdb9nsd_018833,110.1537,18,20,0
2,dsgdb9nsd_013391,115.13046,17,17,0
3,dsgdb9nsd_063427,123.19552,22,22,0
4,dsgdb9nsd_106472,125.16834,20,21,0


In [36]:
%%time
tt = pd.merge(tt,
        molecule_details.drop('num_residues', axis=1),
         left_on=['molecule_name'],
         right_on=['molecule_name'],
         how='left')

CPU times: user 1min 19s, sys: 1min 27s, total: 2min 47s
Wall time: 2min 47s


In [37]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [38]:
test_FE009 = tt.sort_values('id').loc[tt['scalar_coupling_constant'].isnull()].reset_index(drop=True)
train_FE009 = tt.sort_values('id').loc[~tt['scalar_coupling_constant'].isnull()].reset_index(drop=True)

In [39]:
test_FE009.shape

(2505542, 192)

In [40]:
test.shape

(2505542, 5)

In [41]:
train_FE009.shape

(4658147, 192)

In [42]:
train.shape

(4658147, 6)

In [43]:
bool_cols = [col for col in train_FE009.columns if 'is_bond_' in col]
test_FE009[bool_cols] = test_FE009[bool_cols].fillna(False)
train_FE009[bool_cols] = train_FE009[bool_cols].fillna(False)

In [44]:
test_FE009.to_parquet('../data/FE009_test_pandas.parquet')

  result = infer_dtype(pandas_collection)


In [45]:
train_FE009.to_parquet('../data/FE009_train_pandas.parquet')

  result = infer_dtype(pandas_collection)


In [46]:
[col for col in test_FE009.columns]

['id',
 'molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'atom0_atomic_mass',
 'atom0_atomic_number',
 'exact_mass_x',
 'atom0_valence',
 'atom0_spin_multiplicity',
 'atom1_atomic_mass',
 'atom1_atomic_number',
 'exact_mass_y',
 'atom1_valence',
 'atom1_spin_multiplicity',
 'left_middle_average_angle',
 'right_middle_average_angle',
 'distance',
 'is_bond',
 'closest_to_0',
 '2nd_closest_to_0',
 '3rd_closest_to_0',
 '4th_closest_to_0',
 '5th_closest_to_0',
 '6th_closest_to_0',
 '7th_closest_to_0',
 '8th_closest_to_0',
 '9th_closest_to_0',
 '10th_closest_to_0',
 'closest_to_1',
 '2nd_closest_to_1',
 '3rd_closest_to_1',
 '4th_closest_to_1',
 '5th_closest_to_1',
 '6th_closest_to_1',
 '7th_closest_to_1',
 '8th_closest_to_1',
 '9th_closest_to_1',
 '10th_closest_to_1',
 'is_closest_pair',
 'distance_closest_to_0',
 'is_bond_closest_to_0',
 'distance_2nd_closest_to_0',
 'is_bond_2nd_closest_to_0',
 'distance_3rd_closest_to_0',
 'is_bond_3rd_closest_to_