# FE009 - More OpenBabel Features
- Add more neighbors
- Add weight of neighbor atoms

In [None]:
import dask.dataframe as dd
import pandas as pd
import matplotlib.pylab as plt

In [None]:
# Lazily open the competition train and test tables as Dask dataframes.
train, test = (
    dd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
%%time
# Lazily open every OpenBabel-derived table as a Dask dataframe.
angle_details = dd.read_csv('../data/openbabel/angle_details.csv')
atom_details = dd.read_csv('../data/openbabel/atom_details.csv')
bond_details = dd.read_csv('../data/openbabel/bond_details.csv')

# Columns of closest.csv that are forced to float64 on read. Each column is
# listed exactly once (the original dict literal repeated '10th_closest').
closest_float_cols = [
    'atom_index',
    'closest',
    '2nd_closest',
    '3rd_closest',
    '4th_closest',
    '5th_closest',
    '6th_closest',
    '7th_closest',
    '8th_closest',
    '9th_closest',
    '10th_closest',
]
closest = dd.read_csv('../data/openbabel/closest.csv',
                      dtype={col: 'float64' for col in closest_float_cols})

distances = dd.read_csv('../data/openbabel/distances.csv')
molecule_details = dd.read_csv('../data/openbabel/molecule_details.csv')
torsion_details = dd.read_csv('../data/openbabel/torsion_details.csv')

In [None]:
# Stack train and test row-wise so every feature below is built once for both.
tt = dd.concat([train, test], axis=0)

In [None]:
# Inspect the dimensions of the combined train+test frame.
tt.shape

In [None]:
# Preview the first rows of the combined train+test frame.
tt.head()

# Atom Details

In [None]:
# Preview the per-atom table before merging it onto the atom pairs.
atom_details.head()

In [None]:
%%time
# Attach the atom_details row of each endpoint of the pair (atom 0, then
# atom 1) and prefix the listed columns with 'atom0_' / 'atom1_'.
# NOTE(review): 'exact_mass' is renamed in the neighbor merges further down but
# not here; if atom_details has that column it will come out of these two
# merges with pandas' default '_x'/'_y' suffixes -- confirm this is intended.
for endpoint in ('0', '1'):
    endpoint_names = {
        field: 'atom' + endpoint + '_' + field
        for field in ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity')
    }
    tt = dd.merge(tt,
                  atom_details,
                  left_on=['molecule_name', 'atom_index_' + endpoint],
                  right_on=['molecule_name', 'atom_idx'],
                  how='left')
    tt = tt.drop(['atom_idx'], axis=1)
    tt = tt.rename(columns=endpoint_names)

In [None]:
# print(len(tt))

# Angles

In [None]:
%%time
# Column-wise mean of angle_details per (molecule, left atom, middle atom).
left_middle_mean = (
    angle_details
    .groupby(['molecule_name', 'left_atom_idx', 'middle_atom_idx'])
    .mean()
    .reset_index(drop=False)
)
# Match atom 0 to the left atom and atom 1 to the middle atom of the angle.
tt = dd.merge(tt,
              left_middle_mean,
              left_on=['molecule_name', 'atom_index_0', 'atom_index_1'],
              right_on=['molecule_name', 'left_atom_idx', 'middle_atom_idx'],
              how='left')
tt = tt.drop(['left_atom_idx', 'middle_atom_idx', 'right_atom_idx'], axis=1)
tt = tt.rename(columns={'angle': 'left_middle_average_angle'})

In [None]:
%%time
# Column-wise mean of angle_details per (molecule, right atom, middle atom).
right_middle_mean = (
    angle_details
    .groupby(['molecule_name', 'right_atom_idx', 'middle_atom_idx'])
    .mean()
    .reset_index(drop=False)
)
# Match atom 0 to the right atom and atom 1 to the middle atom of the angle.
tt = dd.merge(tt,
              right_middle_mean,
              left_on=['molecule_name', 'atom_index_0', 'atom_index_1'],
              right_on=['molecule_name', 'right_atom_idx', 'middle_atom_idx'],
              how='left')
tt = tt.drop(['left_atom_idx', 'middle_atom_idx', 'right_atom_idx'], axis=1)
tt = tt.rename(columns={'angle': 'right_middle_average_angle'})

In [None]:
%%time
# Attach the `distances` columns for the (atom_index_0, atom_index_1) pair,
# then discard the right-hand key columns.
pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1']
distance_keys = ['molecule_name', 'left_atom_idx', 'right_atom_idx']
tt = dd.merge(tt, distances, left_on=pair_keys, right_on=distance_keys, how='left')
tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

# Closest Neighbors

In [None]:
# Fix bug with the indexes being off by one
# `closest` is a lazy Dask dataframe, so the minimum has to be computed before
# it can be tested: a lazy Dask scalar in an `if` never reflects the data
# (recent Dask versions raise TypeError on it).
if closest['closest'].min().compute() == 1:
    # Shift the atom's own index and every neighbor-index column (any column
    # whose name contains 'close') down by one.
    shifted_cols = ['atom_index'] + [col for col in closest.columns if 'close' in col]
    for col in shifted_cols:
        closest[col] = closest[col] - 1

In [None]:
%%time
# Neighbor columns of `closest` that are carried over for atom 0.
neighbor_cols = ['closest', '2nd_closest', '3rd_closest', '4th_closest', '5th_closest',
                 '6th_closest', '7th_closest', '8th_closest', '9th_closest', '10th_closest']
tt = dd.merge(tt,
              closest[['molecule_name', 'atom_index'] + neighbor_cols],
              left_on=['molecule_name', 'atom_index_0'],
              right_on=['molecule_name', 'atom_index'],
              how='left')
tt = tt.drop(['atom_index'], axis=1)
# Tag every neighbor column as belonging to atom 0.
tt = tt.rename(columns={col: col + '_to_0' for col in neighbor_cols})

In [None]:
%%time
# Neighbor columns of `closest` that are carried over for atom 1.
neighbor_cols = ['closest', '2nd_closest', '3rd_closest', '4th_closest', '5th_closest',
                 '6th_closest', '7th_closest', '8th_closest', '9th_closest', '10th_closest']
tt = dd.merge(tt,
              closest[['molecule_name', 'atom_index'] + neighbor_cols],
              left_on=['molecule_name', 'atom_index_1'],
              right_on=['molecule_name', 'atom_index'],
              how='left')
tt = tt.drop(['atom_index'], axis=1)
# Tag every neighbor column as belonging to atom 1.
tt = tt.rename(columns={col: col + '_to_1' for col in neighbor_cols})

In [None]:
# Flag pairs where atom 1 is the atom listed as closest to atom 0.
tt['is_closest_pair'] = tt['closest_to_0'] == tt['atom_index_1']

# Distance to neighbors

## To atom0

In [None]:
%%time
# Look up the `distances` row between atom 0 and its closest neighbor. Columns
# of `distances` that clash with existing `tt` columns get the suffix.
neighbor_col = 'closest_to_0'
tt = dd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_0', neighbor_col],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_' + neighbor_col),
              how='left')
tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

In [None]:
%%time
# Look up the `distances` row between atom 0 and its 2nd closest neighbor.
# Columns of `distances` that clash with existing `tt` columns get the suffix.
neighbor_col = '2nd_closest_to_0'
tt = dd.merge(tt,
              distances,
              left_on=['molecule_name', 'atom_index_0', neighbor_col],
              right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
              suffixes=('', '_' + neighbor_col),
              how='left')
tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

In [None]:
%%time
# Look up the `distances` row between atom 0 and each of its 3rd..7th closest
# neighbors, one merge per rank. Columns of `distances` that clash with
# existing `tt` columns get a rank-specific suffix.
for rank in ('3rd', '4th', '5th', '6th', '7th'):
    neighbor_col = rank + '_closest_to_0'
    tt = dd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_0', neighbor_col],
                  right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
                  suffixes=('', '_' + neighbor_col),
                  how='left')
    tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

In [None]:
%%time
# Look up the `distances` row between atom 0 and each of its 8th..10th closest
# neighbors, one merge per rank. Columns of `distances` that clash with
# existing `tt` columns get a rank-specific suffix.
for rank in ('8th', '9th', '10th'):
    neighbor_col = rank + '_closest_to_0'
    tt = dd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_0', neighbor_col],
                  right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
                  suffixes=('', '_' + neighbor_col),
                  how='left')
    tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt.shape

## To atom1

In [None]:
%%time
# Look up the `distances` row between atom 1 and each of its four nearest
# listed neighbors, one merge per neighbor column. Columns of `distances` that
# clash with existing `tt` columns get a neighbor-specific suffix.
for neighbor in ('closest', '2nd_closest', '3rd_closest', '4th_closest'):
    neighbor_col = neighbor + '_to_1'
    tt = dd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_1', neighbor_col],
                  right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
                  suffixes=('', '_' + neighbor_col),
                  how='left')
    tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

In [None]:
%%time
# Look up the `distances` row between atom 1 and each of its 5th..7th closest
# neighbors, one merge per rank. Columns of `distances` that clash with
# existing `tt` columns get a rank-specific suffix.
for rank in ('5th', '6th', '7th'):
    neighbor_col = rank + '_closest_to_1'
    tt = dd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_1', neighbor_col],
                  right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
                  suffixes=('', '_' + neighbor_col),
                  how='left')
    tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)

In [None]:
%%time
# Look up the `distances` row between atom 1 and each of its 8th..10th closest
# neighbors, one merge per rank. Columns of `distances` that clash with
# existing `tt` columns get a rank-specific suffix.
for rank in ('8th', '9th', '10th'):
    neighbor_col = rank + '_closest_to_1'
    tt = dd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_1', neighbor_col],
                  right_on=['molecule_name', 'left_atom_idx', 'right_atom_idx'],
                  suffixes=('', '_' + neighbor_col),
                  how='left')
    tt = tt.drop(['left_atom_idx', 'right_atom_idx'], axis=1)
tt.shape

# Atom Details of Neighbors

In [None]:
%%time
# Attach the atom_details row of the atom closest to atom 0 and prefix the
# listed columns with the neighbor column name.
neighbor_col = 'closest_to_0'
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
tt = dd.merge(tt,
              atom_details,
              left_on=['molecule_name', neighbor_col],
              right_on=['molecule_name', 'atom_idx'],
              how='left')
tt = tt.drop(['atom_idx'], axis=1)
tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})
tt.shape

In [None]:
%%time
# Attach the atom_details row of the 2nd closest atom to atom 0 and prefix the
# listed columns with the neighbor column name.
neighbor_col = '2nd_closest_to_0'
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
tt = dd.merge(tt,
              atom_details,
              left_on=['molecule_name', neighbor_col],
              right_on=['molecule_name', 'atom_idx'],
              how='left')
tt = tt.drop(['atom_idx'], axis=1)
tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})
tt.shape

In [None]:
%%time
# Attach the atom_details row of the 3rd closest atom to atom 0 and prefix the
# listed columns with the neighbor column name.
neighbor_col = '3rd_closest_to_0'
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
tt = dd.merge(tt,
              atom_details,
              left_on=['molecule_name', neighbor_col],
              right_on=['molecule_name', 'atom_idx'],
              how='left')
tt = tt.drop(['atom_idx'], axis=1)
tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})
tt.shape

In [None]:
%%time
# Attach the atom_details row of the 4th closest atom to atom 0 and prefix the
# listed columns with the neighbor column name.
neighbor_col = '4th_closest_to_0'
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
tt = dd.merge(tt,
              atom_details,
              left_on=['molecule_name', neighbor_col],
              right_on=['molecule_name', 'atom_idx'],
              how='left')
tt = tt.drop(['atom_idx'], axis=1)
tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})
tt.shape

In [None]:
%%time
# Attach the atom_details row of the 5th closest atom to atom 0 and prefix the
# listed columns with the neighbor column name.
neighbor_col = '5th_closest_to_0'
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
tt = dd.merge(tt,
              atom_details,
              left_on=['molecule_name', neighbor_col],
              right_on=['molecule_name', 'atom_idx'],
              how='left')
tt = tt.drop(['atom_idx'], axis=1)
tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})

In [None]:
%%time
# Attach the atom_details row of each of atom 0's 6th..10th closest neighbors,
# one merge per rank, prefixing the listed columns with the neighbor column
# name.
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
for rank in ('6th', '7th', '8th', '9th', '10th'):
    neighbor_col = rank + '_closest_to_0'
    tt = dd.merge(tt,
                  atom_details,
                  left_on=['molecule_name', neighbor_col],
                  right_on=['molecule_name', 'atom_idx'],
                  how='left')
    tt = tt.drop(['atom_idx'], axis=1)
    tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})

In [None]:
%%time
# Attach the atom_details row of each of atom 1's ten listed neighbors, one
# merge per neighbor column, prefixing the listed columns with the neighbor
# column name.
atom_fields = ('atomic_number', 'atomic_mass', 'valence', 'spin_multiplicity', 'exact_mass')
neighbor_ranks = ('closest', '2nd_closest', '3rd_closest', '4th_closest', '5th_closest',
                  '6th_closest', '7th_closest', '8th_closest', '9th_closest', '10th_closest')
for neighbor in neighbor_ranks:
    neighbor_col = neighbor + '_to_1'
    tt = dd.merge(tt,
                  atom_details,
                  left_on=['molecule_name', neighbor_col],
                  right_on=['molecule_name', 'atom_idx'],
                  how='left')
    tt = tt.drop(['atom_idx'], axis=1)
    tt = tt.rename(columns={field: neighbor_col + '_' + field for field in atom_fields})

## Torsion Details

In [None]:
%%time
# mean/min/max/count of torsion_angle per (molecule, 2left atom, left atom).
torsion_2left_left = (
    torsion_details
    .groupby(['molecule_name', '2left_atom_idx', 'left_atom_idx'])['torsion_angle']
    .agg(['mean', 'min', 'max', 'count'])
    .reset_index()
)
# Match atom 0 to the 2left atom and atom 1 to the left atom.
tt = dd.merge(tt,
              torsion_2left_left,
              left_on=['molecule_name', 'atom_index_0', 'atom_index_1'],
              right_on=['molecule_name', '2left_atom_idx', 'left_atom_idx'],
              how='left')
tt = tt.drop(['2left_atom_idx', 'left_atom_idx'], axis=1)
tt = tt.rename(columns={stat: 'tor_ang_2leftleft_' + stat
                        for stat in ('mean', 'min', 'max', 'count')})

In [None]:
%%time
# mean/min/max/count of torsion_angle per (molecule, 2left atom, right atom).
torsion_2left_right = (
    torsion_details
    .groupby(['molecule_name', '2left_atom_idx', 'right_atom_idx'])['torsion_angle']
    .agg(['mean', 'min', 'max', 'count'])
    .reset_index()
)
# Match atom 0 to the 2left atom and atom 1 to the right atom.
tt = dd.merge(tt,
              torsion_2left_right,
              left_on=['molecule_name', 'atom_index_0', 'atom_index_1'],
              right_on=['molecule_name', '2left_atom_idx', 'right_atom_idx'],
              how='left')
tt = tt.drop(['2left_atom_idx', 'right_atom_idx'], axis=1)
tt = tt.rename(columns={stat: 'tor_ang_2leftright_' + stat
                        for stat in ('mean', 'min', 'max', 'count')})

In [None]:
# Preview the per-molecule table before merging it in.
molecule_details.head()

In [None]:
%%time
# Broadcast the per-molecule columns (all except 'num_residues') onto every
# atom pair of that molecule.
molecule_features = molecule_details.drop('num_residues', axis=1)
tt = dd.merge(tt,
              molecule_features,
              left_on=['molecule_name'],
              right_on=['molecule_name'],
              how='left')

In [None]:
# Preview the raw test table for comparison with the engineered frame.
test.head()

In [None]:
%%time
# Execute the whole lazy merge graph built above and keep the computed result
# in `tt2`, showing a progress bar while Dask works.
from dask.diagnostics import ProgressBar
with ProgressBar():
    tt2 = tt.compute()


In [None]:
# Persist the computed train+test feature table.
tt2.to_parquet('../data/FE009_tt.parquet')

In [None]:
# Split the combined frame back into test and train: test rows are the ones
# without a target value. The frame is sorted once, and the boolean mask is
# derived from that *sorted* frame so it lines up with the rows it filters (a
# mask taken from the unsorted `tt` is not guaranteed to align with the output
# of `sort_values`).
# NOTE(review): `tt` is still the lazy frame, so writing these results will
# re-run the merge graph; `tt2` already holds the computed result -- consider
# splitting that instead if a lazy result is not required here.
tt_by_id = tt.sort_values('id')
is_test_row = tt_by_id['scalar_coupling_constant'].isnull()
test_FE009 = tt_by_id.loc[is_test_row].reset_index(drop=True)
train_FE009 = tt_by_id.loc[~is_test_row].reset_index(drop=True)

In [None]:
# Dimensions of the engineered test split.
test_FE009.shape

In [None]:
# Dimensions of the raw test table, for comparison with the engineered split.
test.shape

In [None]:
# Dimensions of the engineered train split.
train_FE009.shape

In [None]:
# Dimensions of the raw train table, for comparison with the engineered split.
train.shape

In [None]:
# Columns whose name contains 'is_bond_' get their missing values replaced by
# False, first in the test split and then in the train split.
bool_cols = [name for name in train_FE009.columns if 'is_bond_' in name]
for split_frame in (test_FE009, train_FE009):
    split_frame[bool_cols] = split_frame[bool_cols].fillna(False)

In [None]:
# Persist the engineered test split.
test_FE009.to_parquet('../data/FE009_test.parquet')

In [None]:
# Persist the engineered train split.
train_FE009.to_parquet('../data/FE009_train.parquet')

In [None]:
# Show every column name of the engineered test split as a plain list.
list(test_FE009.columns)