# FE010 - More OpenBabel Features
- Add ALL neighbors
- Add weight of neighbor atoms

- Merge with previously calculated version

- The 10 closest neighbors (`closest` through `10th_closest`) were already calculated in FE009, so only the 11th through 28th closest are computed here

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc

In [2]:
# Raw train/test tables; a later cell stacks them into a single frame (`tt`).
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
%%time
# Load the precomputed OpenBabel tables from ../data/openbabel.
# Only `closest`, `distances` and `atom_details` are referenced by the cells
# visible further down in this notebook; the other four tables are loaded here
# but not used by those cells.
angle_details = pd.read_csv('../data/openbabel/angle_details.csv')
atom_details = pd.read_csv('../data/openbabel/atom_details.csv')
bond_details = pd.read_csv('../data/openbabel/bond_details.csv')
closest = pd.read_csv('../data/openbabel/closest.csv')
distances = pd.read_csv('../data/openbabel/distances.csv')
molecule_details = pd.read_csv('../data/openbabel/molecule_details.csv')
torsion_details = pd.read_csv('../data/openbabel/torsion_details.csv')

CPU times: user 43.9 s, sys: 2.72 s, total: 46.7 s
Wall time: 46.7 s


In [4]:
# Stack train and test into one frame so every feature merge is done once.
# Rows that came from `test` have no scalar_coupling_constant, which is how the
# frame is split back apart after the checkpoints are written.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating the row labels of `train` and `test` (duplicate labels make any
# label-based alignment on `tt` ambiguous).
tt = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

In [5]:
# Sanity check: row count should equal len(train) + len(test).
tt.shape

(7163689, 6)

In [6]:
# Peek at the stacked frame before any features are merged in.
tt.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [7]:
# Peek at the neighbour table as read from disk: one row per (molecule, atom),
# one column per neighbour rank; ranks an atom does not have are NaN.
closest.head()

Unnamed: 0,molecule_name,atom_index,closest,2nd_closest,3rd_closest,4th_closest,5th_closest,6th_closest,7th_closest,8th_closest,9th_closest,10th_closest,11th_closest,12th_closest,13th_closest,14th_closest,15th_closest,16th_closest,17th_closest,18th_closest,19th_closest,20th_closest,21st_closest,22nd_closest,23rd_closest,24th_closest,25th_closest,26th_closest,27th_closest,28th_closest
0,dsgdb9nsd_033805,1,2,3,7.0,9.0,4.0,14.0,16.0,11.0,10.0,6.0,8.0,5.0,13.0,15.0,12.0,,,,,,,,,,,,,
1,dsgdb9nsd_033805,2,1,3,9.0,7.0,4.0,16.0,14.0,11.0,10.0,8.0,6.0,5.0,15.0,13.0,12.0,,,,,,,,,,,,,
2,dsgdb9nsd_033805,3,2,9,7.0,4.0,10.0,11.0,5.0,8.0,6.0,16.0,14.0,1.0,12.0,15.0,13.0,,,,,,,,,,,,,
3,dsgdb9nsd_033805,4,11,10,5.0,3.0,12.0,6.0,8.0,9.0,7.0,2.0,13.0,15.0,16.0,14.0,1.0,,,,,,,,,,,,,
4,dsgdb9nsd_033805,5,12,6,8.0,4.0,10.0,11.0,3.0,7.0,9.0,13.0,15.0,14.0,16.0,2.0,1.0,,,,,,,,,,,,,


# Closest Neighbors
11th to 28th

In [8]:
# The neighbour table was written with atom indices that are one too high.
# Shift `atom_index` and every neighbour column down by one in a single step.
# The `min() == 1` guard keeps the cell from shifting twice when it is re-run.
if closest['closest'].min() == 1:
    index_like_cols = ['atom_index'] + [name for name in closest.columns if 'close' in name]
    closest[index_like_cols] = closest[index_like_cols] - 1

In [9]:
# All neighbour-rank columns in file order; the first ten were already handled
# in FE009, so only the remainder is kept.
all_ngbr_cols = [name for name in closest.columns if 'closest' in name]
ngbrs_to_run = all_ngbr_cols[10:]
ngbrs_to_run

['11th_closest',
 '12th_closest',
 '13th_closest',
 '14th_closest',
 '15th_closest',
 '16th_closest',
 '17th_closest',
 '18th_closest',
 '19th_closest',
 '20th_closest',
 '21st_closest',
 '22nd_closest',
 '23rd_closest',
 '24th_closest',
 '25th_closest',
 '26th_closest',
 '27th_closest',
 '28th_closest']

In [10]:
%%time
tt = pd.merge(tt,
         closest[['molecule_name','atom_index','11th_closest',
                 '12th_closest',
                 '13th_closest',
                 '14th_closest',
                 '15th_closest',
                 '16th_closest',
                 '17th_closest',
                 '18th_closest',
                 '19th_closest',
                 '20th_closest',
                 '21st_closest',
                 '22nd_closest',
                 '23rd_closest',
                 '24th_closest',
                 '25th_closest',
                 '26th_closest',
                 '27th_closest',
                 '28th_closest']],
        left_on=['molecule_name','atom_index_0'],
        right_on=['molecule_name','atom_index'],
        how='left') \
    .drop(['atom_index'], axis=1) \
    .rename(columns={'11th_closest':'11th_closest_to_0',
                     '12th_closest':'12th_closest_to_0',
                     '13th_closest':'13th_closest_to_0',
                     '14th_closest':'14th_closest_to_0',
                     '15th_closest':'15th_closest_to_0',
                     '16th_closest':'16th_closest_to_0',
                     '17th_closest':'17th_closest_to_0',
                     '18th_closest':'18th_closest_to_0',
                     '19th_closest':'19th_closest_to_0',
                     '20th_closest':'20th_closest_to_0',
                     '21st_closest':'21st_closest_to_0',
                     '22nd_closest':'22nd_closest_to_0',
                     '23rd_closest':'23rd_closest_to_0',
                     '24th_closest':'24th_closest_to_0',
                     '25th_closest':'25th_closest_to_0',
                     '26th_closest':'26th_closest_to_0',
                     '27th_closest':'27th_closest_to_0',
                     '28th_closest':'28th_closest_to_0'})

CPU times: user 2.78 s, sys: 2.48 s, total: 5.26 s
Wall time: 5.27 s


In [10]:
%%time
tt = pd.merge(tt,
         closest[['molecule_name','atom_index','11th_closest',
                 '12th_closest',
                 '13th_closest',
                 '14th_closest',
                 '15th_closest',
                 '16th_closest',
                 '17th_closest',
                 '18th_closest',
                 '19th_closest',
                 '20th_closest',
                 '21st_closest',
                 '22nd_closest',
                 '23rd_closest',
                 '24th_closest',
                 '25th_closest',
                 '26th_closest',
                 '27th_closest',
                 '28th_closest']],
        left_on=['molecule_name','atom_index_1'],
        right_on=['molecule_name','atom_index'],
        how='left') \
    .drop(['atom_index'], axis=1) \
    .rename(columns={'11th_closest':'11th_closest_to_1',
                     '12th_closest':'12th_closest_to_1',
                     '13th_closest':'13th_closest_to_1',
                     '14th_closest':'14th_closest_to_1',
                     '15th_closest':'15th_closest_to_1',
                     '16th_closest':'16th_closest_to_1',
                     '17th_closest':'17th_closest_to_1',
                     '18th_closest':'18th_closest_to_1',
                     '19th_closest':'19th_closest_to_1',
                     '20th_closest':'20th_closest_to_1',
                     '21st_closest':'21st_closest_to_1',
                     '22nd_closest':'22nd_closest_to_1',
                     '23rd_closest':'23rd_closest_to_1',
                     '24th_closest':'24th_closest_to_1',
                     '25th_closest':'25th_closest_to_1',
                     '26th_closest':'26th_closest_to_1',
                     '27th_closest':'27th_closest_to_1',
                     '28th_closest':'28th_closest_to_1'})

CPU times: user 2.61 s, sys: 2.4 s, total: 5.01 s
Wall time: 5.02 s


In [11]:
# Each neighbour merge above is a left join that adds 18 columns; the row count
# should still match the earlier tt.shape.
tt.shape

(7163689, 24)

# Distance to neighbors

## To atom0

In [13]:
# Names of the atom_index_0 neighbour columns (11th through 28th closest).
far_ranks = ['11th', '12th', '13th', '14th', '15th', '16th',
             '17th', '18th', '19th', '20th', '21st', '22nd',
             '23rd', '24th', '25th', '26th', '27th', '28th']
clostest_0_list = [rank + '_closest_to_0' for rank in far_ranks]

In [14]:
%%time
for closest in tqdm(clostest_0_list):
    tt = pd.merge(tt,
        distances,
        left_on=['molecule_name','atom_index_0', closest],
        right_on=['molecule_name','left_atom_idx','right_atom_idx'],
        suffixes=('','_{}'.format(closest)),
             how='left') \
        .drop(['left_atom_idx','right_atom_idx'], axis=1)
tt.shape

100%|██████████| 18/18 [13:23<00:00, 69.98s/it]

CPU times: user 8min 43s, sys: 4min 38s, total: 13min 22s
Wall time: 13min 23s





In [15]:
# Checkpoint the frame after the atom_index_0 distance merges.
# NOTE: this cell only writes the file; nothing is deleted or freed here.
# The FE010-temp/ directory is assumed to exist already.
tt.to_parquet('FE010-temp/part1.parquet')

## To atom1

In [12]:
# Names of the atom_index_1 neighbour columns (11th through 28th closest).
clostest_1_list = ['11th_closest_to_1',
                    '12th_closest_to_1',
                    '13th_closest_to_1',
                    '14th_closest_to_1',
                    '15th_closest_to_1',
                    '16th_closest_to_1',
                    '17th_closest_to_1',
                    '18th_closest_to_1',
                    '19th_closest_to_1',
                    '20th_closest_to_1',
                    '21st_closest_to_1',
                    '22nd_closest_to_1',
                    '23rd_closest_to_1',
                    '24th_closest_to_1',
                    '25th_closest_to_1',
                    '26th_closest_to_1',
                    '27th_closest_to_1',
                    '28th_closest_to_1']

# For every far neighbour of atom_index_1, look up the row of `distances`
# for the (atom_index_1, neighbour) pair and attach its value columns,
# tagged with the neighbour column name.
dist_key_cols = ['molecule_name', 'left_atom_idx', 'right_atom_idx']
# Everything `distances` contributes besides the join keys.
dist_value_cols = [col for col in distances.columns if col not in dist_key_cols]

# The loop variable is deliberately NOT called `closest`: that name is the
# neighbour DataFrame, and rebinding it to a string breaks any re-run of the
# earlier cells that use it.
for ngbr_col in tqdm(clostest_1_list):
    cols_before = set(tt.columns)
    tt = pd.merge(tt,
                  distances,
                  left_on=['molecule_name', 'atom_index_1', ngbr_col],
                  right_on=dist_key_cols,
                  suffixes=('', '_{}'.format(ngbr_col)),
                  how='left') \
        .drop(['left_atom_idx', 'right_atom_idx'], axis=1)
    # pandas applies `suffixes` only when a column name clashes. When nothing
    # clashes (e.g. the first iteration on a frame without distance columns)
    # the new columns would keep the bare names from `distances`. Tag any
    # column that arrived unsuffixed so every neighbour's columns are named
    # consistently. Relabelling `tt.columns` does not copy the data.
    tt.columns = [
        '{}_{}'.format(col, ngbr_col)
        if col in dist_value_cols and col not in cols_before else col
        for col in tt.columns
    ]
tt.shape

100%|██████████| 18/18 [12:06<00:00, 61.90s/it]


(7163689, 60)

In [14]:
# Checkpoint the frame after the atom_index_1 distance merges.
# NOTE: this cell only writes the file; nothing is deleted or freed here.
# The FE010-temp/ directory is assumed to exist already.
tt.to_parquet('FE010-temp/part2.parquet')

# Atom Details of Neighbors

In [13]:
%%time

clostest_0_list = ['11th_closest_to_0',
'12th_closest_to_0',
'13th_closest_to_0',
'14th_closest_to_0',
'15th_closest_to_0',
'16th_closest_to_0',
'17th_closest_to_0',
'18th_closest_to_0',
'19th_closest_to_0',
'20th_closest_to_0',
'21st_closest_to_0',
'22nd_closest_to_0',
'23rd_closest_to_0',
'24th_closest_to_0',
'25th_closest_to_0',
'26th_closest_to_0',
'27th_closest_to_0',
'28th_closest_to_0']

for closest in tqdm(clostest_0_list):
    tt = pd.merge(tt,
             atom_details,
             left_on=['molecule_name', closest],
             right_on=['molecule_name','atom_idx'],
             how='left') \
        .drop(['atom_idx'], axis=1) \
        .rename(columns={'atomic_number':'{}_atomic_number'.format(closest),
                         'atomic_mass':'{}_atomic_mass'.format(closest),
                         'valence':'{}_valence'.format(closest),
                         'spin_multiplicity':'{}_spin_multiplicity'.format(closest),
                         'exact_mass': '{}_exact_mass'.format(closest)})
tt.shape

100%|██████████| 18/18 [03:59<00:00, 19.59s/it]

CPU times: user 1min 31s, sys: 2min 27s, total: 3min 59s
Wall time: 3min 59s





In [14]:
# Checkpoint the frame after the atom-detail merges for atom_index_0's neighbours.
tt.to_parquet('FE010-temp/part3.parquet')

## Closest to 1

In [12]:
%%time
clostest_1_list = ['11th_closest_to_1',
                    '12th_closest_to_1',
                    '13th_closest_to_1',
                    '14th_closest_to_1',
                    '15th_closest_to_1',
                    '16th_closest_to_1',
                    '17th_closest_to_1',
                    '18th_closest_to_1',
                    '19th_closest_to_1',
                    '20th_closest_to_1',
                    '21st_closest_to_1',
                    '22nd_closest_to_1',
                    '23rd_closest_to_1',
                    '24th_closest_to_1',
                    '25th_closest_to_1',
                    '26th_closest_to_1',
                    '27th_closest_to_1',
                    '28th_closest_to_1']

for closest in tqdm(clostest_1_list):
    tt = pd.merge(tt,
             atom_details,
             left_on=['molecule_name', closest],
             right_on=['molecule_name','atom_idx'],
             how='left') \
        .drop(['atom_idx'], axis=1) \
        .rename(columns={'atomic_number':'{}_atomic_number'.format(closest),
                         'atomic_mass':'{}_atomic_mass'.format(closest),
                         'valence':'{}_valence'.format(closest),
                         'spin_multiplicity':'{}_spin_multiplicity'.format(closest),
                         'exact_mass': '{}_exact_mass'.format(closest)})
tt.shape

100%|██████████| 18/18 [03:42<00:00, 17.98s/it]

CPU times: user 1min 27s, sys: 2min 14s, total: 3min 42s
Wall time: 3min 42s





In [13]:
# Checkpoint the frame after the atom-detail merges for atom_index_1's neighbours.
tt.to_parquet('FE010-temp/part4.parquet')

# Reset - load all saved parquet files and merge

In [1]:
import pandas as pd

# Reload checkpoint 1 and order it by id once, so both halves share that order.
ttpart1 = pd.read_parquet('FE010-temp/part1.parquet')
part1_by_id = ttpart1.sort_values('id')
# Rows with a null target go to the test split, the rest to train.
part1_target_missing = part1_by_id['scalar_coupling_constant'].isnull()

test_FE010_part1 = part1_by_id.loc[part1_target_missing].reset_index(drop=True)
train_FE010_part1 = part1_by_id.loc[~part1_target_missing].reset_index(drop=True)

test_FE010_part1.to_parquet('FE010-temp/test_part1.parquet')
train_FE010_part1.to_parquet('FE010-temp/train_part1.parquet')

In [1]:
import pandas as pd

# Reload checkpoint 2 and order it by id once, so both halves share that order.
ttpart2 = pd.read_parquet('FE010-temp/part2.parquet')
part2_by_id = ttpart2.sort_values('id')
# Rows with a null target go to the test split, the rest to train.
part2_target_missing = part2_by_id['scalar_coupling_constant'].isnull()

test_FE010_part2 = part2_by_id.loc[part2_target_missing].reset_index(drop=True)
train_FE010_part2 = part2_by_id.loc[~part2_target_missing].reset_index(drop=True)

test_FE010_part2.to_parquet('FE010-temp/test_part2.parquet')
train_FE010_part2.to_parquet('FE010-temp/train_part2.parquet')

In [1]:
import pandas as pd

# Reload checkpoint 3 and order it by id once, so both halves share that order.
ttpart3 = pd.read_parquet('FE010-temp/part3.parquet')
part3_by_id = ttpart3.sort_values('id')
# Rows with a null target go to the test split, the rest to train.
part3_target_missing = part3_by_id['scalar_coupling_constant'].isnull()

test_FE010_part3 = part3_by_id.loc[part3_target_missing].reset_index(drop=True)
train_FE010_part3 = part3_by_id.loc[~part3_target_missing].reset_index(drop=True)

test_FE010_part3.to_parquet('FE010-temp/test_part3.parquet')
train_FE010_part3.to_parquet('FE010-temp/train_part3.parquet')

In [1]:
import pandas as pd

# Reload checkpoint 4 and order it by id once, so both halves share that order.
ttpart4 = pd.read_parquet('FE010-temp/part4.parquet')
part4_by_id = ttpart4.sort_values('id')
# Rows with a null target go to the test split, the rest to train.
part4_target_missing = part4_by_id['scalar_coupling_constant'].isnull()

test_FE010_part4 = part4_by_id.loc[part4_target_missing].reset_index(drop=True)
train_FE010_part4 = part4_by_id.loc[~part4_target_missing].reset_index(drop=True)

test_FE010_part4.to_parquet('FE010-temp/test_part4.parquet')
train_FE010_part4.to_parquet('FE010-temp/train_part4.parquet')

# Merge Train Files including FE009

In [1]:
import pandas as pd
# Load the FE009 training features and the four FE010 training checkpoints.
# NOTE: the per-type merge loop further down reads these same files again
# itself, so it does not depend on the frames loaded here.
train009 = pd.read_parquet('../data/FE009_train_pandas.parquet')
train_p1 = pd.read_parquet('FE010-temp/train_part1.parquet')
train_p2 = pd.read_parquet('FE010-temp/train_part2.parquet')
train_p3 = pd.read_parquet('FE010-temp/train_part3.parquet')
train_p4 = pd.read_parquet('FE010-temp/train_part4.parquet')

In [3]:
# Reload the FE009 training features and list the coupling types they contain,
# in order of first appearance.
train009 = pd.read_parquet('../data/FE009_train_pandas.parquet')
types = train009['type'].drop_duplicates().tolist()

In [4]:
# Show the coupling types; the merge loops below hard-code this same list.
print(types)

['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN']


In [18]:
import pandas as pd
from tqdm import tqdm

types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN']
train_part_paths = ['FE010-temp/train_part1.parquet',
                    'FE010-temp/train_part2.parquet',
                    'FE010-temp/train_part3.parquet',
                    'FE010-temp/train_part4.parquet']

# Build one training file per coupling type: FE009 features plus every FE010
# column that FE009 does not already have.
for t in tqdm(types):
    train009 = pd.read_parquet('../data/FE009_train_pandas.parquet')
    train009 = train009.loc[train009['type'] == t]

    # Load and filter one checkpoint at a time so only a single full-size
    # checkpoint is in memory at any moment.
    for part_path in train_part_paths:
        train_part = pd.read_parquet(part_path)
        train_part = train_part.loc[train_part['type'] == t]

        # concat(axis=1) aligns rows purely by index label. If the two frames
        # are not row-for-row identical, the result is silently wrong, so fail
        # loudly instead.
        assert train009.index.equals(train_part.index) and \
            (train009['id'].values == train_part['id'].values).all(), \
            'FE009 train rows and {} are not aligned for type {}'.format(part_path, t)

        # Only add columns that are not already present (from FE009 or an
        # earlier checkpoint).
        new_cols = [col for col in train_part.columns if col not in train009.columns]
        train009 = pd.concat([train009, train_part[new_cols]], sort=False, axis=1)

    # Bond flags are missing where the neighbour does not exist; treat as False.
    bool_cols = [col for col in train009.columns if 'is_bond_' in col]
    train009[bool_cols] = train009[bool_cols].fillna(False)
    train009.to_parquet('../data/FE010-train-{}.parquet'.format(t))

100%|██████████| 8/8 [08:03<00:00, 63.64s/it]


# Merge Test and save

In [1]:
import pandas as pd
from tqdm import tqdm

types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN']
test_part_paths = ['FE010-temp/test_part1.parquet',
                   'FE010-temp/test_part2.parquet',
                   'FE010-temp/test_part3.parquet',
                   'FE010-temp/test_part4.parquet']

# Build one test file per coupling type: FE009 features plus every FE010
# column that FE009 does not already have.
for t in tqdm(types):
    test009 = pd.read_parquet('../data/FE009_test_pandas.parquet')
    test009 = test009.loc[test009['type'] == t]

    # Load and filter one checkpoint at a time so only a single full-size
    # checkpoint is in memory at any moment.
    for part_path in test_part_paths:
        test_part = pd.read_parquet(part_path)
        test_part = test_part.loc[test_part['type'] == t]

        # concat(axis=1) aligns rows purely by index label. If the two frames
        # are not row-for-row identical, the result is silently wrong, so fail
        # loudly instead.
        assert test009.index.equals(test_part.index) and \
            (test009['id'].values == test_part['id'].values).all(), \
            'FE009 test rows and {} are not aligned for type {}'.format(part_path, t)

        # Only add columns that are not already present (from FE009 or an
        # earlier checkpoint).
        new_cols = [col for col in test_part.columns if col not in test009.columns]
        test009 = pd.concat([test009, test_part[new_cols]], sort=False, axis=1)

    # Bond flags are missing where the neighbour does not exist; treat as False.
    bool_cols = [col for col in test009.columns if 'is_bond_' in col]
    test009[bool_cols] = test009[bool_cols].fillna(False)
    test009.to_parquet('../data/FE010-test-{}.parquet'.format(t))

100%|██████████| 8/8 [03:53<00:00, 30.68s/it]


In [13]:
# Full column list of the last per-type test frame built by the loop above.
list(test009.columns)

['id',
 'molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'atom0_atomic_mass',
 'atom0_atomic_number',
 'exact_mass_x',
 'atom0_valence',
 'atom0_spin_multiplicity',
 'atom1_atomic_mass',
 'atom1_atomic_number',
 'exact_mass_y',
 'atom1_valence',
 'atom1_spin_multiplicity',
 'left_middle_average_angle',
 'right_middle_average_angle',
 'distance',
 'is_bond',
 'closest_to_0',
 '2nd_closest_to_0',
 '3rd_closest_to_0',
 '4th_closest_to_0',
 '5th_closest_to_0',
 '6th_closest_to_0',
 '7th_closest_to_0',
 '8th_closest_to_0',
 '9th_closest_to_0',
 '10th_closest_to_0',
 'closest_to_1',
 '2nd_closest_to_1',
 '3rd_closest_to_1',
 '4th_closest_to_1',
 '5th_closest_to_1',
 '6th_closest_to_1',
 '7th_closest_to_1',
 '8th_closest_to_1',
 '9th_closest_to_1',
 '10th_closest_to_1',
 'is_closest_pair',
 'distance_closest_to_0',
 'is_bond_closest_to_0',
 'distance_2nd_closest_to_0',
 'is_bond_2nd_closest_to_0',
 'distance_3rd_closest_to_0',
 'is_bond_3rd_closest_to_

# Merge with FE009

In [None]:
# test_FE009 = pd.read_parquet('')
# train_FE009 =