This is continuation of this kernel:
https://www.kaggle.com/seriousran/just-speed-up-calculate-distance-from-benchmark

simple numpy vectorized function gives another 30% +/- performance boost.

Added even faster method with np.einsum from this kernel:
https://www.kaggle.com/rakibilly/faster-distance-calculation-from-benchmark


In [1]:
import numpy as np # linear algebra
import scipy as sp
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os
print(os.listdir("../input"))

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('../input/train.csv', index_col='id')
test = pd.read_csv('../input/test.csv', index_col='id')

structures = pd.read_csv('../input/structures.csv')

def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

['dipole_moments.csv', 'potential_energy.csv', 'magnetic_shielding_tensors.csv', 'structures.csv', 'scalar_coupling_contributions.csv', 'structures', 'sample_submission.csv', 'mulliken_charges.csv', 'test.csv', 'train.csv']


  mask |= (ar1 == a)


# Calculate distance

In [2]:
%%time
# Engineer a single feature: distance vector between atoms
#  (there's ways to speed this up!)

def dist(row):
    return ( (row['x_1'] - row['x_0'])**2 +
             (row['y_1'] - row['y_0'])**2 +
             (row['z_1'] - row['z_0'])**2 ) ** 0.5

#train['dist'] = train.apply(lambda x: dist(x), axis=1)
#test['dist'] = test.apply(lambda x: dist(x), axis=1)
# 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.5 µs


In [3]:
%%time
# This block is SPPED UP

train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

tr_a_min_b = train_p_0 - train_p_1
te_a_min_b = test_p_0 - test_p_1

CPU times: user 168 ms, sys: 164 ms, total: 332 ms
Wall time: 330 ms


In [14]:
%%time
train['dist_np_linalg'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist_np_linalg'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

CPU times: user 160 ms, sys: 288 ms, total: 448 ms
Wall time: 448 ms


In [12]:
%%time
# python vectorized: even faster!
train['dist_numpy'] = np.sqrt(np.sum((train_p_1 - train_p_0)**2, axis=1)) 
test['dist_numpy'] = np.sqrt(np.sum((test_p_1 - test_p_0)**2, axis=1))

CPU times: user 152 ms, sys: 188 ms, total: 340 ms
Wall time: 340 ms


In [9]:
%%time
train['dist_einsum'] = np.sqrt(np.einsum('ij,ij->i', tr_a_min_b, tr_a_min_b))
test['dist_einsum'] = np.sqrt(np.einsum('ij,ij->i', te_a_min_b, te_a_min_b))

CPU times: user 96 ms, sys: 88 ms, total: 184 ms
Wall time: 180 ms


In [7]:
train.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist_speedup,dist_numpy,dist_einsum
0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.091953,1.091953,1.091953
1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.78312,1.78312,1.78312
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1.783147,1.783147,1.783147
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1.783157,1.783157,1.783157
4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.091952,1.091952,1.091952
