# Speed up g(r) code

I want to compare the timings of several ways to calculate the pairwise distances between many atoms.  The straightforward approach is a double loop over all atom coordinates, visiting every pair of atoms once.

In [1]:
from sasmol import sasmol

In [2]:
import numpy as np

In [3]:
from scipy.spatial.distance import pdist

In [4]:
from distance import distance

In [5]:
import dask.array as da

In [6]:
# Input file names: the PDB is read right here, the DCD trajectory is
# opened separately by each of the cells further down.
pdb_fname, dcd_fname = 'test.pdb', 'test.dcd'

# Create the molecule object and load the PDB file into it.
mol = sasmol.SasMol(0)
mol.read_pdb(pdb_fname)

reading filename:  test.pdb
num_atoms =  2048
>>> found  1  model(s) or frame(s)
finished reading frame =  1


In [None]:
# Confirm the loop indexing: the nested loops should visit every (j, k)
# pair with j < k exactly once, i.e. mark the strict upper triangle of d.
n = 3
d = np.zeros([n, n])
# The loop bounds follow n (they used to be hard-coded to 3), so changing n
# really changes the size of the check.  range() behaves the same as
# xrange() here and also runs on Python 3.
for j in range(n):
    for k in range(j + 1, n):
        d[j, k] = 1
print(d)

In [None]:
# Sanity check: the distance accumulated one component at a time and the
# vectorised NumPy expression should print the same number.
c1 = np.arange(3)
c2 = np.arange(3)/10.0
# component-by-component sum of squared differences
dsum = np.sqrt(sum((c1[axis] - c2[axis]) ** 2 for axis in range(3)))
# whole-array version of the same quantity
nsum = np.sqrt(((c1 - c2) ** 2).sum())
print(dsum, nsum)

In [None]:
%%timeit
# ignorant python
# Baseline timing: each pair distance is computed individually inside a
# pure-Python double loop.  The whole cell is timed, so opening/closing the
# DCD file and the two prints are part of the measurement.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
# Only a single frame is processed; the commented-out line above would take
# the count from dcd_file[2] instead.
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

# One n_atoms x n_atoms matrix per frame, initialised to zero.
dist = np.zeros([n_frames, n_atoms, n_atoms])
for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    # presumably the (n_atoms, 3) coordinates of the step just read -- TODO confirm
    coor = mol.coor()[0]
    for j in xrange(n_atoms):
        # k starts at j+1, so only entries with k > j are written; the
        # diagonal and the lower triangle keep their initial zeros.
        for k in xrange(j+1, n_atoms):
            dist[i, j, k] = np.sqrt(np.sum((coor[j] - coor[k]) ** 2))
            
mol.close_dcd_read(dcd_file[0])

In [None]:
# ignorant python
# Un-timed run of the pair-by-pair calculation so that the result can be
# inspected afterwards: dp keeps the 4 x 4 corner of the first frame.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
n_frames = 1  # a single frame only (alternative: n_frames = dcd_file[2])

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])
for frame in xrange(n_frames):
    mol.read_dcd_step(dcd_file, frame)
    coor = mol.coor()[0]
    for first in xrange(n_atoms):
        # second > first: only the strict upper triangle is filled in
        for second in xrange(first + 1, n_atoms):
            delta = coor[first] - coor[second]
            dist[frame, first, second] = np.sqrt((delta ** 2).sum())

mol.close_dcd_read(dcd_file[0])

dp = dist[0, :4, :4]

In [None]:
# Display the 4 x 4 corner of the pure-Python distance matrix.
dp

In [None]:
%%timeit
# numpy version
# Timing of the broadcasting approach: all pair differences are formed in a
# single array expression.  The whole cell is timed, file handling and
# prints included.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
# Only a single frame is processed; the commented-out line above would take
# the count from dcd_file[2] instead.
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])
for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    coor = mol.coor()[0]
    # coor[:, None, :] - coor broadcasts every row against every other row;
    # summing the squares over the last axis yields the full square matrix
    # (both triangles and the diagonal), unlike the upper-triangle-only loops.
    dist[i] = np.sqrt(((coor[:, None, :] - coor) ** 2).sum(-1))
    
mol.close_dcd_read(dcd_file[0])

In [None]:
# numpy version
# Un-timed run of the broadcasting approach; dn keeps the 4 x 4 corner of
# the first frame for comparison with the other methods.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
n_frames = 1  # a single frame only (alternative: n_frames = dcd_file[2])

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])
for frame in xrange(n_frames):
    mol.read_dcd_step(dcd_file, frame)
    coor = mol.coor()[0]
    # every row minus every other row, then reduce over the last axis
    offsets = coor[:, None, :] - coor
    dist[frame] = np.sqrt(np.sum(offsets ** 2, axis=-1))

mol.close_dcd_read(dcd_file[0])

dn = dist[0, :4, :4]

In [None]:
# Show the pure-Python corner (upper triangle only) next to the NumPy
# corner (full matrix).
dp, dn

In [None]:
# dp only holds entries with column > row (the Python loops never write the
# diagonal or the lower triangle), whereas dn is the full square matrix.
# Zero dn's diagonal and lower triangle before differencing so the printed
# values show the real disagreement between the two methods instead of -dn
# in the lower triangle.
print(dp - np.triu(dn, 1))

Even though the NumPy version calculates the entire (symmetric) matrix rather than just the upper triangle, it is more than 2x faster than the pure-Python double loop.

In [None]:
%%timeit
# scipy version 1
# Timing of scipy's pdist, with the condensed result scattered into the
# square matrix through explicit upper-triangle indices.  The whole cell is
# timed, file handling and prints included.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
# Only a single frame is processed; the commented-out line above would take
# the count from dcd_file[2] instead.
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    coor = mol.coor()[0]
    # Row/column indices of the strict upper triangle (offset 1 skips the
    # diagonal); recomputed on every pass although they depend only on n_atoms.
    row, col = np.triu_indices(n_atoms, 1)
    # pdist returns one value per pair; they are written to the (row, col)
    # positions, leaving the diagonal and lower triangle at zero.
    dist[i, row, col] = pdist(coor)
    
mol.close_dcd_read(dcd_file[0])

In [None]:
# scipy version 1
# Un-timed run of pdist + explicit upper-triangle indices; ds1 keeps the
# 4 x 4 corner of the first frame.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
n_frames = 1  # a single frame only (alternative: n_frames = dcd_file[2])

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for frame in xrange(n_frames):
    mol.read_dcd_step(dcd_file, frame)
    coor = mol.coor()[0]
    # positions above the diagonal that receive the pair distances
    row, col = np.triu_indices(n_atoms, k=1)
    condensed = pdist(coor)
    dist[frame, row, col] = condensed

mol.close_dcd_read(dcd_file[0])

ds1 = dist[0, :4, :4]

In [None]:
%%timeit
# scipy version 2 (with boolean-indexing)
# Timing of scipy's pdist, with the condensed result scattered into the
# square matrix through a boolean upper-triangle mask.  The whole cell is
# timed, file handling and prints included.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
# Only a single frame is processed; the commented-out line above would take
# the count from dcd_file[2] instead.
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    coor = mol.coor()[0]
    r = np.arange(n_atoms)
    # r[:, None] < r is an n_atoms x n_atoms boolean array that is True where
    # the row index is smaller than the column index (strict upper triangle);
    # the pdist values are assigned to exactly those positions.
    dist[i, r[:, None] < r] = pdist(coor)
    
mol.close_dcd_read(dcd_file[0])

In [None]:
# scipy version 2 (with boolean-indexing)
# Un-timed run of pdist + boolean upper-triangle mask; ds2 keeps the 4 x 4
# corner of the first frame.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
n_frames = 1  # a single frame only (alternative: n_frames = dcd_file[2])

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for frame in xrange(n_frames):
    mol.read_dcd_step(dcd_file, frame)
    coor = mol.coor()[0]
    r = np.arange(n_atoms)
    # True wherever the column index exceeds the row index
    above_diagonal = r > r[:, None]
    dist[frame, above_diagonal] = pdist(coor)

mol.close_dcd_read(dcd_file[0])

ds2 = dist[0, :4, :4]

In [None]:
# Show the 4 x 4 corners from the Python, NumPy and both SciPy variants
# side by side.
dp, dn, ds1, ds2

In [None]:
# Difference between the pure-Python reference and each SciPy variant; all
# three fill only the strict upper triangle, so they are directly comparable.
for candidate in (ds1, ds2):
    print(dp - candidate)

In [None]:
%%timeit
# fortran version
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    coor = mol.coor()[0]
    dist[0] = distance(coor, dist[0])
    
mol.close_dcd_read(dcd_file[0])

In [None]:
# fortran version
# Un-timed run of the external `distance` routine (imported from the local
# `distance` module); df keeps the 4 x 4 corner of the first frame.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
# n_frames = dcd_file[2]
# Only a single frame is processed; the commented-out line above would take
# the count from dcd_file[2] instead.
n_frames = 1

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

dist = np.zeros([n_frames, n_atoms, n_atoms])

for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    coor = mol.coor()[0]
    # Use the loop index (this was hard-coded to 0) so that each frame is
    # written to its own slab once n_frames is raised above 1.
    # distance() receives the coordinates and the current slab; its result
    # is stored back into that slab.
    dist[i] = distance(coor, dist[i])

mol.close_dcd_read(dcd_file[0])

df = dist[0, :4, :4]

In [None]:
# Show the pure-Python corner next to the corner produced by the `distance`
# routine.
dp, df

In [None]:
# Element-wise difference between the `distance` routine and the pure-Python
# reference.  dp is zero on and below the diagonal.
# NOTE(review): whether distance() fills the same entries is not visible
# here -- confirm before reading the lower triangle as an error.
df - dp

# Now with [Dask](http://dask.pydata.org/en/latest/)

In [18]:
# Load the coordinates of every trajectory frame into memory.  The Dask and
# NumPy timing cells that follow work on this in-memory list, so file I/O
# is kept out of their measurements.
dcd_file = mol.open_dcd_read(dcd_fname)

n_atoms = mol.natoms()
n_frames = dcd_file[2]

print('number of frames: {}'.format(n_frames))
print('number of atoms: {}'.format(n_atoms))

# NOTE(review): as float64 this array needs n_frames * n_atoms**2 * 8 bytes,
# about 33.6 GB for 1002 frames of 2048 atoms -- confirm the machine can
# hold it once every frame has been written.
dist = np.zeros([n_frames, n_atoms, n_atoms])
coor = []
for i in xrange(n_frames):
    mol.read_dcd_step(dcd_file, i)
    # Keep an independent copy of this frame.  If coor() hands back a view
    # of a buffer that read_dcd_step() overwrites in place, appending the
    # view itself would leave every list entry pointing at the last frame.
    coor.append(np.array(mol.coor()[0]))

mol.close_dcd_read(dcd_file[0])

number of frames: 1002
number of atoms: 2048
...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [None]:
%%timeit
# Dask variant: the stacked coordinates are wrapped in a dask array chunked
# one frame at a time along the first axis, and each frame is handed to pdist.
d_coor = da.from_array(np.array(coor), chunks=(1, n_atoms, 3))

for i in xrange(n_frames):
    # r and the boolean mask below depend only on n_atoms but are rebuilt
    # for every frame, so their cost is part of the measured time.
    r = np.arange(n_atoms)
    # NOTE(review): pdist is given a dask slice here; presumably it is
    # converted to a NumPy array on every iteration -- confirm.
    dist[i, r[:, None] < r] = pdist(d_coor[i])
    
# NOTE(review): names assigned inside a %%timeit cell are presumably not
# kept in the notebook namespace, so dd may be unavailable afterwards -- confirm.
dd = dist[0, :4, :4]

In [16]:
%%timeit
# Plain NumPy counterpart of the Dask cell: the frames are stacked into one
# array and each frame is handed to pdist.
n_coor = np.array(coor)

for i in xrange(n_frames):
    # r and the boolean mask below depend only on n_atoms but are rebuilt
    # for every frame, so their cost is part of the measured time.
    r = np.arange(n_atoms)
    # strict-upper-triangle mask (row index < column index) receives the
    # condensed pdist output
    dist[i, r[:, None] < r] = pdist(n_coor[i])
    
# NOTE(review): names assigned inside a %%timeit cell are presumably not
# kept in the notebook namespace, so dd may be unavailable afterwards -- confirm.
dd = dist[0, :4, :4]

1 loop, best of 3: 411 ms per loop


In [17]:
import dask