In [1]:
from __future__ import division
import os, sys

# Make the locally checked-out packages importable: the PyHEADTAIL and
# PyPIC-experimental working copies, plus the git/ directory itself.
where = "/afs/cern.ch/user/o/oeftiger/w/private/"
for BIN in (where + "git/PyHEADTAIL/",
            where + "git/PyPIC-experimental/",
            where + "git"):
    sys.path.append(BIN)

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy.constants import e

# Seed NumPy's global random number generator so that repeated runs draw the
# same random numbers.
np.random.seed(0)

# Set before `pycuda.autoinit` is imported further down in this cell.
# NOTE(review): pycuda.autoinit is understood to choose its GPU from the
# CUDA_DEVICE environment variable -- confirm, and confirm that device 3
# exists on the machine running this notebook.
os.environ["CUDA_DEVICE"] = "3"

from pycuda.autoinit import context
from pycuda import gpuarray
from pycuda import cumath
from pycuda import driver
from pycuda.compiler import SourceModule

import PyHEADTAIL
from PyHEADTAIL import gpu

from PyCERNmachines.CERNmachines import SPS

from PyPIC import pypic, meshing
from PyPIC.poisson_solver import FD_solver, FFT_solver

PyHEADTAIL v1.4.1-15-ga0ef233505-dirty




In [3]:
# general simulation parameters
n_particles = 1024 ** 2      # number of macro-particles
nx, ny, nz = 64, 64, 32      # mesh nodes per axis

# beam parameters
intensity = 0.5 * 2.5e11
epsn_x = epsn_y = 2.5e-6
sigma_z = 0.23

In [4]:
# SPS machine model: a single segment, 'Q20-injection' configuration.
machine = SPS(
    n_segments=1,
    machine_configuration='Q20-injection',
)

Synchrotron init. From kwargs: machine_configuration = 'Q20-injection'
Synchrotron init. From kwargs: n_segments = 1


In [5]:
# Generate the macro-particle bunch from the machine object, using the
# simulation and beam parameters defined in the parameter cell.
beam = machine.generate_6D_Gaussian_bunch_matched(
    n_macroparticles=n_particles,
    intensity=intensity,
    epsn_x=epsn_x,
    epsn_y=epsn_y,
    sigma_z=sigma_z,
)

*** Maximum RMS bunch length 0.235788291837m.
... distance to target bunch length: -2.4290e-02
... distance to target bunch length: -2.4255e-02
... distance to target bunch length: -9.4232e-03
... distance to target bunch length: -5.0846e-03
... distance to target bunch length: -2.0166e-03
... distance to target bunch length: -6.6153e-04
... distance to target bunch length: -1.2571e-04
... distance to target bunch length: -9.7555e-06
... distance to target bunch length: -1.5712e-07
... distance to target bunch length: -1.9993e-10
--> Bunch length: 0.23
--> Emittance: 0.407764779947


In [6]:
# Relative safety margin: each axis is padded by this fraction of the
# absolute value of its smallest particle coordinate.
offset_part = 0.01


def _padded_axis(coords, n_nodes):
    """Return ``(origin, spacing, margin)`` of the mesh along one axis.

    coords  -- GPU array with the particle coordinates along this axis
    n_nodes -- number of mesh nodes along this axis

    The origin starts at the smallest coordinate, lowered by the margin.
    The spacing is chosen such that ``n_nodes - 3`` cells span the distance
    from that lowered origin to the largest coordinate plus twice the margin.
    Finally the origin is moved down by one more cell.
    """
    origin = gpuarray.min(coords).get()
    margin = offset_part * np.abs(origin)
    origin -= margin
    spacing = (gpuarray.max(coords).get() - origin + 2*margin) / (n_nodes - 3)
    # 1 node empty around the mesh for boundary conditions
    origin -= spacing
    return origin, spacing, margin


x0, dx, offset_x = _padded_axis(beam.x, nx)
y0, dy, offset_y = _padded_axis(beam.y, ny)
z0, dz, offset_z = _padded_axis(beam.z, nz)

mesh = meshing.RectMesh3D(x0, y0, z0, dx, dy, dz, nx, ny, nz, mathlib=cumath)

In [7]:
# Poisson solver operating on `mesh`. The FFT-based GPU solver is the one in
# use; the finite-difference GPU solver is kept below, commented out, as an
# alternative.
# solver = FD_solver.GPUFiniteDifferencePoissonSolver(mesh, context, FD_solver.laplacian_3D_7stencil)
solver = FFT_solver.GPUFFTPoissonSolver(mesh)

In [8]:
# GPU particle-in-cell object built from the mesh, the Poisson solver and the
# PyCUDA context; all deposit / solve / interpolation calls below go through it.
pypicalg = pypic.PyPIC_GPU(mesh, solver, context)

In [9]:
%%timeit
e_x, e_y, e_z = pypicalg.pic_solve(beam.x, beam.y, beam.z)

The slowest run took 9.56 times longer than the fastest. This could mean that an intermediate result is being cached 
1 loops, best of 3: 180 ms per loop


In [10]:
# Handle to the compiled module exposed by PyHEADTAIL's gpu.thrust_interface;
# the sort / bound-search functions used further down are looked up on it.
mod = gpu.thrust_interface.compiled_module

In [11]:
# %%timeit -n 1
# idx = gpuarray.zeros(n_particles, dtype=np.int32)
# mod.get_sort_perm_int(mesh.get_node_ids(beam.x, beam.y, beam.z), idx)
# beam.reorder(idx)

In [12]:
# %%timeit
# e_x, e_y, e_z = pypicalg.pic_solve(beam.x, beam.y, beam.z)

# Timing sorting solution

In [13]:
%%timeit
# Time the particle-to-mesh deposit on its own (particles not yet sorted).
# `rho` is not used afterwards; only the duration of the call matters here.
rho = pypicalg.particles_to_mesh(beam.x, beam.y, beam.z, charge=1)
# Block until the GPU has finished so that the timing covers the whole deposit.
context.synchronize()

10 loops, best of 3: 91.7 ms per loop


In [14]:
# Run the PIC steps one by one to obtain the mesh fields needed by the
# field_to_particles timings: deposit -> Poisson solve -> electric fields.
# Here particles_to_mesh is called without the `charge` keyword.
mesh_charges_atomicadd = pypicalg.particles_to_mesh(beam.x, beam.y, beam.z)
phi = pypicalg.poisson_solve(mesh_charges_atomicadd)
mesh_e_fields = pypicalg.get_electric_fields(phi)
# Make sure all of the above has completed on the GPU before moving on.
context.synchronize()

In [15]:
%%timeit
pypicalg.field_to_particles(*zip(list(mesh_e_fields), [beam.x, beam.y, beam.z]))

10 loops, best of 3: 49.6 ms per loop


In [16]:
# Reference mesh charges from the plain particles_to_mesh deposit; compared
# against the result of the sorted deposit further down.
mesh_charges_atomicadd = pypicalg.particles_to_mesh(beam.x, beam.y, beam.z)
# Disabled: flattening of the reference charges to a 1D array of mesh.n_nodes entries.
# mesh_charges_atomicadd = mesh_charges_atomicadd.reshape((1, 1, mesh.n_nodes))[0,0]

In [17]:
# Short module-level aliases for the three functions of the compiled module
# that the sorted deposit relies on.
get_sort_perm_int, lower_bound_int, upper_bound_int = (
    mod.get_sort_perm_int,
    mod.lower_bound_int,
    mod.upper_bound_int,
)

In [24]:
%%timeit
# Time the complete sorted deposit: sort permutation, particle reordering,
# per-node index bounds and the deposit itself.
# The results are declared global so that they remain available to later
# cells after %%timeit has finished.
global mesh_charges, lower_bounds, upper_bounds
# Output buffer for the sort permutation, one int32 entry per macro-particle.
idx = gpuarray.zeros(n_particles, dtype=np.int32)
# NOTE(review): presumably fills `idx` with the permutation that sorts the
# particles by mesh node id -- confirm against the thrust interface.
get_sort_perm_int(mesh.get_node_ids(beam.x, beam.y, beam.z), idx)
# Reorders the beam in place, so from the second timeit loop onwards the
# input is no longer in its original order.
beam.reorder(idx)
# Node ids are computed again because the particle order has just changed.
node_ids = mesh.get_node_ids(beam.x, beam.y, beam.z)
lower_bounds = gpuarray.empty(mesh.n_nodes, dtype=np.int32)
upper_bounds = gpuarray.empty(mesh.n_nodes, dtype=np.int32)
# One query value per mesh node: 0, 1, ..., mesh.n_nodes - 1.
seq = gpuarray.arange(mesh.n_nodes, dtype=np.int32)
# NOTE(review): presumably writes, for every node id in `seq`, the first and
# one-past-last particle index holding that id in `node_ids` -- confirm.
lower_bound_int(node_ids, seq, lower_bounds)
upper_bound_int(node_ids, seq, upper_bounds)
mesh_charges = pypicalg.sorted_particles_to_mesh(
    beam.x, beam.y, beam.z, 
    lower_bounds=lower_bounds, upper_bounds=upper_bounds
)
# Block until the GPU has finished so that the timing covers all of the above.
context.synchronize()

10 loops, best of 3: 26.8 ms per loop


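Compare this to the ~90.7 ms of the atomicAdd version timed above (the plain `particles_to_mesh` deposit). Timings fluctuate by about 1 ms between runs, so the cell outputs shown may differ slightly from the numbers quoted in the text.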

In [19]:
# Sanity check: the sorted deposit must give the same mesh charges as the
# reference deposit (both copied to the host and compared within
# np.allclose's default tolerances).
np.allclose(mesh_charges.get(), mesh_charges_atomicadd.get())

True

In [20]:
# Speed-up of the sorted deposit over the atomicAdd deposit. Both timings
# (in ms) are typed in by hand, presumably from an earlier run.
90.7/25.8

3.5155038759689923

In [21]:
%%timeit
pypicalg.field_to_particles(*zip(list(mesh_e_fields), [beam.x, beam.y, beam.z]))

10 loops, best of 3: 39 ms per loop


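Compare this to the ~48.7 ms that the same `field_to_particles` call took before the particles were sorted (see the timing further above; run-to-run fluctuations are about 1 ms).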

In [22]:
# Speed-up of field_to_particles on sorted over unsorted particles. Both
# timings (in ms) are typed in by hand, presumably from an earlier run.
48.7/38.9

1.2519280205655527

# Timing full pic_solve

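Baseline: ~180 ms for the full `pic_solve` with the FFT solver on unsorted particles (see the first `pic_solve` timing near the top of the notebook).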

In [25]:
%%timeit
e_x, e_y, e_z = pypicalg.pic_solve(beam.x, beam.y, beam.z, lower_bounds=lower_bounds, upper_bounds=upper_bounds)

10 loops, best of 3: 87.1 ms per loop
