A person in the lab had their compute job die on a signal while using the new Theano backend, libgpuarray, at the Git master branch tip as of today (SHA-1 23c8f004968f2d7703612f2ebb0e2da1b112bc7d). I recognized the signal code as SIGSEGV, then asked for and was provided with a truncated-at-3MB core file and the libgpuarray.so.2.0 file.
rax 0x0 0
rbx 0x7ffdc3e10310 140727889756944
rcx 0x997cc9c0 2575092160
rdx 0x1 1
rsi 0x1 1
rdi 0x0 0
rbp 0x0 0x0
rsp 0x7ffdc3e101b0 0x7ffdc3e101b0
r8 0xc01ec9 12590793
r9 0x997cc970 2575092080
r10 0xc01ec8 12590792
r11 0x1 1
r12 0x0 0
r13 0x0 0
r14 0x0 0
r15 0x99d65de0 2580962784
rip 0x7f6fbdb11187 0x7f6fbdb11187
eflags 0x10206 [ PF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
st0 0 (raw 0x00000000000000000000)
st1 0 (raw 0x00000000000000000000)
st2 0 (raw 0x00000000000000000000)
st3 0 (raw 0x00000000000000000000)
st4 0 (raw 0x00000000000000000000)
st5 0.010000000000000000208166817117216851 (raw 0x3ff8a3d70a3d70a3d800)
st6 0.010000000000000000208166817117216851 (raw 0x3ff8a3d70a3d70a3d800)
st7 0.010000000000000000208166817117216851 (raw 0x3ff8a3d70a3d70a3d800)
fctrl 0x37f 895
fstat 0x0 0
ftag 0xffff 65535
fiseg 0x7f6f 32623
fioff 0xd02674d8 -802786088
foseg 0x7ffd 32765
fooff 0xc3e4e388 -1008409720
fop 0x0 0
mxcsr 0x1fa2 [ DE PE IM DM ZM OM UM PM ]
ymm0 {v8_float = {0x4, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x2, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x80, 0x40, 0x0, 0x0, 0x0, 0x8, 0x80, 0x7, 0x0, 0x5b, 0x0, 0x20, 0x0, 0x40, 0x0 <repeats 16 times>}, v16_int16 = {0x0, 0x4080, 0x0, 0x800, 0x780, 0x5b00, 0x2000, 0x4000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x40800000, 0x8000000, 0x5b000780, 0x40002000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x800000040800000, 0x400020005b000780, 0x0, 0x0}, v2_int128 = {0x400020005b0007800800000040800000, 0x00000000000000000000000000000000}}
ymm1 {v8_float = {0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x4, 0x0, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc, 0x0 <repeats 17 times>}, v16_int16 = {0x0, 0x0, 0x0, 0x4010, 0x0, 0x0, 0x0, 0xc, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x0, 0x40100000, 0x0, 0xc0000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x4010000000000000, 0xc000000000000, 0x0, 0x0}, v2_int128 = {0x000c0000000000004010000000000000, 0x00000000000000000000000000000000}}
ymm2 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm3 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x13, 0x0, 0x80, 0x4, 0x0 <repeats 28 times>}, v16_int16 = {0x13, 0x480, 0x0 <repeats 14 times>}, v8_int32 = {0x4800013, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x4800013, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000004800013, 0x00000000000000000000000000000000}}
ymm4 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x90, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x30, 0x0 <repeats 23 times>}, v16_int16 = {0x290, 0x0, 0x0, 0x0, 0x30, 0x0 <repeats 11 times>}, v8_int32 = {0x290, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x290, 0x30, 0x0, 0x0}, v2_int128 = {0x00000000000000300000000000000290, 0x00000000000000000000000000000000}}
ymm5 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x5c, 0xb, 0x13, 0x0, 0x0, 0x0, 0x40, 0xe, 0x4c, 0xb, 0x13, 0x0 <repeats 19 times>}, v16_int16 = {0x0, 0xb5c, 0x13, 0x0, 0xe40, 0xb4c, 0x13, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0xb5c0000, 0x13, 0xb4c0e40, 0x13, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x130b5c0000, 0x130b4c0e40, 0x0, 0x0}, v2_int128 = {0x000000130b4c0e40000000130b5c0000, 0x00000000000000000000000000000000}}
ymm6 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm7 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm8 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm9 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm10 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm11 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm12 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, 0xff, 0x0 <repeats 22 times>}, v16_int16 = {0x0, 0x0, 0xff00, 0x0, 0xff00, 0x0 <repeats 11 times>}, v8_int32 = {0x0, 0xff00, 0xff00, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xff0000000000, 0xff00, 0x0, 0x0}, v2_int128 = {0x000000000000ff000000ff0000000000, 0x00000000000000000000000000000000}}
ymm13 {v8_float = {0x0, 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0xfb, 0xb4, 0x35, 0x3b, 0x86, 0x10, 0xae, 0xbf, 0x0 <repeats 24 times>}, v16_int16 = {0xb4fb, 0x3b35, 0x1086, 0xbfae, 0x0 <repeats 12 times>}, v8_int32 = {0x3b35b4fb, 0xbfae1086, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xbfae10863b35b4fb, 0x0, 0x0, 0x0}, v2_int128 = {0x0000000000000000bfae10863b35b4fb, 0x00000000000000000000000000000000}}
ymm14 {v8_float = {0x0, 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0xfb, 0xb4, 0x35, 0x3b, 0x86, 0x10, 0xae, 0xbf, 0x0 <repeats 24 times>}, v16_int16 = {0xb4fb, 0x3b35, 0x1086, 0xbfae, 0x0 <repeats 12 times>}, v8_int32 = {0x3b35b4fb, 0xbfae1086, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xbfae10863b35b4fb, 0x0, 0x0, 0x0}, v2_int128 = {0x0000000000000000bfae10863b35b4fb, 0x00000000000000000000000000000000}}
ymm15 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x54, 0xdc, 0x1c, 0x1a, 0xe1, 0x6b, 0x5f, 0xbc, 0x0 <repeats 24 times>}, v16_int16 = {0xdc54, 0x1a1c, 0x6be1, 0xbc5f, 0x0 <repeats 12 times>}, v8_int32 = {0x1a1cdc54, 0xbc5f6be1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xbc5f6be11a1cdc54, 0x0, 0x0, 0x0}, v2_int128 = {0x0000000000000000bc5f6be11a1cdc54, 0x00000000000000000000000000000000}}
A person in the lab had their compute job die on a signal while using the new Theano backend,
libgpuarray, at the Git master branch tip as of today (SHA-1 23c8f004968f2d7703612f2ebb0e2da1b112bc7d). I recognized the signal code as SIGSEGV, then asked for and was provided with a truncated-at-3MB core file and the libgpuarray.so.2.0 file.

After some reverse engineering, I found the problem to originate in
GpuArray_reshape_inplace(), src/gpuarray_array.c:665.

Through some more reverse engineering I've been able to establish the following:
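- The segfaulting instruction was: imul (%rcx,%r10,8),%rdi
- The address it accessed was 0x9f7dc000 (rcx + 8*r10 = 0x997cc9c0 + 8*0xc01ec8). This is the beginning of a new page, likely in heap. It was probably not mapped. Due to ASLR and the truncation of the core file I can't pursue this further.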
- a->dimensions == 0x997cc9c0, in rcx.
- newdims == 0x7ffdc3e10310, in rbx.
- oj == 12590793, in r8d. It had been incremented by the instruction preceding the load that segfaulted. The segfaulting access was made with a copy of oj in r10d, made when oj held the value 12590792.
- op == 0, in rdi.
- nj == 1, in esi.
- np == 1, in rdx.
- The value of esi, and thus nj, on entry to the while (np != op) loop was probably the same as that in r11d, which currently holds 1.
- The value of rdx, and thus np, on entry to the while (np != op) loop was probably the same as that in newdims[r13], and r13 held the value 0 at the time of the crash. The register r13 is not accessed within the loop. r13d's last value was itself copied from ebp. ebp is also not accessed within the while (np != op) loop.
- The value of rdi, and thus op, on entry to the while (np != op) loop was probably the same as that in a->dimensions[r12], and r12 held the value 0 at the time of the crash. The register r12 is not accessed within the while (np != op) loop. r12d's last value was itself copied from eax. eax is also not accessed within the while (np != op) loop.
- r9 probably holds the value a->strides. This points 80 bytes before a->dimensions (0x997cc970 vs. 0x997cc9c0), so we might be able to compute an upper bound on the number of dimensions of a by taking into account internal GNU ptmalloc data structure sizes.
- r14 was probably set to 8x the value of r12 before entry to the loop. Since r12 is 0, so is r14.
- Within GpuArray_reshape_inplace(), r15 is used most commonly to hold malloc()'ed pointers. At the moment of the crash it held 0x99d65de0, which does look like a heap address.

Opinion
I think that somehow
op turned zero, whether by corrupt input, integer overflow or logic error. From that moment on the function was doomed: with op == 0 and np == 1, multiplying op by further elements can never make it equal to np, so the while (np != op) loop never terminates. It kept on multiplying all memory contents together until it hit the end of the heap and died.

The loop that segfaulted in
GpuArray_reshape_inplace() does not defend itself against this possibility, and more generally GpuArray_reshape_inplace() is spaghetti code. It should be refactored.

Appendix
The contents of the registers at the moment of the crash were: