In [2]:
from PIL import Image
import struct
import ctypes

# Serialise the source PNG into the raw layout that image.cu reads:
# two native ints (width, height) followed by w*h 4-byte RGBA pixels,
# stored row by row.
with Image.open('shrek_krupnym_planom.png') as img:
    # Going through RGB first discards any source alpha and makes palette /
    # grayscale images work; the RGBA step then sets alpha to 255 everywhere,
    # matching the fixed opaque alpha this cell has always written.
    rgba = img.convert('RGB').convert('RGBA')
(w, h) = rgba.size
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way.
with open('in.data', 'wb') as out:
    out.write(struct.pack('ii', w, h))
    out.write(rgba.tobytes())

In [1]:
%%writefile image.cu
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call once. If the result is anything other than
// cudaSuccess, prints the source location and the runtime's error string to
// stderr and terminates the process with a non-zero status, so that shells
// and notebooks can detect the failure.
#define CSC(call)                                                       \
do {                                                                    \
    cudaError_t csc_status_ = (call);                                   \
    if (csc_status_ != cudaSuccess) {                                   \
        fprintf(stderr, "ERROR in %s:%d. Message: %s\n",                \
                __FILE__, __LINE__, cudaGetErrorString(csc_status_));   \
        exit(1);                                                        \
    }                                                                   \
} while(0)

// Writes the colour negative of a texture lookup into every pixel of `out`.
//
//   tex  - texture object sampled with tex2D<uchar4>
//   out  - destination buffer, indexed as out[y * w + x]
//   w, h - dimensions of the destination image in pixels
//
// Each thread starts at its own global (x, y) index and advances by the total
// number of launched threads along each axis, so the whole w x h image is
// covered regardless of the launch configuration.
__global__ void kernel(cudaTextureObject_t tex, uchar4 *out, int w, int h) {
    const int firstCol = threadIdx.x + blockIdx.x * blockDim.x;
    const int firstRow = threadIdx.y + blockIdx.y * blockDim.y;
    const int colStep = gridDim.x * blockDim.x;
    const int rowStep = gridDim.y * blockDim.y;

    for (int row = firstRow; row < h; row += rowStep) {
        uchar4 *dstRow = out + row * w;
        for (int col = firstCol; col < w; col += colStep) {
            // Coordinates are scaled by 3 horizontally and 4 vertically; how
            // values outside the base range are resolved is decided by the
            // addressing modes of the texture object passed in.
            // NOTE(review): these coordinates land on texel edges rather than
            // texel centres - confirm that is intended.
            const uchar4 texel = tex2D< uchar4 >(tex, 3.0 * col / w, 4.0 * row / h);
            // Invert the colour channels, keep alpha untouched.
            dstRow[col] = make_uchar4(255 - texel.x, 255 - texel.y, 255 - texel.z, texel.w);
        }
    }
}

// Reads an image from "in.data" (int width, int height, then width*height
// uchar4 pixels), uploads it into a CUDA array bound to a texture object,
// runs `kernel` over it and writes the result to "out.data" in the same
// layout. Returns 0 on success and 1 on any file or allocation error; CUDA
// failures are handled by the CSC macro.
int main() {
    int w, h;

    FILE *fp = fopen("in.data", "rb");
    if (fp == NULL) {
        perror("in.data");
        return 1;
    }
    // Reject a truncated header or non-positive dimensions before they are
    // used to size any allocation.
    if (fread(&w, sizeof(int), 1, fp) != 1 ||
        fread(&h, sizeof(int), 1, fp) != 1 ||
        w <= 0 || h <= 0) {
        fprintf(stderr, "ERROR: in.data has a missing or invalid header\n");
        fclose(fp);
        return 1;
    }

    // Do the pixel-count multiplication in size_t so it cannot overflow int.
    size_t count = (size_t)w * (size_t)h;
    uchar4 *data = (uchar4 *)malloc(sizeof(uchar4) * count);
    if (data == NULL) {
        fprintf(stderr, "ERROR: cannot allocate %zu bytes for the image\n",
                sizeof(uchar4) * count);
        fclose(fp);
        return 1;
    }
    if (fread(data, sizeof(uchar4), count, fp) != count) {
        fprintf(stderr, "ERROR: in.data holds fewer pixels than its header declares\n");
        fclose(fp);
        free(data);
        return 1;
    }
    fclose(fp);

    // Copy the host image into a 2D CUDA array that backs the texture.
    cudaArray *arr;
    cudaChannelFormatDesc ch = cudaCreateChannelDesc<uchar4>();
    CSC(cudaMallocArray(&arr, &ch, w, h));
    CSC(cudaMemcpy2DToArray(arr, 0, 0, data, w * sizeof(uchar4), w * sizeof(uchar4), h, cudaMemcpyHostToDevice));

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = arr;

    // Texture sampling setup: wrap along the first axis, mirror along the
    // second (cudaAddressModeClamp is the alternative noted by the author),
    // point filtering, raw element reads, normalized coordinates.
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeMirror; // Clamp
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = true;

    cudaTextureObject_t tex = 0;
    CSC(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));

    uchar4 *dev_out;
    CSC(cudaMalloc(&dev_out, sizeof(uchar4) * count));

    kernel<<< dim3(16, 16), dim3(32, 32) >>>(tex, dev_out, w, h);
    // Catches launch-configuration errors.
    CSC(cudaGetLastError());

    // The host buffer is reused for the result.
    CSC(cudaMemcpy(data, dev_out, sizeof(uchar4) * count, cudaMemcpyDeviceToHost));

    CSC(cudaDestroyTextureObject(tex));
    CSC(cudaFreeArray(arr));
    CSC(cudaFree(dev_out));

    fp = fopen("out.data", "wb");
    if (fp == NULL) {
        perror("out.data");
        free(data);
        return 1;
    }
    // Evaluate every step unconditionally so the file is always closed, then
    // report a failure if any of them fell short (fclose can surface a
    // deferred write error).
    int ok = fwrite(&w, sizeof(int), 1, fp) == 1;
    ok = (fwrite(&h, sizeof(int), 1, fp) == 1) && ok;
    ok = (fwrite(data, sizeof(uchar4), count, fp) == count) && ok;
    ok = (fclose(fp) == 0) && ok;

    free(data);
    if (!ok) {
        fprintf(stderr, "ERROR: failed to write out.data\n");
        return 1;
    }
    return 0;
}

Writing image.cu


In [14]:
%%shell
# Run the binary only if compilation succeeded, so a stale a.out from an
# earlier build is never executed after a failed compile.
nvcc -arch=sm_75 image.cu && ./a.out



In [3]:
from PIL import Image
import struct
import ctypes

# Rebuild a PNG from the raw dump written by the CUDA program: two native
# ints (width, height) followed by w*h 4-byte RGBA pixels, row by row.
with open('out.data', 'rb') as fin:
    (w, h) = struct.unpack('ii', fin.read(8))
    data = fin.read(4 * w * h)
# A short read means the dump is truncated; fail loudly instead of saving an
# image that is partly blank.
if len(data) != 4 * w * h:
    raise ValueError('out.data holds %d pixel bytes, expected %d' % (len(data), 4 * w * h))
# frombytes interprets the buffer as packed RGBA rows, which is the layout
# described above, so no per-pixel loop is needed.
img = Image.frombytes('RGBA', (w, h), data)
img.save('out.png')

# debug cuda

**python with numpy and shrek**