In [None]:
import pycuda.driver as driver
import pycuda.autoinit
from pycuda.compiler import SourceModule

# Compile a minimal "hello" kernel with PyCUDA.
#
# The source is a *raw* string on purpose: in a normal Python string the "\n"
# inside the printf format would become a real line break before nvcc sees it,
# splitting the C string literal across two lines ("missing closing quote").
mod = SourceModule(r"""
  #include <stdio.h>  /* declares printf, as in PyCUDA's own printf example */

  __global__ void hello_thread()
  {
    /* Global thread index: position in block + block offset. */
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Hello from thread %d\n", tid);
  }

""")

func = mod.get_function("hello_thread")

# 2 blocks x 4 threads = 8 threads in total.
func(block=(4,1,1), grid=(2,1))

# Wait for the kernel to finish so its printf buffer gets flushed.
# The driver module is imported under the name `driver` in this cell
# (the previous `drv.` spelling was an unbound name).
# NOTE(review): device printf goes to the process-level stdout; some notebook
# front ends may not show it in the cell output - confirm in your environment.
driver.Context.synchronize()

ModuleNotFoundError: No module named 'pycuda'

In [None]:
import cupy as cp

# CUDA C source for the kernel, kept in its own name so the compile step
# below reads as a single call. Raw string: the "\n" must reach the CUDA
# compiler as an escape sequence, not as a literal line break.
kernel_source = r'''
extern "C" __global__
void hello_kernel() {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Hello from thread %d\n", tid);
}
'''

# Build the module and look the kernel up by its C symbol name.
mod = cp.RawModule(code=kernel_source)
hello_kernel = mod.get_function("hello_kernel")

# Launch configuration: 2 blocks of 4 threads each.
grid_dim = (2,)
block_dim = (4,)

# The kernel takes no parameters, hence the empty argument tuple.
hello_kernel(grid_dim, block_dim, ())

# Block until the device is idle so the printf output is flushed.
cp.cuda.runtime.deviceSynchronize()

In [None]:
import pycuda.driver as driver
import pycuda.autoinit
from pycuda.compiler import SourceModule

# Compile and launch a minimal "hello" kernel with PyCUDA.
#
# The source must be a *raw* string: with a plain """...""" literal Python
# converts the "\n" in the printf format into a real newline, and nvcc then
# rejects the broken C string literal with "missing closing quote".
mod = SourceModule(r"""
  #include <stdio.h>  /* declares printf, as in PyCUDA's own printf example */

  __global__ void hello_thread()
  {
    /* Global thread index: position in block + block offset. */
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Hello from thread %d\n", tid);
  }

""")

func = mod.get_function("hello_thread")

# 2 blocks x 4 threads = 8 threads in total.
func(block=(4,1,1), grid=(2,1))

# Wait for the kernel to finish so its printf buffer gets flushed before the
# cell returns.
# NOTE(review): device printf goes to the process-level stdout; some notebook
# front ends may not show it in the cell output - confirm in your environment.
driver.Context.synchronize()

CompileError: nvcc compilation of /tmp/tmp897lg2qb/kernel.cu failed
[command: nvcc --cubin -arch sm_75 -I/usr/local/lib/python3.12/dist-packages/pycuda/cuda kernel.cu]
[stderr:
kernel.cu:6:12: warning: missing terminating " character
    6 |     printf("Hello from thread %d
      |            ^
kernel.cu:7:1: warning: missing terminating " character
    7 | ", tid);
      | ^
kernel.cu(6): error: missing closing quote
      printf("Hello from thread %d
             ^

kernel.cu(7): error: missing closing quote
  ", tid);
  ^

kernel.cu(8): error: expected a ")"
    }
    ^

kernel.cu(8): error: expected a ";"
    }
    ^

kernel.cu(5): warning #177-D: variable "tid" was declared but never referenced
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
          ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

4 errors detected in the compilation of "kernel.cu".
]

In [None]:
import cupy as cp

# Launch geometry, named up front: 2 blocks x 4 threads = 8 threads total.
num_blocks = 2
threads_per_block = 4

# Compile the CUDA C source. The literal is raw so that "\n" is passed
# through to the CUDA compiler unchanged.
mod = cp.RawModule(
    code=r'''
extern "C" __global__
void hello_kernel() {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Hello from thread %d\n", tid);
}
'''
)

# Fetch the compiled kernel by name.
hello_kernel = mod.get_function("hello_kernel")

# Positional order is (grid, block, args); the kernel has no arguments.
hello_kernel((num_blocks,), (threads_per_block,), ())

# Wait for the device to finish so the printf output is flushed.
cp.cuda.runtime.deviceSynchronize()

In [None]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.2.4-py3-none-any.whl.metadata (2.9 kB)
Collecting siphash24>=1.6 (from pytools>=2011.2->pycuda)
  Downloading siphash24-1.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pytools-2025.2.4-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.4/99.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading siphash24-1.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import numpy as np

# Kernel: every thread stores its own global index into out[tid].
# The results are printed from the host after copying them back, so no
# device-side printf is involved.
kernel_source = """
__global__ void hello_thread(int *out)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    out[tid] = tid;
}
"""
mod = SourceModule(kernel_source)
hello_thread = mod.get_function("hello_thread")

# Launch geometry: 2 blocks of 4 threads -> 8 threads, one int32 slot each.
block_dim = (4,1,1)
grid_dim = (2,1,1)
nthreads = 8

# Host buffer that will receive the results, and a device buffer of
# nthreads int32 values for the kernel to write into.
out_host = np.empty(nthreads, dtype=np.int32)
out_gpu = drv.mem_alloc(nthreads * np.int32().nbytes)

# Run the kernel, then copy the device buffer back into the host array.
hello_thread(out_gpu, block=block_dim, grid=grid_dim)
drv.memcpy_dtoh(out_host, out_gpu)

# One greeting per recorded thread index.
for tid in out_host:
    print(f"Hello from thread {tid}")


Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7


In [None]:
import cupy as cp
import time

# Time an element-wise addition of two 10M-element float32 vectors on the GPU.
n = 10_000_000
a = cp.arange(n, dtype=cp.float32)
b = cp.arange(n, dtype=cp.float32)

# CuPy launches GPU work asynchronously. Drain anything still queued from the
# array construction above so the timed region covers only the addition.
cp.cuda.runtime.deviceSynchronize()

# perf_counter is the monotonic, high-resolution clock intended for measuring
# short intervals like this one.
start = time.perf_counter()
c = a + b
# Wait for the addition kernel to finish before stopping the clock.
cp.cuda.runtime.deviceSynchronize()
end = time.perf_counter()

# Show a small slice of the result as a sanity check (never the full array).
print("First 5 results:", c[:5])
print("Time taken:", end - start, "seconds")
# NOTE(review): the first execution of this cell may also include one-off
# kernel compilation/caching cost - re-run the cell for a steady-state figure.

First 5 results: [0. 2. 4. 6. 8.]
Time taken: 0.0006968975067138672 seconds


In [None]:
import time

# CPU baseline: element-wise addition of two 10M-element Python lists.
n = 10_000_000

a = list(range(n))
b = list(range(n))

# perf_counter is the monotonic, high-resolution clock intended for timing.
start = time.perf_counter()

# Build the full result vector. The previous loop body `c = [a[i] + b[i]]`
# rebound `c` to a fresh one-element list on every iteration, so only the
# last sum survived and the element-wise result was never produced.
c = [x + y for x, y in zip(a, b)]

end = time.perf_counter()

print("Time taken:", end - start, "seconds")

Time taken: 1.906116008758545 seconds


In [None]:
import cupy as cp
import numpy as np
import time

# Random 4096x4096 uint8 matrix standing in for an image, created on the host.
IMAGE_SHAPE = (4096, 4096)
image_matrix = np.random.randint(0, 256, size=IMAGE_SHAPE, dtype=np.uint8)

# Copy the matrix to the GPU.
d_image = cp.array(image_matrix)

# Timed region: the element-wise inversion plus the wait for the device to
# finish it.
start = time.time()
d_image = 255 - d_image
cp.cuda.runtime.deviceSynchronize()
end = time.time()
elapsed = end - start

# Bring the inverted matrix back to host memory for display.
inverted = cp.asnumpy(d_image)

for label, matrix in (("Original:\n", image_matrix), ("Inverted:\n", inverted)):
    print(label, matrix)
print("Execution time:", elapsed, "seconds")

Original:
 [[ 45 126  19 ...  90   8 249]
 [ 38 167  53 ...  22  85 230]
 [200 131 247 ... 155 227 163]
 ...
 [125  87 236 ...  13 234 171]
 [176  84  82 ... 119 165 237]
 [ 57  14 114 ...  59  42   1]]
Inverted:
 [[210 129 236 ... 165 247   6]
 [217  88 202 ... 233 170  25]
 [ 55 124   8 ... 100  28  92]
 ...
 [130 168  19 ... 242  21  84]
 [ 79 171 173 ... 136  90  18]
 [198 241 141 ... 196 213 254]]
Execution time: 0.0022580623626708984 seconds


In [None]:
import numpy as np
import time

# Random 4096x4096 uint8 matrix standing in for an image (CPU baseline).
IMAGE_SHAPE = (4096, 4096)
image_matrix = np.random.randint(low=0, high=256, size=IMAGE_SHAPE, dtype=np.uint8)

# Timed region: only the element-wise inversion (255 - pixel).
start = time.time()
inverted = 255 - image_matrix
end = time.time()
elapsed = end - start

# Report input, output and the measured wall-clock duration.
for label, matrix in (("Original:\n", image_matrix), ("Inverted:\n", inverted)):
    print(label, matrix)
print("Execution time:", elapsed, "seconds")

Original:
 [[ 75 167  98 ... 114 239 161]
 [239  33 134 ... 100 172  72]
 [200 149  72 ... 150  49 145]
 ...
 [ 19 180 223 ...  89 123  96]
 [135  83 153 ... 234 205  31]
 [120  20 208 ... 161 236 109]]
Inverted:
 [[180  88 157 ... 141  16  94]
 [ 16 222 121 ... 155  83 183]
 [ 55 106 183 ... 105 206 110]
 ...
 [236  75  32 ... 166 132 159]
 [120 172 102 ...  21  50 224]
 [135 235  47 ...  94  19 146]]
Execution time: 0.004038572311401367 seconds
