<a href="https://colab.research.google.com/github/Rontim/GPU-Parallel-Processing-AI/blob/main/gpu_programming/gpu_detect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# 🧠 Phase 1: GPU Detection & Benchmarking

In [1]:
import numpy as np
import cupy as cp
import time

In [8]:
def _decode_prop(raw):
    """Return ``raw`` as text: UTF-8 first, Latin-1 if that fails.

    Values that are not ``bytes`` are returned unchanged.
    """
    if not isinstance(raw, bytes):
        return raw
    try:
        return raw.decode()
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a code point, so this cannot fail.
        return raw.decode('latin-1')


def detect_gpu():
    """Print how many CUDA devices CuPy sees and every property of each one.

    For each device index reported by ``cp.cuda.runtime.getDeviceCount()``
    the property dict from ``cp.cuda.runtime.getDeviceProperties(i)`` is
    printed as ``key: value`` lines, preceded by a ``GPU <i>: <name>``
    headline. ``bytes`` values (including the device name used in the
    headline) are decoded with ``_decode_prop`` so a non-UTF-8 value can no
    longer abort the listing.

    Returns:
        None. All results are written to standard output.
    """
    num_gpus = cp.cuda.runtime.getDeviceCount()
    print(f"\nNumber of CUDA-enabled GPUs detected: {num_gpus}")
    for i in range(num_gpus):
        props = cp.cuda.runtime.getDeviceProperties(i)
        print(f"\nGPU {i}: {_decode_prop(props['name'])}")
        for key, val in props.items():
            if isinstance(val, bytes):
                val = _decode_prop(val)
            print(f"{key}: {val}")

In [9]:
# Enumerate the CUDA devices CuPy can see and print every property of each.
detect_gpu()


Number of CUDA-enabled GPUs detected: 1

GPU 0: Tesla T4
name: Tesla T4
totalGlobalMem: 15828320256
sharedMemPerBlock: 49152
regsPerBlock: 65536
warpSize: 32
maxThreadsPerBlock: 1024
maxThreadsDim: (1024, 1024, 64)
maxGridSize: (2147483647, 65535, 65535)
clockRate: 1590000
totalConstMem: 65536
major: 7
minor: 5
textureAlignment: 512
texturePitchAlignment: 32
multiProcessorCount: 40
kernelExecTimeoutEnabled: 0
integrated: 0
canMapHostMemory: 1
computeMode: 0
maxTexture1D: 131072
maxTexture2D: (131072, 65536)
maxTexture3D: (16384, 16384, 16384)
concurrentKernels: 1
ECCEnabled: 1
pciBusID: 0
pciDeviceID: 4
pciDomainID: 0
tccDriver: 0
memoryClockRate: 5001000
memoryBusWidth: 256
l2CacheSize: 4194304
maxThreadsPerMultiProcessor: 1024
isMultiGpuBoard: 0
cooperativeLaunch: 1
cooperativeMultiDeviceLaunch: 1
deviceOverlap: 1
maxTexture1DMipmap: 32768
maxTexture1DLinear: 268435456
maxTexture1DLayered: (32768, 2048)
maxTexture2DMipmap: (32768, 32768)
maxTexture2DLinear: (131072, 65000, 2097120)


## 🧪 CPU vs GPU Benchmark

In [18]:
def benchmark_matrix_multiplication(n=8000, seed=None):
    """Time an ``n`` x ``n`` matrix product on the CPU (NumPy) and GPU (CuPy).

    Two random ``(n, n)`` matrices are generated on the host, multiplied with
    NumPy, copied to the GPU with ``cp.asarray`` and multiplied again with
    CuPy. Both timings and the largest absolute element-wise difference
    between the two results are printed. Host-to-device copies are not part
    of the GPU timing.

    Args:
        n: Side length of the square matrices.
        seed: Optional seed for ``np.random.default_rng`` so that the inputs
            are reproducible. ``None`` (the default) draws fresh random data
            on every call.

    Returns:
        None. All results are written to standard output.
    """
    print("\n⚙️ Running matrix multiplication of size", n)

    # CPU version (NumPy)
    rng = np.random.default_rng(seed)
    A_cpu = rng.random((n, n))
    B_cpu = rng.random((n, n))

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start_cpu = time.perf_counter()
    C_cpu = A_cpu @ B_cpu
    cpu_time = time.perf_counter() - start_cpu
    print(f"🧠 CPU (NumPy) time: {cpu_time:.4f} seconds")

    # GPU version (CuPy)
    A_gpu = cp.asarray(A_cpu)
    B_gpu = cp.asarray(B_cpu)

    # Synchronize the *current* device (the one the arrays were just placed
    # on) rather than a hard-coded device 0.
    device = cp.cuda.Device()

    # Untimed warm-up on a tiny slice so that one-time initialisation
    # triggered by the first GPU matmul is not charged to the measured run.
    _ = A_gpu[:2, :2] @ B_gpu[:2, :2]

    device.synchronize()
    start_gpu = time.perf_counter()
    C_gpu = A_gpu @ B_gpu
    # GPU work is launched asynchronously; wait for it before stopping the clock.
    device.synchronize()
    gpu_time = time.perf_counter() - start_gpu
    print(f"⚡ GPU (CuPy) time: {gpu_time:.4f} seconds")

    # Accuracy check
    diff = np.abs(cp.asnumpy(C_gpu) - C_cpu).max()
    print(f"🔬 Max difference between CPU and GPU results: {diff}")


In [19]:
# Run the CPU-vs-GPU matrix-multiplication benchmark at its default size (n=8000).
benchmark_matrix_multiplication()


⚙️ Running matrix multiplication of size 8000
🧠 CPU (NumPy) time: 18.0746 seconds
⚡ GPU (CuPy) time: 4.1856 seconds
🔬 Max difference between CPU and GPU results: 2.6147972675971687e-11
