# Submitted By: Vishav Gupta (102497018)

1. Identify the `!`, `%`, and `%%` prefixes used in Google Colab cells

In [2]:
# A leading "!" hands the rest of the line to the system shell instead of Python.
# Here it runs `whoami`, which prints the name of the current user.

!whoami

root


In [3]:
# A single "%" starts a line magic: an IPython command that applies only to the rest of that line.
# %time runs the statement once and reports its CPU and wall-clock time.

%time my_list = [x for x in range(1000000)]

CPU times: user 18 ms, sys: 16.8 ms, total: 34.7 ms
Wall time: 34.9 ms


`%%` starts a cell magic: it must be the very first line of the cell and applies to the entire cell body. Here `%%writefile` saves the rest of the cell to `demo.txt`. (No comment may precede it, and anything placed after it becomes part of the file.)

In [4]:
%%writefile demo.txt
This is line 1.
This is line 2.
We are saving this text into a file named demo.txt!

Writing demo.txt




---



2. Identify all key nvidia-smi commands with multiple options

In [5]:
# With no options, nvidia-smi prints a summary table: driver and CUDA versions plus each GPU's temperature, power draw, memory usage and utilization.
!nvidia-smi

Sun Feb  8 18:36:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   53C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
# --version prints the NVIDIA-SMI, NVML, driver and CUDA versions.
!nvidia-smi --version

NVIDIA-SMI version  : 550.54.15
NVML version        : 550.54
DRIVER version      : 550.54.15
CUDA Version        : 12.4


In [7]:
# --list-gpus prints one line per GPU with its index, name and UUID.
!nvidia-smi --list-gpus

GPU 0: Tesla T4 (UUID: GPU-bef24976-2525-285f-3717-e615094bbb84)




---



3. Debug common CUDA errors (zero output, incorrect indexing, PTX errors)



---



4. Write a CUDA C/C++ program to demonstrate GPU kernel execution and thread indexing.

a. Launch a CUDA kernel using: 1 block and 8 threads

b. Each thread must print: Hello from GPU thread <global_thread_id>

c. Compute the global thread ID using: $\text{globalThreadId} = \text{blockIdx.x} \times \text{blockDim.x} + \text{threadIdx.x}$

d. Clearly separate: Host code (CPU) & Device code (GPU kernel)


In [8]:
%%writefile q4.cu
#include <stdio.h>
#include <cuda.h>

// Device code (GPU kernel)
// Every thread derives its global ID from its block and thread coordinates
// and prints a single greeting line containing that ID.
__global__ void hello_kernel() {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;  // global thread ID
    printf("Hello from GPU thread %d\n", tid);
}

// Host code (CPU)
// Launches hello_kernel with 1 block of 8 threads, waits for it to finish, and
// reports any CUDA error instead of unconditionally claiming success.
// Returns 0 on success and 1 if the launch or the execution failed.
int main() {
    printf("Host: Launching Kernel...\n");

    // Launch configuration is <<<blocks, threads per block>>> = <<<1, 8>>>.
    hello_kernel<<<1, 8>>>();

    // The launch itself returns nothing; a rejected launch (bad configuration,
    // binary built for the wrong architecture, ...) is only visible through
    // cudaGetLastError(). Without this check the program prints no greetings
    // yet still reports success.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Host: Kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Kernel launches are asynchronous: wait for completion so the device-side
    // printf output is flushed, and pick up any error raised during execution.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "Host: Kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("Host: Kernel finished successfully.\n");
    return 0;
}

Writing q4.cu


In [9]:
# Build for compute capability 7.5 (-arch=sm_75, matching the Tesla T4 reported by nvidia-smi)
# and run the program only if compilation succeeded, so a failed build never executes a stale ./q4.
!nvcc -arch=sm_75 q4.cu -o q4 && ./q4

Host: Launching Kernel...
Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from GPU thread 5
Hello from GPU thread 6
Hello from GPU thread 7
Host: Kernel finished successfully.




---



5. Write a CUDA program to demonstrate host and device memory separation.

a. Create an integer array of size 5 on the host (CPU).

b. Allocate corresponding memory on the device (GPU) using cudaMalloc().

c. Copy data from host to device using cudaMemcpy().

d. Launch a kernel where GPU threads print values from device memory.

e. Copy the data back from device to host and print it on CPU.

In [10]:
%%writefile q5_memory.cu
#include <stdio.h>
#include <cuda.h>

// Device code (GPU kernel)
// Each thread prints the element of d_arr located at its own global index.
// Threads whose index is not below n do nothing, so the launch may contain
// more threads than there are elements.
__global__ void print_device_memory(int *d_arr, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid >= n) {
        return;
    }

    printf("GPU (Device): Thread %d sees value %d\n", tid, d_arr[tid]);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host code (CPU)
// Demonstrates host/device memory separation: builds an array on the host,
// copies it to device memory, lets GPU threads print it, then copies it back
// and prints it on the CPU. Every CUDA call is checked; on failure the device
// buffer is released and the program returns 1. Returns 0 on success.
int main() {
    // [a] Create Host Array; n and size are derived from it so the element
    // count is stated in exactly one place.
    int h_arr[] = {10, 20, 30, 40, 50};
    const int n = sizeof(h_arr) / sizeof(h_arr[0]);
    const size_t size = sizeof(h_arr);

    int h_received[n] = {0};  // Buffer to store data copied back
    int *d_arr = NULL;        // Pointer for device memory

    // Prints the same line as before, but from the actual array contents.
    printf("CPU (Host): Original array: {");
    for (int i = 0; i < n; i++) {
        printf(i == 0 ? "%d" : ", %d", h_arr[i]);
    }
    printf("}\n");

    // [b] Allocate Device Memory
    if (!cuda_ok(cudaMalloc((void**)&d_arr, size), "cudaMalloc")) {
        return 1;
    }

    // [c] Copy Data: Host -> Device
    if (!cuda_ok(cudaMemcpy(d_arr, h_arr, size, cudaMemcpyHostToDevice),
                 "host-to-device cudaMemcpy")) {
        cudaFree(d_arr);
        return 1;
    }

    // [d] Launch Kernel (1 block, one thread per element)
    print_device_memory<<<1, n>>>(d_arr, n);

    // A rejected launch is only visible through cudaGetLastError(); stop here
    // rather than copying back data the kernel never touched.
    if (!cuda_ok(cudaGetLastError(), "kernel launch")) {
        cudaFree(d_arr);
        return 1;
    }

    // Force CPU to wait for GPU prints, and surface errors raised while the
    // kernel was running.
    if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution")) {
        cudaFree(d_arr);
        return 1;
    }

    // [e] Copy Data: Device -> Host
    if (!cuda_ok(cudaMemcpy(h_received, d_arr, size, cudaMemcpyDeviceToHost),
                 "device-to-host cudaMemcpy")) {
        cudaFree(d_arr);
        return 1;
    }

    // Print data received back on CPU
    printf("CPU (Host): Data copied back from GPU: ");
    for (int i = 0; i < n; i++) {
        printf("%d ", h_received[i]);
    }
    printf("\n");

    // Cleanup
    return cuda_ok(cudaFree(d_arr), "cudaFree") ? 0 : 1;
}

Overwriting q5_memory.cu


In [11]:
# Build for compute capability 7.5 (-arch=sm_75, matching the Tesla T4 reported by nvidia-smi)
# and run the program only if compilation succeeded, so a failed build never executes a stale ./q5_memory.
!nvcc -arch=sm_75 q5_memory.cu -o q5_memory && ./q5_memory

CPU (Host): Original array: {10, 20, 30, 40, 50}
GPU (Device): Thread 0 sees value 10
GPU (Device): Thread 1 sees value 20
GPU (Device): Thread 2 sees value 30
GPU (Device): Thread 3 sees value 40
GPU (Device): Thread 4 sees value 50
CPU (Host): Data copied back from GPU: 10 20 30 40 50 




---



6. Compare CPU times of List/tuple with Numpy arrays.

In [13]:
import time
import numpy as np

size = 10**6


def _timed(func):
    """Call ``func()`` once and return ``(result, elapsed_seconds)``.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock meant for
    measuring short intervals. ``time.time()`` is a wall clock that can be
    coarse or adjusted mid-run, which can yield a zero (or negative) duration
    and break the speed-up ratio computed below.
    """
    start = time.perf_counter()
    result = func()
    return result, time.perf_counter() - start


# Inputs are built outside the timed region so only the element-wise addition
# is measured for each container type.
list1 = list(range(size))
list2 = list(range(size))
result_list, list_duration = _timed(lambda: [x + y for x, y in zip(list1, list2)])

tuple1 = tuple(range(size))
tuple2 = tuple(range(size))
result_tuple, tuple_duration = _timed(lambda: tuple(x + y for x, y in zip(tuple1, tuple2)))

array1 = np.arange(size)
array2 = np.arange(size)
result_array, numpy_duration = _timed(lambda: array1 + array2)

# The comparison is only meaningful if all three approaches computed the same values.
assert result_list == list(result_tuple) == result_array.tolist(), "results differ between containers"

print(f"List time:  {list_duration:.5f} seconds")
print(f"Tuple time: {tuple_duration:.5f} seconds")
print(f"NumPy time: {numpy_duration:.5f} seconds")
print(f"NumPy is {list_duration / numpy_duration:.1f}x faster than List.")

List time:  0.06576 seconds
Tuple time: 0.08258 seconds
NumPy time: 0.00303 seconds
NumPy is 21.7x faster than List.
