## Executed commands
1. cd $SCRATCH
2. module load cuda/12.1.1
3. conda create --prefix $SCRATCH/a1 python=3.11 -y
4. conda activate a1/
5. conda install numpy
6. conda install -c conda-forge cupy
7. conda install -c conda-forge jupyter
8. python -m ipykernel install --user --name=a1 --display-name "cuda-a1"


In [1]:
import tensorflow as tf
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Flatten

# Load and preprocess the data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(physical_devices)

# Define the model
model = Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

ModuleNotFoundError: No module named 'tensorflow'

In [7]:
import numpy as np
import cupy as cp
import time

# Matrix size
N = 10000

def run_experiment(N):
    # CPU-based matrix multiplication using NumPy
    start_time = time.time()
    A_cpu = np.random.rand(N, N)
    B_cpu = np.random.rand(N, N)
    C_cpu = np.dot(A_cpu, B_cpu)
    print(C_cpu[1][1])
    cpu_time = time.time() - start_time
    print(f"CPU time: {cpu_time:.2f} seconds")

    # GPU-based matrix multiplication using CuPy
    start_time = time.time()
    A_gpu = cp.random.rand(N, N, dtype=cp.float32)
    B_gpu = cp.random.rand(N, N, dtype=cp.float32)
    C_gpu = cp.dot(A_gpu, B_gpu)
    print(C_gpu[1][1])
    gpu_time = time.time() - start_time
    print(f"GPU time: {gpu_time:.2f} seconds")

run_experiment(10000)

2489.7211369835995
CPU time: 30.31 seconds
2502.0647
GPU time: 0.16 seconds


2475.867684654416
CPU time: 30.47 seconds
2500.9556
GPU time: 2.66 seconds

2506.9204726489556
CPU time: 29.92 seconds
2488.008
GPU time: 0.16 seconds

2477.12437561877
CPU time: 30.30 seconds
2495.3054
GPU time: 0.15 seconds

2522.5097704869895
CPU time: 30.17 seconds
2491.4421
GPU time: 0.16 seconds

2493.952022696899
CPU time: 30.20 seconds
2492.9202
GPU time: 0.16 seconds

In [8]:
N_sizes = [1000, 3000, 5000, 10000, 20000]

for matrix_size in N_sizes:
    print("---------------")
    print(f"matrix_size: {matrix_size}")
    for i in range(3):
        run_experiment(matrix_size)
        print()

---------------
matrix_size: 1000
254.5997635655197
CPU time: 0.05 seconds
238.40604
GPU time: 0.00 seconds

249.3503072999922
CPU time: 0.05 seconds
240.22192
GPU time: 0.00 seconds

248.10468720154867
CPU time: 0.05 seconds
231.59483
GPU time: 0.00 seconds

---------------
matrix_size: 3000
756.2997635100163
CPU time: 0.95 seconds
734.2489
GPU time: 0.01 seconds

759.9756431413139
CPU time: 0.94 seconds
748.5583
GPU time: 0.00 seconds

747.5341773560866
CPU time: 0.94 seconds
748.78424
GPU time: 0.01 seconds

---------------
matrix_size: 5000
1259.1085296551275
CPU time: 4.10 seconds
1214.0638
GPU time: 0.02 seconds

1241.2225234763455
CPU time: 4.09 seconds
1256.4404
GPU time: 0.02 seconds

1259.3081245589249
CPU time: 4.10 seconds
1273.2544
GPU time: 0.02 seconds

---------------
matrix_size: 10000
2519.5522328586467
CPU time: 30.29 seconds
2478.8318
GPU time: 0.16 seconds

2526.6076206243533
CPU time: 30.30 seconds
2507.1755
GPU time: 0.16 seconds

2445.6647272436644
CPU time: 30.

: 

: 

: 

### For Small Matrices (N=1000)

The CPU might be nearly as fast as the GPU because the data transfer overhead to the GPU can dominate.

GPU acceleration might not be significant at this scale.

### For Medium Matrices (N=3000,5000)

The GPU should start showing significant speedup as matrix size increases.

CPU time will increase significantly due to higher computational complexity O(N^3)

### For Large Matrices (N=10000,20000)

The CPU execution time will become very large. Avg time 30.295 when previously it was lower than 5

The GPU will show a dramatic speedup due to parallel processing. GPU time eqeals 0.16 which is very fast in this situation!

GPU memory limitations might occur at very high values of N, requiring batch processing or optimized memory management. The kernel crashed when N was set to 20000

Chat GPT conversation link: https://chatgpt.com/share/67d721c2-28e0-8001-8b87-b5f647c7be5c