# 1 Basic Vectorization

Vectorization refers to processing multiple compute-intensive steps in parallel. This is done by formulating by computation steps in terms of vectors and using efficient math like dot-product to speed up calculation

In [2]:
from time import perf_counter
import numpy as np # version = 1.24.3
import torch # version = 2.0.0 + cu118

### Comparing speedup for loop-based vs matmul based dot-product for 1D array
- Two random 1D torch FloatTensors a, b are created each of size 10, 100, 1000, 10000, 100000
- Time required to obtain dot product using loop is calculated as mean of 1000 runs to reduce variance
- The same procedure is done for torch.matmul operation -> a @ b.T
- Speedup is represented as multiples: Ratio of ```time_taken_for_loop / time_taken_for_matmul```

In [5]:
SIZES_OF_ARRAY = [10, 100, 1000, 10000, 100000]

for size_of_array in SIZES_OF_ARRAY:
    a = torch.rand(size_of_array)
    b = torch.rand(size_of_array)

    avg_loop_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        dot_loop_value = 0.0
        t_loop_start = perf_counter()
        for index in range(a.shape[0]):
            dot_loop_value += a[index] * b[index]
        t_loop_end = perf_counter()
        
        t_loop_total = t_loop_end - t_loop_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_loop_time.append(t_loop_total)
    avg_loop_time = torch.mean(torch.FloatTensor(avg_loop_time))
    print(f"Size of a, b: {size_of_array}| Dot Product| Value: {dot_loop_value:.2E}, Loop time: {avg_loop_time:.2E}s")

    avg_matmul_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        t_matmul_start = perf_counter()
        dot_np_value = a @ b.T
        t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_matmul_time.append(t_matmul_total)
    avg_matmul_time = torch.mean(torch.FloatTensor(avg_matmul_time))
    print(f"Size of a, b: {size_of_array}| Dot Product| Value: {dot_np_value:.2E}, Matmul time: {avg_matmul_time:.2E}s")

    speedUp = (avg_loop_time / avg_matmul_time)
    print(f"SpeedUp of matmul compared to loop for size {size_of_array}: {round(speedUp, 2)} x\n")

Size of a, b: 10| Dot Product| Value: 3.12E+00, Loop time: 9.32E-05s
Size of a, b: 10| Dot Product| Value: 3.12E+00, Matmul time: 4.37E-06s
SpeedUp of matmul compared to loop for size 10: 19.1 x

Size of a, b: 100| Dot Product| Value: 2.40E+01, Loop time: 7.23E-04s
Size of a, b: 100| Dot Product| Value: 2.40E+01, Matmul time: 3.39E-06s
SpeedUp of matmul compared to loop for size 100: 214.33 x

Size of a, b: 1000| Dot Product| Value: 2.44E+02, Loop time: 7.20E-03s
Size of a, b: 1000| Dot Product| Value: 2.44E+02, Matmul time: 3.61E-06s
SpeedUp of matmul compared to loop for size 1000: 2019.48 x

Size of a, b: 10000| Dot Product| Value: 2.51E+03, Loop time: 7.18E-02s
Size of a, b: 10000| Dot Product| Value: 2.51E+03, Matmul time: 4.54E-06s
SpeedUp of matmul compared to loop for size 10000: 16220.42 x

Size of a, b: 100000| Dot Product| Value: 2.49E+04, Loop time: 7.12E-01s
Size of a, b: 100000| Dot Product| Value: 2.49E+04, Matmul time: 7.79E-06s
SpeedUp of matmul compared to loop for si

### Comparing speedup for loop-based vs torch.matmul for matrix multiplication of N-D array
- Two random N-D Torch FloatTensors a, b are created each of size 10, 100, 1000, 10000, 100000
- Time required to obtain dot product using loop is calculated as mean of 20 runs to reduce variance
- The same procedure is done for torch.matmul operation -> a @ b
- Speedup is represented as multiples: Ratio of ```time_taken_for_loop / time_taken_for_matmul```

In [11]:
SIZES_OF_ARRAY = [10, 100] # , 1000, 10000]
avg_times_matmul = []
for size_of_array in SIZES_OF_ARRAY:
    a = torch.rand((size_of_array, size_of_array))
    b = torch.rand((size_of_array, size_of_array))

    avg_loop_time = []
    for _ in range(5): # Doing same computation for 20 times to reduce runtime variance
        result = []
        t_loop_start = perf_counter()
        for i in range(a.shape[0]):
            row = []
            for j in range(b.shape[1]):
                product = 0
                for k in range(a.shape[1]):
                    product += a[i][k] * b[k][j]
                row.append(product)
            result.append(row)
        
        t_loop_end = perf_counter()
        
        t_loop_total = t_loop_end - t_loop_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_loop_time.append(t_loop_total)
    avg_loop_time = np.mean(avg_loop_time)
    print(f"Size of a, b: {size_of_array}| Loop time: {avg_loop_time:.2E}s")

    avg_matmul_time = []
    for _ in range(5): # Doing same computation for 20 times to reduce runtime variance
        t_matmul_start = perf_counter()
        dot_np_value = a @ b
        t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_matmul_time.append(t_matmul_total)
    avg_matmul_time = np.mean(avg_matmul_time)
    avg_times_matmul.append(avg_matmul_time)
    print(f"Size of a, b: {size_of_array}| torch.matmul time: {avg_matmul_time:.2E}s")

    speedUp = (avg_loop_time / avg_matmul_time)
    print(f"SpeedUp of matmul for N-D tensor compared to loop for size {size_of_array}: {round(speedUp, 2)} x\n")

Size of a, b: 10| Loop time: 2.65E-02s
Size of a, b: 10| torch.matmul time: 6.70E-06s
SpeedUp of matmul for N-D tensor compared to loop for size 10: 3955.0 x

Size of a, b: 100| Loop time: 2.77E+01s
Size of a, b: 100| torch.matmul time: 7.45E-05s
SpeedUp of matmul for N-D tensor compared to loop for size 100: 372223.98 x



### torch.matmul for matrix multiplication of N-D array in GPUs/CPUs w.r.t Mixed Precision (Autocast)
- Two random N-D Torch FloatTensors a, b are created each of size 10, 100, 1000, 10000, 100000
- Time required to obtain dot product using loop is calculated as mean of 20 runs to reduce variance
- The same procedure is done for torch.matmul operation -> a @ b
- Speedup is represented as multiples: Ratio of ```time_taken_for_loop / time_taken_for_matmul```

In [3]:
SIZES_OF_ARRAY = [10, 100, 1000, 10000]
gpu_device = torch.device("cuda")

for size_of_array in SIZES_OF_ARRAY:
    a = torch.rand((size_of_array, size_of_array)).to(gpu_device)
    b = torch.rand((size_of_array, size_of_array)).to(gpu_device)
    torch.cuda.synchronize()     # Synchronizing to finish Async GPU operations

    avg_gpu_matmul_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        t_matmul_start = perf_counter()
        dot_value = a @ b # dot_np_value is float32
        torch.cuda.synchronize() # Synchronizing to finish Async GPU operations
        t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_gpu_matmul_time.append(t_matmul_total)
    avg_gpu_matmul_time = np.mean(avg_gpu_matmul_time)
    print(f"Size of a, b: {size_of_array}| torch.matmul (GPU) time: {avg_gpu_matmul_time:.2E}s")

    a = torch.rand((size_of_array, size_of_array)).to(device=gpu_device, dtype=torch.float16)
    b = torch.rand((size_of_array, size_of_array)).to(device=gpu_device, dtype=torch.float16)
    torch.cuda.synchronize()     # Synchronizing to finish Async GPU operations

    avg_gpu_autocast_matmul_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        with torch.autocast("cuda"):
            t_matmul_start = perf_counter()
            dot_amp_value = a @ b # Due to autocast, dot_np_value type is float16, input dtypes are float32
            torch.cuda.synchronize() # Synchronizing to finish Async GPU operations
            t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_gpu_autocast_matmul_time.append(t_matmul_total)
    avg_gpu_autocast_matmul_time = np.mean(avg_gpu_autocast_matmul_time)
    print(f"Size of a, b: {size_of_array}| torch.matmul (GPU + autocast) time: {avg_gpu_autocast_matmul_time:.2E}s")

    speedUp = (avg_gpu_matmul_time / avg_gpu_autocast_matmul_time)
    print(f"SpeedUp of matmul for AMP in GPU vs GPU for size {size_of_array}: {round(speedUp, 2)} x\n")

Size of a, b: 10| torch.matmul (GPU) time: 3.60E-05s
Size of a, b: 10| torch.matmul (GPU + autocast) time: 9.73E-05s
SpeedUp of matmul for AMP in GPU vs GPU for size 10: 0.37 x

Size of a, b: 100| torch.matmul (GPU) time: 7.17E-05s
Size of a, b: 100| torch.matmul (GPU + autocast) time: 5.05E-05s
SpeedUp of matmul for AMP in GPU vs GPU for size 100: 1.42 x

Size of a, b: 1000| torch.matmul (GPU) time: 4.10E-04s
Size of a, b: 1000| torch.matmul (GPU + autocast) time: 2.70E-04s
SpeedUp of matmul for AMP in GPU vs GPU for size 1000: 1.52 x

Size of a, b: 10000| torch.matmul (GPU) time: 3.24E-01s
Size of a, b: 10000| torch.matmul (GPU + autocast) time: 9.39E-02s
SpeedUp of matmul for AMP in GPU vs GPU for size 10000: 3.44 x



In [3]:
SIZES_OF_ARRAY = [10, 100, 1000, 10000]
cpu_device = torch.device("cpu")

for size_of_array in SIZES_OF_ARRAY:
    a = torch.rand((size_of_array, size_of_array)).to(cpu_device)
    b = torch.rand((size_of_array, size_of_array)).to(cpu_device)
    torch.cuda.synchronize()     # Synchronizing to finish Async GPU operations

    avg_cpu_matmul_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        t_matmul_start = perf_counter()
        dot_value = a @ b # dot_np_value is float32
        t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_cpu_matmul_time.append(t_matmul_total)
    avg_cpu_matmul_time = np.mean(avg_cpu_matmul_time)
    print(f"Size of a, b: {size_of_array}| torch.matmul (CPU) time: {avg_cpu_matmul_time:.2E}s")

    a = torch.rand((size_of_array, size_of_array)).to(device=cpu_device, dtype=torch.bfloat16)
    b = torch.rand((size_of_array, size_of_array)).to(device=cpu_device, dtype=torch.bfloat16)
    torch.cuda.synchronize()     # Synchronizing to finish Async GPU operations

    avg_cpu_autocast_matmul_time = []
    for _ in range(20): # Doing same computation for 20 times to reduce runtime variance
        with torch.autocast("cpu"):
            t_matmul_start = perf_counter()
            dot_amp_value = a @ b # Due to autocast, dot_np_value type is float16, input dtypes are float32
            t_matmul_end = perf_counter()
        
        t_matmul_total = t_matmul_end - t_matmul_start
        if _ > 3: # Ignoring first 3 loop iterations, to further reduce the noise
            avg_cpu_autocast_matmul_time.append(t_matmul_total)
    avg_cpu_autocast_matmul_time = np.mean(avg_cpu_autocast_matmul_time)
    print(f"Size of a, b: {size_of_array}| torch.matmul (CPU + autocast) time: {avg_cpu_autocast_matmul_time:.2E}s")

    speedUp = (avg_cpu_autocast_matmul_time/ avg_cpu_matmul_time)
    print(f"SpeedUp of matmul for CPU vs AMP in CPU for size {size_of_array}: {round(speedUp, 2)} x\n")

Size of a, b: 10| torch.matmul (CPU) time: 2.83E-06s
Size of a, b: 10| torch.matmul (CPU + autocast) time: 1.27E-05s
SpeedUp of matmul for CPU vs AMP in CPU for size 10: 4.49 x

Size of a, b: 100| torch.matmul (CPU) time: 3.84E-05s
Size of a, b: 100| torch.matmul (CPU + autocast) time: 1.07E-02s
SpeedUp of matmul for CPU vs AMP in CPU for size 100: 277.79 x

Size of a, b: 1000| torch.matmul (CPU) time: 1.87E-02s
Size of a, b: 1000| torch.matmul (CPU + autocast) time: 1.03E+01s
SpeedUp of matmul for CPU vs AMP in CPU for size 1000: 548.9 x

Size of a, b: 10000| torch.matmul (CPU) time: 1.13E+01s
