# Parallel Computing Project

### Run the following cell to define the auxiliary functions

In [22]:
import subprocess
import statistics
import re

def execute_class(impl, class_, num_iters, num_devices=None):
    """Run one FT benchmark binary several times and summarise its runtime.

    The binary ``./bin/ft_{impl}.{class_}`` is executed ``num_iters`` times.
    After every run the "Verification" and "Time in seconds" lines are parsed
    from its output, the running statistics are recomputed with ``calc_stats``
    and one progress line is printed.

    Args:
        impl: Implementation name used in the binary file name.
        class_: Problem class used in the binary file name.
        num_iters: Number of times the binary is executed (at least 1).
        num_devices: Optional device count passed to the binary as its only
            command-line argument. When ``None`` no argument is passed.

    Returns:
        Tuple ``(runtime_mean, runtime_err)`` as returned by ``calc_stats``
        for the runtimes of all executed runs.

    Raises:
        ValueError: If ``num_iters`` is smaller than 1.
        subprocess.CalledProcessError: If the binary exits with a non-zero
            status.
        AssertionError: If the expected lines are missing from the output or
            the benchmark does not report ``SUCCESSFUL`` verification.
    """
    if num_iters < 1:
        # Without a single run there is nothing to average and nothing to return.
        raise ValueError(f'num_iters must be at least 1, got {num_iters}')

    # Build the argument vector explicitly so that a missing device count is
    # really omitted instead of being passed to the binary as the text "None".
    cmd = [f'./bin/ft_{impl}.{class_}']
    if num_devices is not None:
        cmd.append(str(num_devices))

    runtimes = []

    if num_devices is None:
        print(f'Class: {class_}, Iterations: {num_iters}')
    else:
        print(f'Class: {class_}, Num Devices: {num_devices}, Iterations: {num_iters}')

    for i in range(num_iters):
        res = subprocess.check_output(cmd).decode('utf-8')

        # The benchmark must report a successful self-verification.
        match = re.search(r'Verification\s+=\s+(\w+)', res)
        assert match, 'No match found!'
        assert match.group(1) == 'SUCCESSFUL', f'Verification = {match.group(1)}'

        # Extract the runtime reported by the benchmark itself.
        match = re.search(r' Time in seconds\s+=\s+(\d+\.\d+)', res)
        assert match, 'No match found!'
        runtime = float(match.group(1))
        runtimes.append(runtime)

        # Statistics are recomputed over all runs so far for the progress line.
        runtime_mean, runtime_err = calc_stats(runtimes)
        print(f'  [{(i+1):03d}] {runtime:>8.6f} [sec] | Average Runtime: {runtime_mean:>8.6f} ± {runtime_err:.6f} [sec]')

    print()

    return runtime_mean, runtime_err

def calc_stats(runtimes):
    """Return the mean and sample standard deviation of ``runtimes``.

    With more than two samples, values whose distance from the mean is at
    least one standard deviation are treated as outliers and the statistics
    are recomputed without them. The filtered statistics are only used when
    at least two samples survive the filter, because a standard deviation
    cannot be computed from fewer; otherwise the unfiltered statistics are
    kept.

    Args:
        runtimes: Non-empty sequence of runtimes.

    Returns:
        Tuple ``(runtime_mean, runtime_err)``; ``runtime_err`` is ``0.0`` for
        a single sample.

    Raises:
        ValueError: If ``runtimes`` is empty.
    """
    if len(runtimes) == 0:
        raise ValueError("runtimes array is empty")
    if len(runtimes) == 1:
        # A single sample has no spread.
        return runtimes[0], 0.0

    runtime_mean = statistics.mean(runtimes)
    runtime_stdev = statistics.stdev(runtimes)

    # Filter out outliers
    if len(runtimes) > 2:
        runtimes_filtered = [runtime for runtime in runtimes if abs(runtime - runtime_mean) < runtime_stdev]
        # The strict "<" comparison can reject every sample (all values equal,
        # stdev == 0) or all but one (e.g. three evenly spaced values). In
        # those cases statistics.mean/stdev would raise, so keep the
        # unfiltered statistics instead.
        if 2 <= len(runtimes_filtered) < len(runtimes):
            runtime_mean = statistics.mean(runtimes_filtered)
            runtime_stdev = statistics.stdev(runtimes_filtered)

    return runtime_mean, runtime_stdev

### Build the GPU implementations for all problem sizes

In [2]:
# Implementation names and problem classes; every combination is built below
# into a binary that the other cells run as ./bin/ft_<impl>.<class>.
IMPLS = ['single_gpu', 'single_gpu_less_mem', 'single_gpu_alloc', 'multiple_gpu']
CLASSES = ['S', 'W', 'A', 'B', 'C', 'D']

# Start from a clean tree (this also removes the bin directory).
!make cleanall

# Build one binary per (implementation, class) pair; IPython substitutes the
# Python values of {class_} and {impl} into the shell command.
for impl in IMPLS:
  for class_ in CLASSES:
    !make FT CLASS={class_} IMPL={impl}

rm -f core
rm -f */*.yaml */*.optrpt
rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
rm -f sys/setparams sys/makesuite sys/setparams.h
rm -rf bin
   =      NAS PARALLEL BENCHMARKS 4.1        =
   =      OpenMP Versions                    =
   =      C++                                =

cd FT; make CLASS=S
make[1]: Entering directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_GPU/FT'
mkdir ../bin
make[2]: Entering directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_GPU/sys'
g++ -g -fopenmp -o setparams setparams.cpp
make[2]: Leaving directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_GPU/sys'
../sys/setparams ft S
icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk/include -xCORE-AVX512 -Ofast -fiopenmp -ipo -qopt-mem-layout-trans=4 -mprefer-vector-width=512 -fopenmp-targets=spir64  -qopt-report=max  -g -gline-tables-only -fdebug-info-for-profiling -parallel-source-info=2  si

### Run Once on a Single GPU

Choose an implementation and a class to run it once on a single GPU

NOTE: Class D can't run on a single GPU due to memory constraints.

In [14]:
IMPL = 'single_gpu' # one of: 'single_gpu', 'single_gpu_less_mem', 'single_gpu_alloc'
CLASS = 'A' # one of: 'S', 'W', 'A', 'B', 'C'

# Run the selected binary once and show its raw output; IPython substitutes
# {IMPL} and {CLASS} into the shell command.
!./bin/ft_{IMPL}.{CLASS}

 Size                :  256x 256x 128
 Iterations                  :      6



 T =    1     Checksum =    5.046735008193e+02    5.114047905510e+02
 T =    2     Checksum =    5.059412319734e+02    5.098809666433e+02
 T =    3     Checksum =    5.069376896287e+02    5.098144042213e+02
 T =    4     Checksum =    5.077892868474e+02    5.101336130759e+02
 T =    5     Checksum =    5.085233095391e+02    5.104914655194e+02
 T =    6     Checksum =    5.091487099959e+02    5.107917842803e+02
 Result verification successful
 class_npb = A

 FT Benchmark Completed
 class_npb       =                        A
 Size            =            256x 256x 128
 Total threads   =                   (null)
 Iterations      =                        6
 Time in seconds =                 0.600139
 Mop/s total     =                 11891.32
 Operation type  =           floating point
 Verification    =               SUCCESSFUL



### Run Multiple Times on a Single GPU

Choose an implementation and a class to run it multiple times on a single GPU


In [16]:
IMPL = 'single_gpu' # one of: 'single_gpu', 'single_gpu_less_mem', 'single_gpu_alloc'
CLASS = 'W' # one of: 'S', 'W', 'A', 'B', 'C' (class D can't run on a single GPU due to memory constraints)
REPETITIONS = 5 # number of times the binary is executed

# No device count is passed, so the binary is started without arguments.
runtime_mean, runtime_err = execute_class(IMPL, CLASS, REPETITIONS)

print(f'Implementation: {IMPL}, Class: {CLASS}, Runtime: {runtime_mean:>10.6f} ± {runtime_err:>10.6f} [sec]')

Class: W, Iterations: 5


  [001] 0.476975 [sec] | Average Runtime: 0.476975 ± 0.000000 [sec]
  [002] 0.436641 [sec] | Average Runtime: 0.456808 ± 0.028520 [sec]
  [003] 0.425573 [sec] | Average Runtime: 0.431107 ± 0.007826 [sec]
  [004] 0.432707 [sec] | Average Runtime: 0.431640 ± 0.005611 [sec]
  [005] 0.435592 [sec] | Average Runtime: 0.432628 ± 0.004989 [sec]

Implementation: single_gpu, Class: W, Runtime:   0.432628 ±   0.004989 [sec]


### Run Once on Multiple GPUs

Choose the number of devices and a class to run the multi-GPU implementation once

In [18]:
NUM_DEVICES = 4 # one of: 1, 2, 4
CLASS = 'A' # one of: 'S', 'W', 'A', 'B', 'C', 'D'

# Run the multi-GPU binary once; the device count is its only command-line
# argument. IPython substitutes {CLASS} and {NUM_DEVICES} into the command.
!./bin/ft_multiple_gpu.{CLASS} {NUM_DEVICES}

num_devices: 4 
 Size                :  256x 256x 128
 Iterations                  :      6



 T =    1     Checksum =    5.046735008193e+02    5.114047905510e+02
 T =    2     Checksum =    5.059412319734e+02    5.098809666433e+02
 T =    3     Checksum =    5.069376896287e+02    5.098144042213e+02
 T =    4     Checksum =    5.077892868474e+02    5.101336130759e+02
 T =    5     Checksum =    5.085233095391e+02    5.104914655194e+02
 T =    6     Checksum =    5.091487099959e+02    5.107917842803e+02
 Result verification successful
 class_npb = A

 FT Benchmark Completed
 class_npb       =                        A
 Size            =            256x 256x 128
 Total threads   =                   (null)
 Iterations      =                        6
 Time in seconds =                 1.209803
 Mop/s total     =                  5898.85
 Operation type  =           floating point
 Verification    =               SUCCESSFUL



### Run Multiple Times on Multiple GPUs

Choose the number of devices and a class to run the multi-GPU implementation multiple times


In [21]:
NUM_DEVICES = 4 # one of: 1, 2, 4
CLASS = 'W' # one of: 'S', 'W', 'A', 'B', 'C', 'D'
REPETITIONS = 5 # number of times the binary is executed

# Always uses the 'multiple_gpu' implementation; NUM_DEVICES is forwarded to
# the binary as its command-line argument.
runtime_mean, runtime_err = execute_class('multiple_gpu', CLASS, REPETITIONS, NUM_DEVICES)

print(f'Num Devices: {NUM_DEVICES}, Class: {CLASS}, Runtime: {runtime_mean:>10.6f} ± {runtime_err:>10.6f} [sec]')

Class: W, num_devices: 4, Iterations: 5


  [001] 1.082067 [sec] | Average Runtime: 1.082067 ± 0.000000 [sec]
  [002] 1.048823 [sec] | Average Runtime: 1.065445 ± 0.023507 [sec]
  [003] 1.174795 [sec] | Average Runtime: 1.065445 ± 0.023507 [sec]
  [004] 1.088477 [sec] | Average Runtime: 1.073122 ± 0.021287 [sec]
  [005] 1.265921 [sec] | Average Runtime: 1.098540 ± 0.053725 [sec]

Num Devices: 4, Class: W, Runtime:   1.098540 ±   0.053725 [sec]


### Run with Different Number of Tiles

This cell is used to create the GPU figures in the report

In [23]:
CLASS = 'A'  # one of: 'S', 'W', 'A', 'B', 'C', 'D'
REPETITIONS = 5
NUM_DEVICES = [1, 2, 4]

# Per-device-count results, in the same order as NUM_DEVICES.
runtime_means, runtime_errs = [], []
# Speedup lists are created here but not filled by this cell.
speedup_means, speedup_errs = [], []

for num_gpu_tiles in NUM_DEVICES:
    # A single device uses the 'single_gpu' implementation, except for class D
    # which must use 'multiple_gpu' because of memory constraints.
    if num_gpu_tiles == 1 and CLASS != 'D':
        impl = 'single_gpu'
    else:
        impl = 'multiple_gpu'

    runtime_mean, runtime_err = execute_class(impl, CLASS, REPETITIONS, num_gpu_tiles)
    runtime_means.append(runtime_mean)
    runtime_errs.append(runtime_err)

# Human-readable table, one line per device count.
print("\nSummary:")
for device_count, mean_sec, err_sec in zip(NUM_DEVICES, runtime_means, runtime_errs):
    print(f'Num Devices: {device_count:>3}, Runtime: {mean_sec:>8.6f} ± {err_sec:>8.6f} [sec]')

# Compact "(devices,runtime)" pairs on a single line.
print("\nRuntimes:")
print(''.join(f"({device_count},{mean_sec:.6f})" for device_count, mean_sec in zip(NUM_DEVICES, runtime_means)))

Class: A, Num Devices: 1, Iterations: 5


  [001] 0.517738 [sec] | Average Runtime: 0.517738 ± 0.000000 [sec]
  [002] 0.511563 [sec] | Average Runtime: 0.514651 ± 0.004366 [sec]
  [003] 0.514684 [sec] | Average Runtime: 0.516211 ± 0.002160 [sec]
  [004] 0.516267 [sec] | Average Runtime: 0.515475 ± 0.001119 [sec]
  [005] 0.527716 [sec] | Average Runtime: 0.515063 ± 0.002646 [sec]

Class: A, Num Devices: 2, Iterations: 5
  [001] 0.818746 [sec] | Average Runtime: 0.818746 ± 0.000000 [sec]
  [002] 0.771072 [sec] | Average Runtime: 0.794909 ± 0.033711 [sec]
  [003] 0.750530 [sec] | Average Runtime: 0.760801 ± 0.014525 [sec]
  [004] 0.794457 [sec] | Average Runtime: 0.782764 ± 0.016536 [sec]
  [005] 0.771735 [sec] | Average Runtime: 0.779088 ± 0.013314 [sec]

Class: A, Num Devices: 4, Iterations: 5
  [001] 1.322379 [sec] | Average Runtime: 1.322379 ± 0.000000 [sec]
  [002] 1.231507 [sec] | Average Runtime: 1.276943 ± 0.064256 [sec]
  [003] 1.168740 [sec] | Average Runtime: 1.200123 ± 0.044383 [sec]
  [004] 1.276335 [sec] | Average R