# Parallel Computing Project

### Run the following cell to define the auxiliary functions

In [1]:
import subprocess
import statistics
import re
from math import sqrt

def execute_class(class_, num_iters):
    """Run the FT benchmark binary of one problem class repeatedly and time it.

    Executes ``./bin/ft.<class_>`` ``num_iters`` times. After every run the
    program output is checked for a successful verification, the reported
    runtime is extracted, and a progress line with the running statistics
    (as computed by ``calc_stats``) is printed.

    Args:
        class_: Suffix selecting the benchmark binary (e.g. ``'S'`` or ``'A'``).
        num_iters: Number of benchmark runs; must be at least 1.

    Returns:
        Tuple ``(runtime_mean, runtime_err)`` returned by ``calc_stats`` for
        all collected runtimes.

    Raises:
        ValueError: If ``num_iters`` is smaller than 1.
        AssertionError: If the output lacks the verification or timing line,
            or if the verification status is not ``SUCCESSFUL``.
        subprocess.CalledProcessError: If the benchmark exits with a
            non-zero status.
    """
    # Without at least one run there is nothing to average; fail early with a
    # clear message instead of hitting an unbound local at the return below.
    if num_iters < 1:
        raise ValueError(f'num_iters must be at least 1, got {num_iters}')

    cmd = f'./bin/ft.{class_}'

    runtimes = []

    print(f'Class: {class_}, Iterations: {num_iters}')

    for i in range(num_iters):
        res = subprocess.check_output(cmd.split(' ')).decode('utf-8')

        # The benchmark must report that its result verification succeeded.
        match = re.search(r'Verification\s+=\s+(\w+)', res)
        assert match, 'No "Verification" line found in benchmark output!'
        assert match.group(1) == 'SUCCESSFUL', f'Verification = {match.group(1)}'

        # Extract the runtime the benchmark measured for itself.
        match = re.search(r' Time in seconds\s+=\s+(\d+\.\d+)', res)
        assert match, 'No "Time in seconds" line found in benchmark output!'
        runtime = float(match.group(1))
        runtimes.append(runtime)

        # Recompute the statistics over all runs so far for the progress line.
        runtime_mean, runtime_err = calc_stats(runtimes)
        print(f'  [{(i+1):03d}] {runtime:>10.6f} [sec] | Average Runtime: {runtime_mean:>10.6f} ± {runtime_err:.6f} [sec]')

    print()

    return runtime_mean, runtime_err


def calc_speedup(runtime_mean, runtime_err, ref_mean, ref_err):
    """Compute the speedup relative to a reference runtime, with its error.

    The speedup is ``ref_mean / runtime_mean``. Its uncertainty is obtained
    by adding the relative errors of both runtimes in quadrature and scaling
    the result by the magnitude of the speedup.

    Args:
        runtime_mean: Mean runtime of the measured configuration.
        runtime_err: Error of ``runtime_mean``.
        ref_mean: Mean runtime of the reference configuration.
        ref_err: Error of ``ref_mean``.

    Returns:
        Tuple ``(speedup_mean, speedup_err)``.
    """
    ratio = ref_mean / runtime_mean

    # Relative uncertainties of the two runtimes.
    rel_runtime = runtime_err / runtime_mean
    rel_ref = ref_err / ref_mean

    ratio_err = sqrt(rel_runtime**2 + rel_ref**2) * abs(ratio)

    return ratio, ratio_err


def calc_stats(runtimes):
    """Return the mean and sample standard deviation of a list of runtimes.

    For more than two samples, values whose distance from the mean is not
    strictly smaller than one sample standard deviation are treated as
    outliers and the statistics are recomputed without them.

    The filtered statistics are only used when at least two samples survive
    the filter. Otherwise (e.g. three equally spaced or three identical
    values) the unfiltered statistics are returned, because a standard
    deviation cannot be computed from fewer than two values.

    Args:
        runtimes: Non-empty sequence of runtimes.

    Returns:
        Tuple ``(runtime_mean, runtime_err)``. ``runtime_err`` is the sample
        standard deviation, or ``0.0`` for a single sample.

    Raises:
        ValueError: If ``runtimes`` is empty.
    """
    if len(runtimes) == 0:
        raise ValueError("runtimes array is empty")

    if len(runtimes) == 1:
        # A single sample has no spread.
        return runtimes[0], 0.0

    runtime_mean = statistics.mean(runtimes)
    runtime_stdev = statistics.stdev(runtimes)

    # Filter out outliers
    if len(runtimes) > 2:
        runtimes_filtered = [runtime for runtime in runtimes if abs(runtime - runtime_mean) < runtime_stdev]
        # Only adopt the filtered set if something was removed AND enough
        # samples remain for statistics.stdev, which needs two data points.
        if 2 <= len(runtimes_filtered) < len(runtimes):
            runtime_mean = statistics.mean(runtimes_filtered)
            runtime_stdev = statistics.stdev(runtimes_filtered)

    return runtime_mean, runtime_stdev

### Build CPU implementation for all problem sizes

In [2]:
# Remove previous build artefacts, then build the FT benchmark once per
# problem class. Each line is a shell escape executed by IPython.
# NOTE(review): the binaries are presumably written to ./bin/ft.<CLASS>,
# which is the path the later cells execute - confirm against the Makefile.
!make cleanall
!make FT CLASS=S
!make FT CLASS=W
!make FT CLASS=A
!make FT CLASS=B
!make FT CLASS=C
!make FT CLASS=D

rm -f core
rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
rm -f sys/setparams sys/makesuite sys/setparams.h
rm -r bin/*
   =      NAS PARALLEL BENCHMARKS 4.1        =
   =      OpenMP Versions                    =
   =      C++                                =

cd FT; make CLASS=S
make[1]: Entering directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_CPU/FT'
make[2]: Entering directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_CPU/sys'
g++ -g -fopenmp -o setparams setparams.cpp


make[2]: Leaving directory '/home/u10d4e4582d9934dcdf4884edcbb3454/multi-core-processing-project/ours_CPU/sys'
../sys/setparams ft S
icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk/include -xCORE-AVX512 -Ofast -fiopenmp -ipo -qopt-mem-layout-trans=4 -mprefer-vector-width=512 -fopenmp-targets=spir64  -qopt-report=max  -g -gline-tables-only -fdebug-info-for-profiling  ft.c
cd ../common; icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk/include -xCORE-AVX512 -Ofast -fiopenmp -ipo -qopt-mem-layout-trans=4 -mprefer-vector-width=512 -fopenmp-targets=spir64  -qopt-report=max  -g -gline-tables-only -fdebug-info-for-profiling  c_randdp.c
cd ../common; icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk/include -xCORE-AVX512 -Ofast -fiopenmp -ipo -qopt-mem-layout-trans=4 -mprefer-vector-width=512 -fopenmp-targets=spir64  -qopt-report=max  -g -gline-tables-only -fdebug-info-for-profiling  c_print_results.c
cd ../common; icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk

### Run Once

In [19]:
# Problem class whose benchmark binary is executed below.
CLASS = 'A' # one of: 'S', 'W', 'A', 'B', 'C', 'D'

# Run the binary once via a shell escape; IPython substitutes {CLASS} with
# the Python variable and the raw program output is shown in the cell.
!./bin/ft.{CLASS}

 Size                :  256x 256x 128
 Iterations                  :      6



 T =    1     Checksum =    5.046735008193e+02    5.114047905510e+02
 T =    2     Checksum =    5.059412319734e+02    5.098809666433e+02
 T =    3     Checksum =    5.069376896287e+02    5.098144042213e+02
 T =    4     Checksum =    5.077892868474e+02    5.101336130759e+02
 T =    5     Checksum =    5.085233095391e+02    5.104914655194e+02
 T =    6     Checksum =    5.091487099959e+02    5.107917842803e+02
 Result verification successful
 class_npb = A


 FT Benchmark Completed
 class_npb       =                        A
 Size            =            256x 256x 128
 Total threads   =                      244
 Iterations      =                        6
 Time in seconds =                 0.071966
 Mop/s total     =                 99164.19
 Operation type  =           floating point
 Verification    =               SUCCESSFUL
 Version         =                      4.1
 Compile date    =              18 May 2024
 Compiler ver    =                   11.4.0
 OpenMP version  =           

### Run Multiple Times

In [12]:
# Configuration: problem class, OpenMP thread count and number of runs.
CLASS = 'W' # one of: 'S', 'W', 'A', 'B', 'C', 'D'
NUM_THREADS=244 # max 244
REPETITIONS = 5

# %env sets the variable in the kernel's environment, so the benchmark
# subprocess started by execute_class inherits the thread count.
%env OMP_NUM_THREADS={NUM_THREADS}
runtime_mean, runtime_err = execute_class(CLASS, REPETITIONS)

# Final statistics over all repetitions (as returned by execute_class).
print(f'Class: {CLASS}, Threads: {NUM_THREADS:>3}, Runtime: {runtime_mean:>10.6f} ± {runtime_err:>10.6f} [sec]')

env: OMP_NUM_THREADS=244
Class: W, Iterations: 5
  [001]   0.023795 [sec] | Average Runtime:   0.023795 ± 0.000000 [sec]
  [002]   0.013941 [sec] | Average Runtime:   0.018868 ± 0.006968 [sec]
  [003]   0.021969 [sec] | Average Runtime:   0.022882 ± 0.001291 [sec]
  [004]   0.010377 [sec] | Average Runtime:   0.019902 ± 0.005242 [sec]
  [005]   0.025989 [sec] | Average Runtime:   0.019902 ± 0.005242 [sec]

Class: W, Threads: 244, Runtime:   0.019902 ±   0.005242 [sec]


### Strong Scaling

Run a single class using different numbers of threads

In [21]:
# Strong scaling: run one problem class with an increasing number of OpenMP
# threads and report runtime and speedup for every thread count.
CLASS = 'A' # one of: 'S', 'W', 'A', 'B', 'C', 'D'
REPETITIONS = 5
NUM_THREADS=[1, 2, 4, 8, 16, 32, 56, 112, 224] # max 224

runtime_means = []
runtime_errs = []

# Measure every thread count. %env sets the variable in the kernel's
# environment, so the benchmark subprocess inherits it.
for num_threads in NUM_THREADS:
  %env OMP_NUM_THREADS={num_threads}
  runtime_mean, runtime_err = execute_class(CLASS, REPETITIONS)

  runtime_means.append(runtime_mean)
  runtime_errs.append(runtime_err)

speedup_means = []
speedup_errs = []

# Speedup of each configuration relative to the first entry of NUM_THREADS.
# The loop variables (mean, err) must be passed here; using runtime_mean and
# runtime_err would reuse the values left over from the last measurement
# above and yield the same speedup for every thread count.
for mean, err in zip(runtime_means, runtime_errs):
  speedup_mean, speedup_err = calc_speedup(mean, err, runtime_means[0], runtime_errs[0])
  speedup_means.append(speedup_mean)
  speedup_errs.append(speedup_err)

print("\nSummary:")
for i in range(len(NUM_THREADS)):
  print(f'Threads: {NUM_THREADS[i]:>3}, Runtime: {runtime_means[i]:>10.6f} ± {runtime_errs[i]:>10.6f} [sec], Speedup: {speedup_means[i]:>8.2f} ± {speedup_errs[i]:>8.2f}')

# Compact "(threads,value)" lists, printed on a single line each.
print("\nRuntimes:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{runtime_means[i]:.6f})",end='')
print()

print("\nSpeedups:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{speedup_means[i]:.2f})",end='')
print()


env: OMP_NUM_THREADS=1
Class: A, Iterations: 5


  [001]   1.722540 [sec] | Average Runtime:   1.722540 ± 0.000000 [sec]
  [002]   1.739356 [sec] | Average Runtime:   1.730948 ± 0.011891 [sec]
  [003]   1.760641 [sec] | Average Runtime:   1.730948 ± 0.011891 [sec]
  [004]   1.722552 [sec] | Average Runtime:   1.728149 ± 0.009705 [sec]
  [005]   1.764493 [sec] | Average Runtime:   1.736272 ± 0.018075 [sec]

env: OMP_NUM_THREADS=2
Class: A, Iterations: 5
  [001]   0.898213 [sec] | Average Runtime:   0.898213 ± 0.000000 [sec]
  [002]   0.883925 [sec] | Average Runtime:   0.891069 ± 0.010103 [sec]
  [003]   0.885484 [sec] | Average Runtime:   0.884705 ± 0.001102 [sec]
  [004]   0.884944 [sec] | Average Runtime:   0.884784 ± 0.000792 [sec]
  [005]   0.888815 [sec] | Average Runtime:   0.885792 ± 0.002116 [sec]

env: OMP_NUM_THREADS=4
Class: A, Iterations: 5
  [001]   0.432542 [sec] | Average Runtime:   0.432542 ± 0.000000 [sec]
  [002]   0.430286 [sec] | Average Runtime:   0.431414 ± 0.001595 [sec]
  [003]   0.431601 [sec] | Average Runti

### Weak Scaling

Uncomment one of the variable pairs CLASSES, NUM_THREADS and run the cell

Each pair describes the problem-size ratio between three problem classes, chosen such that the ratio is less than the maximum number of threads

In [8]:
# Weak scaling: each problem class in CLASSES is run with the thread count at
# the same position in NUM_THREADS. Exactly one CLASSES/NUM_THREADS pair
# should be active; the commented-out pairs below are alternatives.
# NOTE(review): the thread counts presumably mirror the problem-size ratio
# relative to the first class (fractional ratios truncated with int()) -
# confirm against the benchmark's class definitions.
CLASSES = ['S', 'W', 'A']
NUM_THREADS=[1, 2, 32]

# CLASSES = ['W', 'A', 'B']
# NUM_THREADS=[1, 16, int(213.33)]

# CLASSES = ['A', 'B', 'C']
# NUM_THREADS=[1, int(13.33), int(53.33)]

# CLASSES = ['B', 'C', 'D']
# NUM_THREADS=[1, 4, 80]

# Number of benchmark runs per (class, thread count) configuration.
REPETITIONS = 5

runtime_means = []
runtime_errs = []

# Measure every configuration. %env sets the variable in the kernel's
# environment, so the benchmark subprocess inherits the thread count.
for class_, num_threads in zip(CLASSES, NUM_THREADS):
  %env OMP_NUM_THREADS={num_threads}
  runtime_mean, runtime_err = execute_class(class_, REPETITIONS)

  runtime_means.append(runtime_mean)
  runtime_errs.append(runtime_err)

efficiency_means = []
efficiency_errs = []

# Efficiency is computed with calc_speedup, i.e. as the runtime of the first
# configuration divided by the runtime of each configuration.
for runtime_mean, runtime_err in zip(runtime_means, runtime_errs):
  efficiency_mean, efficiency_err = calc_speedup(runtime_mean, runtime_err, runtime_means[0], runtime_errs[0])
  efficiency_means.append(efficiency_mean)
  efficiency_errs.append(efficiency_err)

print("\nSummary:")
for i in range(len(NUM_THREADS)):
  print(f'Class: {CLASSES[i]}, Threads: {NUM_THREADS[i]:>3}, Runtime: {runtime_means[i]:>10.6f} ± {runtime_errs[i]:>10.6f} [sec], Efficiency: {efficiency_means[i]:>8.2f} ± {efficiency_errs[i]:>8.2f}')

# Compact "(threads,value)" lists, printed on a single line each.
print("\nRuntimes:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{runtime_means[i]:.6f})",end='')
print()

print("\nEfficiency:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{efficiency_means[i]:.2f})",end='')
print()


env: OMP_NUM_THREADS=1
Class: B, Iterations: 5


  [001]  22.716913 [sec] | Average Runtime:  22.716913 ± 0.000000 [sec]
  [002]  22.719948 [sec] | Average Runtime:  22.718431 ± 0.002146 [sec]
  [003]  24.460083 [sec] | Average Runtime:  22.718431 ± 0.002146 [sec]
  [004]  23.055795 [sec] | Average Runtime:  22.830885 ± 0.194783 [sec]
  [005]  23.804521 [sec] | Average Runtime:  23.074294 ± 0.512138 [sec]

env: OMP_NUM_THREADS=4
Class: C, Iterations: 5
  [001]  32.077306 [sec] | Average Runtime:  32.077306 ± 0.000000 [sec]
  [002]  30.444476 [sec] | Average Runtime:  31.260891 ± 1.154585 [sec]
  [003]  32.735093 [sec] | Average Runtime:  32.406199 ± 0.465126 [sec]
  [004]  31.910647 [sec] | Average Runtime:  32.241015 ± 0.435922 [sec]
  [005]  32.990761 [sec] | Average Runtime:  32.428452 ± 0.516929 [sec]


Summary:
Class: B, Threads:   1, Runtime:  23.074294 ±   0.512138 [sec], Efficiency:     1.00 ±     0.03
Class: C, Threads:   4, Runtime:  32.428452 ±   0.516929 [sec], Efficiency:     0.71 ±     0.02

Runtimes:
(1,23.074294)(4,32