# Parallel Computing Project

### Run the following cell to define the auxiliary functions

In [1]:
import subprocess
import statistics
import re
from math import sqrt

def execute_class(class_, num_iters):
    """Run the benchmark binary for one problem class and collect its runtimes.

    Executes ``./bin/ft.<class_>`` ``num_iters`` times. After every run the
    program output is checked for a successful verification, the reported
    runtime is parsed, and the running statistics are printed.

    Args:
        class_: Problem-class suffix of the binary to run (e.g. 'S', 'W', 'A').
        num_iters: Number of times to run the binary; must be at least 1.

    Returns:
        Tuple ``(runtime_mean, runtime_err)`` as returned by ``calc_stats``
        for all collected runtimes.

    Raises:
        ValueError: If ``num_iters`` is smaller than 1.
        subprocess.CalledProcessError: If the binary exits with a non-zero status.
        AssertionError: If the output has no verification or timing line, or
            the verification result is not 'SUCCESSFUL'.
    """
    # Without at least one run there is nothing to average; fail early with a
    # clear message instead of hitting an unbound variable at the return.
    if num_iters < 1:
        raise ValueError(f'num_iters must be at least 1, got {num_iters}')

    cmd = f'./bin/ft.{class_}'

    runtimes = []

    print(f'Class: {class_}, Iterations: {num_iters}')

    for i in range(num_iters):
        # The command is a single executable path with no arguments, so pass
        # it as a one-element argv rather than splitting on spaces.
        res = subprocess.check_output([cmd]).decode('utf-8')

        # The benchmark self-verifies its result; refuse to time a wrong run.
        match = re.search(r'Verification\s+=\s+(\w+)', res)
        assert match, 'No match found!'
        assert match.group(1) == 'SUCCESSFUL', f'Verification = {match.group(1)}'

        match = re.search(r' Time in seconds\s+=\s+(\d+\.\d+)', res)
        assert match, 'No match found!'
        runtime = float(match.group(1))
        runtimes.append(runtime)

        # Recompute the statistics over all runs so far to show progress.
        runtime_mean, runtime_err = calc_stats(runtimes)
        print(f'  [{(i+1):03d}] {runtime:>10.6f} [sec] | Average Runtime: {runtime_mean:>10.6f} ± {runtime_err:.6f} [sec]')

    print()

    return runtime_mean, runtime_err


def calc_speedup(runtime_mean, runtime_err, ref_mean, ref_err):
    """Compute the speedup relative to a reference runtime, with its uncertainty.

    The speedup is ``ref_mean / runtime_mean``. Its uncertainty combines the
    relative errors of both runtimes in quadrature and scales the result by
    the absolute speedup.

    Args:
        runtime_mean: Mean runtime of the measured configuration.
        runtime_err: Uncertainty of ``runtime_mean``.
        ref_mean: Mean runtime of the reference configuration.
        ref_err: Uncertainty of ``ref_mean``.

    Returns:
        Tuple ``(speedup_mean, speedup_err)``.

    Raises:
        ZeroDivisionError: If ``runtime_mean`` or ``ref_mean`` is zero.
    """
    speedup_mean = ref_mean / runtime_mean

    # Relative (fractional) uncertainty of each of the two runtimes.
    rel_runtime = runtime_err / runtime_mean
    rel_ref = ref_err / ref_mean

    combined_rel = sqrt(rel_runtime**2 + rel_ref**2)
    speedup_err = combined_rel * abs(speedup_mean)

    return speedup_mean, speedup_err


def calc_stats(runtimes):
    """Return the mean and sample standard deviation of a list of runtimes.

    With more than two samples, values that lie one standard deviation or
    more away from the mean are treated as outliers and dropped before the
    statistics are recomputed. The filtered statistics are used only if at
    least two samples survive; otherwise the unfiltered values are returned.

    Args:
        runtimes: Non-empty sequence of runtimes.

    Returns:
        Tuple ``(runtime_mean, runtime_err)`` where ``runtime_err`` is the
        sample standard deviation (0.0 for a single sample).

    Raises:
        ValueError: If ``runtimes`` is empty.
    """
    if len(runtimes) == 0:
        raise ValueError("runtimes array is empty")
    if len(runtimes) == 1:
        # A single sample has no spread to estimate.
        return runtimes[0], 0.0

    runtime_mean = statistics.mean(runtimes)
    runtime_stdev = statistics.stdev(runtimes)

    # Filter out outliers
    if len(runtimes) > 2:
        runtimes_filtered = [
            runtime for runtime in runtimes
            if abs(runtime - runtime_mean) < runtime_stdev
        ]
        # The strict 1-sigma cut can reject all samples (identical runtimes,
        # stdev == 0) or all but one (e.g. [1, 2, 3]); statistics.stdev needs
        # at least two points, so keep the unfiltered statistics then.
        if 2 <= len(runtimes_filtered) < len(runtimes):
            runtime_mean = statistics.mean(runtimes_filtered)
            runtime_stdev = statistics.stdev(runtimes_filtered)

    return runtime_mean, runtime_stdev

### Build CPU implementation for all problem sizes

In [5]:
# Start from a clean tree, then build the FT benchmark once per problem class
# (S, W, A, B, C, D). execute_class() later runs the result as ./bin/ft.<CLASS>.
# NOTE(review): the class is presumably baked in at build time, hence one build
# per class — confirm against the Makefile.
!make cleanall
!make FT CLASS=S
!make FT CLASS=W
!make FT CLASS=A
!make FT CLASS=B
!make FT CLASS=C
!make FT CLASS=D

rm -f core
rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
rm -f sys/setparams sys/makesuite sys/setparams.h
rm -r bin/*
   =      NAS PARALLEL BENCHMARKS 4.1        =
   =      OpenMP Versions                    =
   =      C++                                =

cd FT; make CLASS=S
make[1]: Entering directory '/home/u87ec7d5f07e9cff6342867a22f07aef/multi-core-processing-project/ours_CPU/FT'
make[2]: Entering directory '/home/u87ec7d5f07e9cff6342867a22f07aef/multi-core-processing-project/ours_CPU/sys'
g++ -g -fopenmp -o setparams setparams.cpp
make[2]: Leaving directory '/home/u87ec7d5f07e9cff6342867a22f07aef/multi-core-processing-project/ours_CPU/sys'
../sys/setparams ft S
icx  -c -I../common -I/opt/intel/oneapi/vtune/2024.1/sdk/include -xCORE-AVX512 -Ofast -fiopenmp -ipo -qopt-mem-layout-trans=4 -mprefer-vector-width=512 -fopenmp-targets=spir64  -qopt-report=max  -g -gline-tables-only -fdebug-info-for-profiling  ft.c
cd ../common; icx  -c -I../common -I/opt/intel/oneapi/vtune

### Strong Scaling

In [3]:
CLASS = 'A' # one of: 'S', 'W', 'A', 'B', 'C', 'D'
REPETITIONS = 2
NUM_THREADS=[1, 2, 4, 8, 16, 32, 56, 112, 224] # max 224

runtime_means = []
runtime_errs = []

for num_threads in NUM_THREADS:
  %env OMP_NUM_THREADS={num_threads}
  runtime_mean, runtime_err = execute_class(CLASS, REPETITIONS)

  runtime_means.append(runtime_mean)
  runtime_errs.append(runtime_err)

speedup_means = []
speedup_errs = []

for mean, err in zip(runtime_means, runtime_errs):
  speedup_mean, speedup_err = calc_speedup(runtime_mean, runtime_err, runtime_means[0], runtime_errs[0])
  speedup_means.append(speedup_mean)
  speedup_errs.append(speedup_err)

print("\nSummary:")
for i in range(len(NUM_THREADS)):
  print(f'Threads: {NUM_THREADS[i]:>3}, Runtime: {runtime_means[i]:>10.6f} ± {runtime_errs[i]:>10.6f} [sec], Speedup: {speedup_means[i]:>8.2f} ± {speedup_errs[i]:>8.2f}')

print("\nRuntimes:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{runtime_means[i]:.6f})",end='')
print()

print("\nSpeedups:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{speedup_means[i]:.2f})",end='')
print()


env: OMP_NUM_THREADS=1
Class: A, Iterations: 2


KeyboardInterrupt: 

### Weak Scaling

In [6]:
# CLASSES = ['S', 'W', 'A']
# NUM_THREADS=[1, 2, 32]

CLASSES = ['W', 'A', 'B']
NUM_THREADS=[1, 16, 214]

# CLASSES = ['A', 'B', 'C']
# NUM_THREADS=[3, 40, 160]

# CLASSES = ['B', 'C', 'D']
# NUM_THREADS=[1, 40, 80]

REPETITIONS = 10

runtime_means = []
runtime_errs = []

for class_, num_threads in zip(CLASSES, NUM_THREADS):
  %env OMP_NUM_THREADS={num_threads}
  runtime_mean, runtime_err = execute_class(class_, REPETITIONS)

  runtime_means.append(runtime_mean)
  runtime_errs.append(runtime_err)

efficiency_means = []
efficiency_errs = []

for runtime_mean, runtime_err in zip(runtime_means, runtime_errs):
  efficiency_mean, efficiency_err = calc_speedup(runtime_mean, runtime_err, runtime_means[0], runtime_errs[0])
  efficiency_means.append(efficiency_mean)
  efficiency_errs.append(efficiency_err)

print("\nSummary:")
for i in range(len(NUM_THREADS)):
  print(f'Class: {CLASSES[i]}, Threads: {NUM_THREADS[i]:>3}, Runtime: {runtime_means[i]:>10.6f} ± {runtime_errs[i]:>10.6f} [sec], Efficiency: {efficiency_means[i]:>8.2f} ± {efficiency_errs[i]:>8.2f}')

print("\nRuntimes:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{runtime_means[i]:.6f})",end='')
print()

print("\nEfficiency:")
for i in range(len(NUM_THREADS)):
  print(f"({NUM_THREADS[i]},{efficiency_means[i]:.2f})",end='')
print()


env: OMP_NUM_THREADS=1
Class: W, Iterations: 10


KeyboardInterrupt: 

In [16]:
# Measure a single problem class many times to obtain a stable runtime estimate.
class_ = 'C'
num_iters = 100

runtime_mean, runtime_err = execute_class(class_, num_iters)

# Use fixed-point '.6f' (six decimals), matching the rest of the notebook and
# the recorded value below; a bare '.6' would mean six significant digits.
print(f'runtime_mean: {runtime_mean:.6f}')
print(f'runtime_err : {runtime_err:.6f}')

# ours: 3.304523 ± 0.630665

Class: C, Iterations: 100


  [001] Runtime: 12.340323 ± 0.000000 [sec] (12.340323)
  [002] Runtime: 12.148646 ± 0.542143 [sec] (11.956970)
  [003] Runtime: 12.074489 ± 0.461467 [sec] (11.926174)
  [004] Runtime: 13.071470 ± 4.005685 [sec] (16.062414)


KeyboardInterrupt: 

In [10]:
# Reference runtimes (mean and error, in seconds) per problem class. Entries
# left at 0.000000 are placeholders with no recorded reference.
# Iterations:        1000           1000           200            100            50              2
ref_mean_by_class = {'S': 0.000000, 'W': 0.000000, 'A': 0.000000, 'B': 2.913119, 'C': 0.000000, 'D': 0.000000}
ref_err_by_class  = {'S': 0.000000, 'W': 0.000000, 'A': 0.000000, 'B': 0.629813, 'C': 0.000000, 'D': 0.000000}

class_ = 'B'
num_iters = 50

ref_mean = ref_mean_by_class[class_]
ref_err = ref_err_by_class[class_]

# This cell used to call `benchmark_class`, which is not defined in this
# notebook and raises NameError on a fresh kernel. Measure with execute_class
# and compare against the reference with calc_speedup instead.
runtime_mean, runtime_err = execute_class(class_, num_iters)
print(f'Runtime: {runtime_mean:.6f} ± {runtime_err:.6f} [sec]')

if ref_mean > 0:
  speedup_mean, speedup_err = calc_speedup(runtime_mean, runtime_err, ref_mean, ref_err)
  print(f'Speedup: {speedup_mean:>8.2f} ± {speedup_err:>8.2f}')
else:
  # calc_speedup divides by ref_mean, so skip classes with a 0.0 placeholder.
  print(f'No reference runtime recorded for class {class_}; speedup not computed')

Class: B, Iterations: 50


  [001] Runtime: 3.298517 ± 0.000000 [sec] (3.298517)
  [002] Runtime: 3.089596 ± 0.590916 [sec] (2.880676)
  [003] Runtime: 3.125022 ± 0.435489 [sec] (3.195872)
  [004] Runtime: 3.032199 ± 0.514090 [sec] (2.753733)
  [005] Runtime: 3.110229 ± 0.565677 [sec] (3.422349)


KeyboardInterrupt: 