# This notebook processes the file `All_OptimizationsSYCL-CUDA-<hostname>.csv`

1. Run `process_sycl_cuda_results.ipynb` to get the file `<hostname>/All_OptimizationsSYCL-CUDA-<hostname>.csv`.

2. Run the next cells in this notebook...

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from common.utils import get_best_optimization

# Get the hostname of the server; the result files live in a directory named
# after it. The pipe is closed explicitly instead of being left open.
with os.popen("hostname") as hostname_pipe:
    hostname = hostname_pipe.read().strip()

# Load the C++ results and add the Total time (tree construction + OWM traversal)
df = pd.read_csv(os.path.join(hostname, f'All_Optimizations-{hostname}.csv'), sep=';')
df.insert(4, "Total", df['TimeTree'] + df['TimeOWM'])
# Best C++ optimization as reported by the project helper
# (NOTE(review): its selection criterion is defined in common.utils, not here)
bestcpp, bestcpp_label = get_best_optimization(df)
print(f'Best optimization for C++: {bestcpp_label}')
# Keep only Baseline and the best optimization. reset_index returns a new
# frame, so no in-place modification of a slice is needed.
df = df.loc[df['Optimization'].isin(['Baseline', bestcpp_label])].reset_index(drop=True)

# Load the SYCL-CUDA results and add the same Total column
dfs = pd.read_csv(os.path.join(hostname, f'All_OptimizationsSYCL-CUDA-{hostname}.csv'), sep=';')
dfs.insert(4, "Total", dfs['TimeTree'] + dfs['TimeOWM'])
print(df)
dfs

In [None]:
# Timing columns (TimeTree..Total) of the Baseline rows of the C++ results
is_baseline = df['Optimization'] == 'Baseline'
base = df.loc[is_baseline, 'TimeTree':'Total'].copy()
# Best SYCL-CUDA optimization as reported by get_best_optimization
bestsycl, bestsycl_label = get_best_optimization(dfs)
print(f'Best SYCL-CUDA optimization: {bestsycl_label}')
# Element-wise ratio baseline / best SYCL-CUDA
b = np.array(base)
c = np.array(bestsycl)
speedupSYCL = b / c
# Show every float as e.g. "1.23x"
np.set_printoptions(formatter={'float': lambda value: f"{value:0.2f}x"})
# Print the speedups (one cloud per row, one column for each time measurement)
print("Speedup of CUDA over OMP baseline:")
print(speedupSYCL)

In [None]:
# Element-wise ratio Baseline / best C++ (TBB) optimization.
# NOTE(review): the previous label announced "CUDA over optim 4 TBB", but no
# CUDA timing enters this ratio. If the CUDA-vs-TBB speedup was intended, the
# ratio should be np.array(bestcpp) / np.array(bestsycl) instead -- confirm.
b = np.array(base)
c = np.array(bestcpp)
speedupCPP = b / c
# Show every float as e.g. "1.23x"
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}x".format(x)})
# Print the speedups (one cloud per row, one column for each time measurement)
print(f"Speedup of best TBB CPU version ({bestcpp_label}) over OMP baseline:")
print(speedupCPP)

# Speedup of each optimization w.r.t. the previous one

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'
# Reload the SYCL-CUDA results and add the Total time column
dfs = pd.read_csv(os.path.join(hostname, f'All_OptimizationsSYCL-CUDA-{hostname}.csv'), sep=';')
dfs.insert(4, "Total", dfs['TimeTree'] + dfs['TimeOWM'])
# Discard every optimization whose name contains "nomemo"
has_nomemo = dfs["Optimization"].str.contains("nomemo")
dfm = dfs.loc[~has_nomemo].copy()
# Speedup columns start at 1.0; the first four rows keep that value
for sp_col in ('TreeSp', 'OWMSp', 'TotalSp'):
    dfm[sp_col] = 1.
dfm = dfm.reset_index(drop=True)
# Speedup of each row w.r.t. the row four positions above it
# (presumably the same cloud in the previous optimization -- verify row layout)
column_pairs = (('TimeTree', 'TreeSp'), ('TimeOWM', 'OWMSp'), ('Total', 'TotalSp'))
for row in range(4, len(dfm)):
    for time_col, sp_col in column_pairs:
        dfm.loc[row, sp_col] = dfm[time_col][row - 4] / dfm[time_col][row]
dfm

# Speedup of each optimization w.r.t. SYCL-CPU

In [None]:
# Reload the SYCL-CUDA results and add the Total time column
dfs = pd.read_csv(os.path.join(hostname, f'All_OptimizationsSYCL-CUDA-{hostname}.csv'), sep=';')
dfs.insert(4, "Total", dfs['TimeTree'] + dfs['TimeOWM'])
# Discard every optimization whose name contains "nomemo"
has_nomemo = dfs["Optimization"].str.contains("nomemo")
dfm = dfs.loc[~has_nomemo].copy()

# Speedup columns start at 1.0; the first four rows (the reference) keep it
for sp_col in ('TreeSp', 'OWMSp', 'TotalSp'):
    dfm[sp_col] = 1.
dfm = dfm.reset_index(drop=True)
# Speedup of every later group of four rows w.r.t. the first four rows
# (presumably owm-sycl-cpu, one row per cloud -- verify row layout)
column_pairs = (('TimeTree', 'TreeSp'), ('TimeOWM', 'OWMSp'), ('Total', 'TotalSp'))
for start in range(4, len(dfm), 4):
    for offset in range(4):
        row = start + offset
        for time_col, sp_col in column_pairs:
            dfm.loc[row, sp_col] = dfm[time_col][offset] / dfm[time_col][row]
dfm


# Speedup of each optimization w.r.t. Baseline OpenMP

In [None]:
# Reload the C++ results and add the Total time column
df = pd.read_csv(os.path.join(hostname, f'All_Optimizations-{hostname}.csv'), sep=';')
df.insert(4, "Total", df['TimeTree'] + df['TimeOWM'])
# Timing columns of the Baseline rows, re-indexed from 0
is_baseline = df['Optimization'] == 'Baseline'
base = df.loc[is_baseline, 'TimeTree':'Total'].copy().reset_index(drop=True)

# Reload the SYCL-CUDA results and add the Total time column
dfs = pd.read_csv(os.path.join(hostname, f'All_OptimizationsSYCL-CUDA-{hostname}.csv'), sep=';')
dfs.insert(4, "Total", dfs['TimeTree'] + dfs['TimeOWM'])
# Discard every optimization whose name contains "nomemo"
has_nomemo = dfs["Optimization"].str.contains("nomemo")
dfm = dfs.loc[~has_nomemo].copy()

for sp_col in ('TreeSp', 'OWMSp', 'TotalSp'):
    dfm[sp_col] = 1.
dfm = dfm.reset_index(drop=True)
# Speedup of every group of four rows w.r.t. the Baseline rows
# (row j of `base` is paired with row j of each group)
column_pairs = (('TimeTree', 'TreeSp'), ('TimeOWM', 'OWMSp'), ('Total', 'TotalSp'))
for start in range(0, len(dfm), 4):
    for offset in range(4):
        row = start + offset
        for time_col, sp_col in column_pairs:
            dfm.loc[row, sp_col] = base[time_col][offset] / dfm[time_col][row]
dfm

In [None]:
# Keep only the faster of the two CUDA variants (plain vs. grid), judged by
# the mean Total time over all rows of each variant.
opts = dfm['Optimization'].unique()
if 'owm-cuda' in opts and 'owm-cuda-grid' in opts:
    if dfm.loc[dfm['Optimization'] == 'owm-cuda', 'Total'].mean() > dfm.loc[dfm['Optimization'] == 'owm-cuda-grid', 'Total'].mean():
        # plain CUDA is slower on average: drop it
        drop_label = 'owm-cuda'
    else:
        # grid CUDA is slower (or equal) on average: drop it
        drop_label = 'owm-cuda-grid'
    dfm = dfm[~dfm['Optimization'].isin([drop_label])].copy()

# drop iGPU if it is present
dfm = dfm[~dfm['Optimization'].isin(['owm-sycl-igpu'])].copy()

# Speedup columns as a plain array: one row per (optimization, cloud)
all_a = np.array(dfm.loc[:,'TreeSp':'TotalSp'])
# get the cloud names from the rows of the best SYCL-CUDA optimization
clouds = list(dfm.loc[dfm['Optimization'].isin([bestsycl_label]), 'Cloud'])

bycloud={}
# create a dictionary with the speedups for each cloud: every fourth row
# starting at the cloud's position, transposed to (metric, version)
for i,cloud in enumerate(clouds):
    bycloud[cloud]=all_a[i::4].T

# Emit the speedups as a LaTeX tabular. Raw strings are used because "\h" and
# "\m" are invalid escape sequences in normal string literals; the printed
# text is unchanged.
if hostname in ['alder', 'bombay']:
    print(r"\begin{tabular}{|c|ccc|}\hline")
    print(r" & \multicolumn{3}{c|}{S-CPU} \\")
    print(r"Cloud & Tree & OWM & Tot \\ \hline")
    for i in clouds:
        print(i,end='')
        # only the first version (S-CPU) is reported on these hosts
        for k in range(3):
            print("& {0:0.2f}x ".format(bycloud[i][k][0]),end='')

        print(r"\\ \hline")
    print(r"\end{tabular}")
else:
    print(r"\begin{tabular}{|c|ccc|ccc|ccc|ccc|}\hline")
    print(r" & \multicolumn{3}{c|}{S-CPU} & \multicolumn{3}{c|}{S-dGPU} & \multicolumn{3}{c|}{S-CUDA} & \multicolumn{3}{c|}{CUDA} \\")
    print(r"Cloud & Tree & OWM & Total & Tree & OWM & Total & Tree & OWM & Total & Tree & OWM & Total \\ \hline")
    for i in clouds:
        print(i,end='')
        # four versions, three metrics (Tree, OWM, Total) each
        for j in range(4):
            for k in range(3):
                print("& {0:0.2f}x ".format(bycloud[i][k][j]),end='')

        print(r"\\ \hline")
    print(r"\end{tabular}")

# Speedup of each optimization w.r.t. TBB CPU optim 4

In [None]:
# Reload the C++ results and add the Total time column
df = pd.read_csv(os.path.join(hostname, f'All_Optimizations-{hostname}.csv'), sep=';')
df.insert(4, "Total", df['TimeTree'] + df['TimeOWM'])
# Keep only Baseline and the best C++ optimization, re-indexed from 0
keep_cpp = df['Optimization'].isin(['Baseline', bestcpp_label])
df = df.loc[keep_cpp].reset_index(drop=True)

# Reload the SYCL-CUDA results and add the Total time column
dfs = pd.read_csv(os.path.join(hostname, f'All_OptimizationsSYCL-CUDA-{hostname}.csv'), sep=';')
dfs.insert(4, "Total", dfs['TimeTree'] + dfs['TimeOWM'])
# Discard every optimization whose name contains "nomemo"
has_nomemo = dfs["Optimization"].str.contains("nomemo")
dfm = dfs.loc[~has_nomemo].copy()
# Keep only the CUDA variant (plain vs. grid) with the lower mean Total time
opts = dfm['Optimization'].unique()
if 'owm-cuda' in opts and 'owm-cuda-grid' in opts:
    mean_plain = dfm.loc[dfm['Optimization'] == 'owm-cuda', 'Total'].mean()
    mean_grid = dfm.loc[dfm['Optimization'] == 'owm-cuda-grid', 'Total'].mean()
    drop_label = 'owm-cuda' if mean_plain > mean_grid else 'owm-cuda-grid'
    dfm = dfm.loc[~dfm['Optimization'].isin([drop_label])].copy()

for sp_col in ('TreeSp', 'OWMSp', 'TotalSp'):
    dfm[sp_col] = 1.
dfm = dfm.reset_index(drop=True)
# Speedup of every group of four rows w.r.t. rows 4..7 of df
# (presumably the best C++ optimization, listed after Baseline -- verify)
column_pairs = (('TimeTree', 'TreeSp'), ('TimeOWM', 'OWMSp'), ('Total', 'TotalSp'))
for start in range(0, len(dfm), 4):
    for offset in range(4):
        row = start + offset
        for time_col, sp_col in column_pairs:
            dfm.loc[row, sp_col] = df[time_col][offset + 4] / dfm[time_col][row]
dfm

In [None]:
if hostname in ('alder', 'coffeelake1'):
    # How much faster than CUDA the dGPU is in the OWM traversal phase
    compared = ['owm-sycl-dgpu', 'owm-cuda']
    dfcmp = dfm.loc[dfm['Optimization'].isin(compared), ['Optimization', 'TimeOWM']].copy().reset_index(drop=True)
    # Rows 0..3 are divided into rows 4..7, which serve as the reference
    # (assumes the dGPU rows come first -- verify against the CSV order)
    for row in range(4):
        ref = row + 4
        dfcmp.loc[ref, 'speedup'] = 1.
        dfcmp.loc[row, 'speedup'] = dfcmp.loc[ref, 'TimeOWM'] / dfcmp.loc[row, 'TimeOWM']
    is_dgpu = dfcmp['Optimization'].isin(['owm-sycl-dgpu'])
    mean_speedup = dfcmp.loc[is_dgpu, 'speedup'].mean()
    print(f"Speedup of dGPU over CUDA in the OWM traversal phase: {mean_speedup:0.3f}x")

In [None]:
if hostname in ('alder', 'coffeelake1'):
    # How the dGPU compares with CUDA in terms of total time
    compared = ['owm-sycl-dgpu', 'owm-cuda']
    dfcmp = dfm.loc[dfm['Optimization'].isin(compared), ['Optimization', 'Total']].copy().reset_index(drop=True)
    # Rows 0..3 are divided into rows 4..7, which serve as the reference
    # (assumes the dGPU rows come first -- verify against the CSV order)
    for row in range(4):
        ref = row + 4
        dfcmp.loc[ref, 'speedup'] = 1.
        dfcmp.loc[row, 'speedup'] = dfcmp.loc[ref, 'Total'] / dfcmp.loc[row, 'Total']
    is_dgpu = dfcmp['Optimization'].isin(['owm-sycl-dgpu'])
    mean_speedup = dfcmp.loc[is_dgpu, 'speedup'].mean()
    print(f"Speedup of dGPU over CUDA in terms of total time: {mean_speedup:0.3f}x")

In [None]:
if hostname in ['alder', 'coffeelake1']:
    # How the SYCL dGPU version compares with S-CUDA (owm-sycl-cuda-dgpu)
    # in terms of total time
    dfcmp = dfm.loc[dfm['Optimization'].isin(['owm-sycl-dgpu', 'owm-sycl-cuda-dgpu']), ['Optimization','Total']].copy()
    dfcmp.reset_index(drop=True, inplace=True)
    # Rows 0..3 are divided into rows 4..7, which serve as the reference
    # (assumes the dGPU rows come before the S-CUDA rows -- verify CSV order)
    for i in range(4):
        dfcmp.loc[i+4,'speedup'] = 1.
        dfcmp.loc[i,'speedup'] = dfcmp.loc[i+4,'Total']/dfcmp.loc[i,'Total']
    # The label now names S-CUDA, which is what this cell actually compares
    print(f"Speedup of dGPU over S-CUDA in terms of total time: {dfcmp.loc[dfcmp['Optimization'].isin(['owm-sycl-dgpu']), 'speedup'].mean():0.3f}x")
    # A bare expression nested in an `if` is never displayed, so print it
    print(dfcmp)

In [None]:
# How much faster than the TBB CPU reference the S-CPU version is,
# averaged over the owm-sycl-cpu rows of dfm
is_scpu = dfm['Optimization'].isin(['owm-sycl-cpu'])
dfcmp = dfm.loc[is_scpu].copy()
owm_gain = dfcmp['OWMSp'].mean()
print(f"S-CPU outperforms TBB CPU base by {owm_gain:.2f}x in the OWM traversal phase")
total_gain = dfcmp['TotalSp'].mean()
print(f"S-CPU outperforms TBB CPU base by {total_gain:.2f}x in the Total time")


# Improvement Factor for each cloud

In [None]:
def plot_allsp(df,xlab):
    """Plot per-cloud improvement factors as line charts and save them as PDF.

    One subplot is drawn per cloud, with one line each for the OWM traversal,
    tree construction and total improvement factors, taken from the
    'TreeSp'..'TotalSp' columns of *df*. The figure is written to
    ``<hostname>/Speedup_all_CUDA-<hostname>.pdf``, where ``hostname`` is the
    module-level variable.

    Args:
        df: DataFrame with 'Optimization', 'Cloud' and the 'TreeSp'..'TotalSp'
            columns. Rows are assumed to be grouped by optimization, with the
            clouds in the same order inside every group -- TODO confirm
            against the CSV layout.
        xlab: Tick labels, one per plotted version. When 'S-iGPU' is not among
            them, the 'owm-sycl-igpu' rows are dropped before plotting.

    Raises:
        ValueError: if no row matches the best optimization label.
    """
    # Configuration variables
    ylabelfs = 18
    xlabelfs = 18
    xticksfs = 16
    legendfs = 14
    linew = 2
    markers = 8
    marks = ['o-', 'x-', 's-']

    if 'S-iGPU' not in xlab:
        # drop iGPU if it is present in the dataframe
        df = df[~df['Optimization'].isin(['owm-sycl-igpu'])].copy()

    all_a = np.array(df.loc[:,'TreeSp':'TotalSp'])
    _,best_label = get_best_optimization(df)
    # Exact match: str.contains would treat the label as a regex/substring and
    # could also select other optimizations (e.g. 'owm-cuda' vs 'owm-cuda-grid')
    clouds = list(df.loc[df['Optimization'] == best_label, 'Cloud'])
    ncloud = len(clouds)
    if ncloud == 0:
        raise ValueError(f"no rows found for optimization {best_label!r}")

    # Rows i, i+ncloud, i+2*ncloud, ... belong to cloud i; transposing gives
    # one row per metric (Tree, OWM, Total) and one column per version
    bycloud = {cloud: all_a[i::ncloud].T for i, cloud in enumerate(clouds)}

    labels=['OWM Trav.','Tree Const.','Total']
    x=np.arange(1,len(xlab)+1)
    # define grid of plots; squeeze=False keeps a 2-D axes array for one cloud
    fig, axs = plt.subplots(nrows=1, ncols=ncloud, figsize=(15, 5),
                            constrained_layout=True, squeeze=False)
    axs = axs[0]
    for ax, name in zip(axs, clouds):
        # plotting order matches `labels`: OWM (row 1), Tree (row 0), Total (row 2)
        ax.plot(x, bycloud[name][1], marks[0], linewidth=linew, markersize=markers)
        ax.plot(x, bycloud[name][0], marks[1], linewidth=linew, markersize=markers)
        ax.plot(x, bycloud[name][2], marks[2], linewidth=linew, markersize=markers)

        ax.set_title(name,fontsize=16)
        ax.set_xlabel('Version', fontsize=xlabelfs)
        ax.set_xticks(x,labels=xlab,fontsize=xticksfs,rotation = 45)
        ax.grid()
    fig.suptitle(f'Improvement factor of SYCL and CUDA versions wrt best TBB CPU (O4) @ {hostname.upper()}',  fontweight='bold', fontsize=18)

    axs[0].set_ylabel('Improvement Factor', fontsize=ylabelfs)
    axs[0].legend(labels,loc='best', fontsize=legendfs)
    # the context manager closes the PDF even if savefig raises
    with PdfPages(os.path.join(hostname, f"Speedup_all_CUDA-{hostname}.pdf")) as pp:
        pp.savefig(fig)

# Version labels for the line plot depend on the host; any host not listed
# here gets the full four-version set
line_xlab_by_host = {
    'bombay': ['S-CPU'],
    'alder': ['S-CPU','S-dGPU','CUDA'],
}
plot_allsp(dfm, line_xlab_by_host.get(hostname, ['S-CPU','S-dGPU','S-CUDA','CUDA']))

In [None]:
def plot_allspbar(df,xlab):
    """Plot per-cloud improvement factors as grouped bars and save them as PDF.

    One subplot is drawn per cloud, with three bars per version (OWM
    traversal, tree construction, total) taken from the 'TreeSp'..'TotalSp'
    columns of *df*, plus a dashed line at 1.0 labelled 'TBB base'. The figure
    is written to ``<hostname>/Speedup_all_bars_CUDA-<hostname>.pdf``, where
    ``hostname`` is the module-level variable.

    Args:
        df: DataFrame with 'Optimization', 'Cloud' and the 'TreeSp'..'TotalSp'
            columns. Rows are assumed to be grouped by optimization, with the
            clouds in the same order inside every group -- TODO confirm
            against the CSV layout.
        xlab: Tick labels, one per plotted version. When 'S-iGPU' is not among
            them, the 'owm-sycl-igpu' rows are dropped before plotting.

    Raises:
        ValueError: if no row matches the best optimization label.
    """
    # Configuration variables
    ylabelfs = 18
    xlabelfs = 18
    xticksfs = 16
    legendfs = 14
    linew = 2
    markers = 8

    if 'S-iGPU' not in xlab:
        # drop iGPU if it is present in the dataframe
        df = df[~df['Optimization'].isin(['owm-sycl-igpu'])].copy()

    all_a = np.array(df.loc[:,'TreeSp':'TotalSp'])
    _,best_label = get_best_optimization(df)
    # Exact match: str.contains would treat the label as a regex/substring and
    # could also select other optimizations (e.g. 'owm-cuda' vs 'owm-cuda-grid')
    clouds = list(df.loc[df['Optimization'] == best_label, 'Cloud'])
    ncloud = len(clouds)
    if ncloud == 0:
        raise ValueError(f"no rows found for optimization {best_label!r}")

    # Rows i, i+ncloud, i+2*ncloud, ... belong to cloud i; transposing gives
    # one row per metric (Tree, OWM, Total) and one column per version
    bycloud = {cloud: all_a[i::ncloud].T for i, cloud in enumerate(clouds)}

    labels=['TBB base','OWM Trav.','Tree Const.','Total']
    x=np.arange(1,len(xlab)+1)
    width=0.3
    # x positions of the dashed reference line at 1.0
    xx=np.arange(1-2*width,len(xlab)+1+2*width)
    # define grid of plots; squeeze=False keeps a 2-D axes array for one cloud
    fig, axs = plt.subplots(nrows=1, ncols=ncloud, figsize=(15, 5),
                            constrained_layout=True, squeeze=False)
    axs = axs[0]
    for ax, name in zip(axs, clouds):
        # bar order: OWM (row 1), Tree (row 0), Total (row 2)
        ax.bar(x-width, bycloud[name][1],width)
        ax.bar(x, bycloud[name][0],width)
        ax.bar(x+width, bycloud[name][2],width)
        ax.plot(xx,np.ones(len(xx)), 'k--', linewidth=linew, markersize=markers)
        ax.set_title(name,fontsize=16)
        ax.set_xlabel('Version', fontsize=xlabelfs)
        ax.set_xlim(1-2*width,len(xlab)+2*width)
        ax.set_xticks(x,labels=xlab,fontsize=xticksfs,rotation = 45)
        # one y tick per unit of improvement factor
        start, end = ax.get_ylim()
        ax.yaxis.set_ticks(np.arange(start, end, 1))
        ax.grid()
    fig.suptitle(f'Improvement factor of SYCL and CUDA versions wrt best TBB CPU version (O4) @ {hostname.upper()}',  fontweight='bold', fontsize=18)

    axs[0].set_ylabel('Improvement Factor', fontsize=ylabelfs)
    axs[0].legend(labels,loc='best', fontsize=legendfs)
    # the context manager closes the PDF even if savefig raises
    with PdfPages(os.path.join(hostname, f"Speedup_all_bars_CUDA-{hostname}.pdf")) as pp:
        pp.savefig(fig)

# Version labels for the bar plot depend on the host; any host not listed
# here gets the full four-version set
bar_xlab_by_host = {
    'bombay': ['S-CPU'],
    'alder': ['S-CPU','S-iGPU','S-dGPU','CUDA'],
}
plot_allspbar(dfm, bar_xlab_by_host.get(hostname, ['S-CPU','S-dGPU','S-CUDA','CUDA']))
   

# Compute millions of points processed per second

In [None]:
# Show the rows of the best SYCL-CUDA optimization
dfm.loc[dfm['Optimization'] == bestsycl_label]

In [None]:
# Show the rows of the best C++ optimization
df.loc[df['Optimization'] == bestcpp_label]

In [None]:
# Number of points of each cloud, in the row order of the result tables
# (NOTE(review): the pairing with the 'Cloud' column is positional -- confirm)
NumberOfPoints=[20380212, 40706503, 42384876, 48024480]
# dictionary to save efficiency (points/s/W), one list per device
efficiency = {}

# TDP in watts of each device, per host
_TDP_WATTS = {
    'coffeelake1': {'cpu': 95, 'igpu': 15, 'dgpu': 160},
    # iGPU: https://www.techpowerup.com/gpu-specs/uhd-graphics-770.c3844
    'alder': {'cpu': 125, 'igpu': 15, 'dgpu': 200},
    'bombay': {'cpu': 350},
}
if hostname not in _TDP_WATTS:
    # previously this surfaced later as a NameError on cpu_tdp
    raise ValueError(f"no TDP values defined for host {hostname!r}")
cpu_tdp = _TDP_WATTS[hostname]['cpu']
igpu_tdp = _TDP_WATTS[hostname].get('igpu')
dgpu_tdp = _TDP_WATTS[hostname].get('dgpu')


def _report_throughput(frame, tdp):
    """Print throughput and efficiency per cloud; return points/s/W per cloud.

    *frame* must be indexed 0..3 and provide 'Cloud' and 'Total' columns;
    *tdp* is the power figure in watts used for the efficiency.
    """
    per_watt = []
    for i, npoints in enumerate(NumberOfPoints):
        throughput = npoints / frame['Total'][i]
        per_watt.append(throughput / tdp)
        print('Cloud: {}, Throughput: {:.3e} points/s, Efficiency: {:.3f} Mpoints/s/W @ TDP={}W'.format(frame['Cloud'][i], throughput, throughput / tdp / 1e6, tdp))
    return per_watt


# select the Total times for best opt in TBB
bestcpp_df = df[df['Optimization'] == bestcpp_label].reset_index(drop=True)
print(f'Throughput of best TBB CPU version (O4 - {bestcpp_label}):')
efficiency['tbb'] = _report_throughput(bestcpp_df, cpu_tdp)

# select the Total times for S-CPU (reported only, not stored in `efficiency`)
scpu_df = dfm[dfm['Optimization'] == 'owm-sycl-cpu'].reset_index(drop=True)
print('Throughput of S-CPU version:')
_report_throughput(scpu_df, cpu_tdp)

if hostname in ['alder', 'coffeelake1']:
    # select the Total times for iGPU
    igpu_df = dfm[dfm['Optimization'] == 'owm-sycl-igpu'].reset_index(drop=True)
    print('Throughput of iGPU version:')
    efficiency['igpu'] = _report_throughput(igpu_df, igpu_tdp)

    # select the Total times for best opt in SYCL-CUDA
    bestsycl_df = dfm[dfm['Optimization'] == bestsycl_label].reset_index(drop=True)
    print(f'Throughput of best GPU version ({bestsycl_label}):')
    efficiency['dgpu'] = _report_throughput(bestsycl_df, dgpu_tdp)

In [None]:
def plot_efficiency(eff, cloud_names=None):
    """Draw a grouped bar chart of the efficiency of each version per cloud.

    Args:
        eff: dict with the keys 'tbb', 'igpu' and 'dgpu', each a sequence of
            efficiencies in points/s/W (one entry per cloud). Values are
            divided by 1e6 for display in Mpoints/s/W.
        cloud_names: tick labels, one per cloud. Defaults to the module-level
            ``clouds`` list, which is what the function used implicitly before.
    """
    # Configuration variables
    titlefs = 20
    ylabelfs = 18
    xlabelfs = 18
    xticksfs = 16
    legendfs = 14

    if cloud_names is None:
        cloud_names = clouds

    # one group of three bars per cloud
    fig, ax = plt.subplots()
    positions = np.arange(len(eff['tbb']))
    width = 0.3
    ax.bar(positions-width, [value/1e6 for value in eff['tbb']], width, label='TBB CPU')
    ax.bar(positions, [value/1e6 for value in eff['igpu']], width, label='S-iGPU')
    ax.bar(positions+width, [value/1e6 for value in eff['dgpu']], width, label='CUDA')
    ax.set_xticks(positions, labels=cloud_names, fontsize=xticksfs)
    ax.set_xlabel('Cloud', fontsize=xlabelfs)
    ax.set_ylabel('Efficiency [Mpoints/s/W]', fontsize=ylabelfs)
    ax.set_title('Efficiency of the different versions', fontsize=titlefs, fontweight='bold')
    ax.legend(loc='best', fontsize=legendfs)
    ax.grid()
    fig.tight_layout()
# the 'igpu' and 'dgpu' entries are only filled in on these hosts
if hostname in ['alder', 'coffeelake1']:
    plot_efficiency(efficiency)