In [136]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tabulate import tabulate
import re
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import os
import sys

# Directory that contains the `helper` package imported right below.
# It can be overridden with the GPGPU_SIM_RESULTS_DIR environment variable so the
# notebook is not tied to one machine; the original absolute path stays the default.
module_path = os.environ.get(
    'GPGPU_SIM_RESULTS_DIR',
    '/home/serinatan/project/gpgpu-sim_simulations/util/job_launching/results')
# Only append once so re-running the cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)
    
import helper.help_iso as hi
import helper.format as fmt


# Benchmark name -> integer flag (0 or 1).
# NOTE(review): the meaning of the flag is not visible in this notebook --
# presumably a benchmark category used by the helper modules; confirm.
bench_dict = {
    'cut_sgemm-0': 0,
    'cut_sgemm-1': 0,
    'cut_wmma-0': 0,
    'cut_wmma-1': 0,
    'parb_stencil-0': 1,
    'parb_sgemm-0': 0,
    'parb_lbm-0': 1,
    'parb_spmv-0': 1,
    'parb_cutcp-0': 0,
}

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names. Use the new name when the installed matplotlib
# provides it and fall back to the original name on older installations.
paper_style = ('seaborn-v0_8-paper'
               if 'seaborn-v0_8-paper' in mpl.style.available
               else 'seaborn-paper')
mpl.style.use(paper_style)





The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read CSVs

In [341]:
# Load the result tables. Only the sequential baseline is indexed by 'pair_str',
# because it is looked up by benchmark name through .loc further down.
df_seq = pd.read_csv('seq.csv', index_col='pair_str')
df_seq_rand = pd.read_csv('seq-rand.csv')
df_intra = pd.read_csv('intra.csv')
df_inter = pd.read_csv('inter.csv')
print(df_intra.columns)

# Alphabetically ordered list of the distinct benchmarks present in the intra runs.
bench_list = sorted(df_intra['pair_str'].unique())

Index(['pair_str', 'config', 'gpusim_version', 'jobId', 'grid_x', 'grid_y',
       'grid_z', 'block_x', 'block_y', 'block_z', 'ctas/SM', 'runtime', 'ipc',
       'instructions', 'avg_mem_lat', 'avg_core_to_l2', 'avg_l2_to_core',
       'avg_mrq_latency', 'stall_core_ldst', 'stall_icnt_to_l2', 'l2_BW',
       'l2_rshr_entry_fail', 'l2_rshr_merge_fail', 'stall_l2_to_icnt',
       'l1D_miss_rate', 'l2_miss_rate', 'l2_total_accesses', 'packet_lat_out',
       'network_lat_out', 'inject_out', 'accepted_out', 'packet_lat_in',
       'network_lat_in', 'inject_in', 'accepted_in',
       'mem_subpartition_parallism', 'mem_subpartition_parallism_util',
       'L2_reservation_fail', 'empty_warp', 'stall_warp', 'idle_warp',
       'scoreboard_warp', 'tot_warp_insn', 'regs', 'smem', 'dram_eff',
       'dram_bw', 'row_buffer_locality', 'mrqq', 'total_cmd', 'wasted_col',
       'wasted_row', 'mem_idle', 'CCDLc', 'WTRc', 'RTWc', 'RCDc', 'RCDWRc'],
      dtype='object')


In [342]:
# scale all IPC to baseline
# baseline_ipc = pd.Series(df_seq.ipc.values,index=df_seq.pair_str).to_dict() 
# baseline_runtime = pd.Series(df_seq.runtime.values,index=df_seq.pair_str).to_dict()
# baseline_mf = pd.Series(df_seq.avg_mem_lat.values,index=df_seq.pair_str).to_dict()
def normalize(df, index, metric, value, inverse):
    """Relate `value` to the reference entry `df.loc[index, metric]`.

    Args:
        df: frame holding the reference values.
        index: row label (or label tuple) selecting the reference row.
        metric: column name of the reference metric.
        value: the measurement to normalize.
        inverse: if truthy return reference / value, otherwise value / reference.

    Returns:
        The ratio described above.
    """
    reference = df.loc[index, metric]
    return reference / value if inverse else value / reference
    
def norm_over_seq(index, metric, value, inverse=True):
    """Normalize `value` against the sequential baseline table `df_seq`.

    With the default inverse=True the result is baseline / value; with
    inverse=False it is value / baseline (see `normalize`).
    """
    return normalize(df=df_seq, index=index, metric=metric, value=value, inverse=inverse)

def norm_over_intra(index, metric, value, inverse=True):
    """Normalize `value` against the intra-SM results table `df_intra`.

    With the default inverse=True the result is reference / value; with
    inverse=False it is value / reference (see `normalize`).
    """
    return normalize(df=df_intra, index=index, metric=metric, value=value, inverse=inverse)
 

## Process columns

In [351]:

# IPC of every run divided by the sequential-baseline IPC of the same benchmark
# (inverse=False -> value / baseline).
for frame in (df_intra, df_seq_rand, df_inter):
    frame['norm_ipc'] = frame.apply(
        lambda row: norm_over_seq(row['pair_str'], 'ipc', row['ipc'], False),
        axis=1)

# decompose gpgpu-sim configs
# NOTE(review): presumably adds the 'intra'/'inter' and 'l2' columns used below by
# parsing the 'config' string -- confirm in helper.help_iso.
hi.process_config_column('intra', 'l2', df=df_intra)
hi.process_config_column('inter', 'l2', df=df_inter)

# Same derived DRAM / compute utilisation columns for both tables.
# hi.avg_array is applied per cell -- presumably averaging a per-channel list
# (TODO confirm).
for frame in (df_intra, df_inter):
    frame['avg_dram_bw'] = frame['dram_bw'].transform(hi.avg_array)
    frame['avg_dram_eff'] = frame['dram_eff'].transform(hi.avg_array)

    # Fraction of DRAM commands that were not idle.
    frame['dram_busy'] = 1 - np.divide(frame['mem_idle'].transform(hi.avg_array),
                                       frame['total_cmd'].transform(hi.avg_array))

    # Issued warp instructions relative to issued + empty/idle/scoreboard cycles.
    idle_sum = frame[['empty_warp', 'idle_warp', 'scoreboard_warp']].sum(axis=1)
    frame['comp_busy'] = frame['tot_warp_insn'] / (frame['tot_warp_insn'] + idle_sum)

# Resource limits used as denominators for the occupancy ratios
# (named for Volta; units presumably CTAs, threads, bytes and registers per SM).
max_cta_volta = 32
max_thread_volta = 2048
max_smem = 96*1024
max_register = 64*1024

# Share of each limit consumed by the 'intra' CTAs of every run.
df_intra['cta_ratio'] = df_intra['intra'] / max_cta_volta
threads = df_intra['intra'] * df_intra['block_x'] * df_intra['block_y'] * df_intra['block_z']
df_intra['thread_ratio'] = threads / max_thread_volta
df_intra['smem_ratio'] = df_intra['intra'] * df_intra['smem'] / max_smem
df_intra['reg_ratio'] = threads * df_intra['regs'] / max_register
# df_intra['dominant_resc'] = df_intra[['cta_ratio', 'thread_ratio', 'smem_ratio', 'reg_ratio']].max(axis=1)

def pow_2(*resc_list, df=None):
    """Return the element-wise sum of squares of the named columns.

    Args:
        *resc_list: names of the columns to square and add up.
        df: frame to read the columns from. Defaults to the notebook-level
            `df_intra`, which is what the original implementation always used.

    Returns:
        A Series with sum(df[r] ** 2 for r in resc_list). With no column names
        the result is the integer 0 (the original raised UnboundLocalError).
    """
    frame = df_intra if df is None else df
    # sum() starts from 0, so the accumulation order matches the original loop.
    return sum(frame[r] ** 2 for r in resc_list)
            
# Aggregate resource usage: sum of squares of the occupancy ratios, the 'l2'
# configuration value and the DRAM / compute busy fractions.
usage_terms = ('cta_ratio', 'thread_ratio', 'smem_ratio', 'reg_ratio',
               'l2', 'dram_busy', 'comp_busy')
df_intra['usage'] = pow_2(*usage_terms)


#print(df_intra.columns)
# Show both baselines side by side for a quick sanity check.
print('random baseline')
rand_view = df_seq_rand[['pair_str','norm_ipc', 'runtime']]
print(rand_view)
print('normal baseline')
print(df_seq['runtime'])


random baseline
         pair_str           norm_ipc  runtime
0      cut_wmma-0 1.2332812143802612    51048
1    parb_cutcp-0 1.0014339415644784   524424
2      cut_wmma-1 1.5209156083106494  1484505
3     parb_spmv-0 0.9066342771541739    80745
4     cut_sgemm-1 0.9993408051832694  1720200
5    parb_sgemm-0                nan        0
6     cut_sgemm-0 0.9907045980732813    94023
7      parb_lbm-0 1.0850097692843546   807305
8  parb_stencil-0 0.8312483312353802   421345
normal baseline
pair_str
parb_sgemm-0            0
cut_wmma-1        2257808
parb_stencil-0     350242
parb_spmv-0         73206
cut_sgemm-0         93149
cut_wmma-0          62957
parb_cutcp-0       525176
cut_sgemm-1       1719066
parb_lbm-0         875934
Name: runtime, dtype: int64


  if __name__ == '__main__':


In [314]:
def print_intra(df, benchmark):
    """Plot the intra-SM metrics of one benchmark into plots/<benchmark>-intra.pdf.

    Args:
        df: intra-SM results table handed to hi.plot_page_intra.
        benchmark: benchmark name; used in the file name and passed to the plotter.

    hi.plot_page_intra is called once per metric, in the order listed below, with
    the open PdfPages object (presumably adding one page per call -- confirm in
    helper.help_iso).
    """
    metrics = ('norm_ipc', 'avg_dram_bw', 'dram_busy', 'l2_miss_rate',
               'l2_BW', 'l2_total_accesses', 'l1D_miss_rate', 'avg_mem_lat')

    # The output file cannot be opened if the target directory is missing.
    os.makedirs('plots', exist_ok=True)
    filename = os.path.join('plots', '{0}-{1}.pdf'.format(benchmark, 'intra'))

    with PdfPages(filename) as pdf:
        for metric in metrics:
            hi.plot_page_intra(df, metric, benchmark, pdf)

def print_intra_inter(df_intra, df_inter, benchmark):
    """Plot normalized IPC for intra and inter runs into plots/<benchmark>-both.pdf.

    Args:
        df_intra: intra-SM results table.
        df_inter: inter-SM results table.
        benchmark: benchmark name; used in the file name and passed to the plotter.
    """
    # The output file cannot be opened if the target directory is missing.
    os.makedirs('plots', exist_ok=True)
    filename = os.path.join('plots', '{0}-{1}.pdf'.format(benchmark, 'both'))

    with PdfPages(filename) as pdf:
        hi.plot_page_intra_inter(df_intra, df_inter, 'norm_ipc', benchmark, pdf)
 

In [78]:
# Intra vs. inter normalized-IPC plot for parb_stencil-0.
print_intra_inter(df_intra=df_intra, df_inter=df_inter, benchmark='parb_stencil-0')

In [79]:
# Intra vs. inter normalized-IPC plot for parb_cutcp-0.
print_intra_inter(df_intra=df_intra, df_inter=df_inter, benchmark='parb_cutcp-0')

## Intra SM Only

In [35]:
# Write one intra-SM PDF per benchmark, echoing the name as a progress marker.
for bench in bench_list:
    print(bench)
    print_intra(df=df_intra, benchmark=bench)

cut_sgemm-1
cut_wmma-1
parb_stencil-0
parb_spmv-0
cut_sgemm-0
parb_cutcp-0
parb_lbm-0
cut_wmma-0


In [325]:
# One heat map per benchmark on a 2x4 grid: normalized IPC over the 'intra' and
# 'l2' configuration values.
fig_tot, axs = plt.subplots(nrows=2, ncols=4, figsize=(40, 30))
axs = axs.flat

# zip() stops at the shorter input, so at most eight benchmarks are drawn.
for ax, bench in zip(axs, bench_list):
    _df = df_intra.loc[df_intra['pair_str'] == bench]
    hi.plot_heatmap(_df,
                    x_key='intra', y_key='l2', z_key='norm_ipc',
                    title=bench, axis=ax, scale=1.2)

fig_tot.suptitle('Intra, Normalized IPC', fontsize=18)
fig_tot.savefig('plots/total.pdf')
# Close the figure so it is not rendered inline after being saved.
plt.close()




In [332]:
# Performance per unit of aggregate resource usage.
df_intra['perfdollar'] = df_intra['norm_ipc'] / df_intra['usage']
cols = ['pair_str', 'grid_x', 'grid_y', 'grid_z', 
        'perfdollar', 'intra', 'norm_ipc', 
        'runtime',
        #'usage', 'l2', 'cta_ratio', 
        #'thread_ratio', 'smem_ratio', 'reg_ratio', 
        'l2_miss_rate',
        'avg_mem_lat', 'avg_core_to_l2', 'avg_l2_to_core',
        'comp_busy', 'dram_busy', 'avg_dram_bw', 'avg_dram_eff']

# All configurations that keep at least 80% of the baseline IPC, ordered by
# benchmark and then by ascending perfdollar.
sort = df_intra[cols].sort_values(['pair_str', 'perfdollar'], ascending=[True, True])
sort = sort[sort['norm_ipc'] > 0.8]

# Per benchmark, pick the configuration with the highest perfdollar among those
# with norm_ipc > 0.8 and l2 > 0.25.
best_df = []
for bench in bench_list:
    candidates = df_intra[(df_intra['norm_ipc'] > 0.8)
                          & (df_intra['pair_str'] == bench)
                          & (df_intra['l2'] > 0.25)]
    # idxmax() returns an index *label*, so the row has to be fetched with .loc.
    # The original .iloc lookup was only correct while df_intra still carried
    # its default 0..n-1 index.
    idx = candidates['perfdollar'].idxmax()
    best_df.append(df_intra.loc[idx])
    
# One row per benchmark, restricted to the columns of interest.
best_df = pd.concat(best_df, axis=1).T[cols]

# formatting
pd.options.display.float_format = '{:,}'.format
    
#best_df.style.set_table_styles(fmt.table_style).hide_index()\
#            .format({'norm_ipc': "{:.4f}", 'dominant_resc': '{:.2f}', 
#                     
#                    })\


## Shared

In [319]:
# modify index of dfs to uniquely access each row with keys
# After this cell a row is addressed as df.loc[(pair_str, intra_or_inter, l2), column],
# which is how norm_over_intra is called with a tuple index further down.
# NOTE(review): set_index(..., inplace=True) moves these columns into the index, so
# the cell is not idempotent (a second run raises KeyError), and cells that still
# read df_intra['pair_str'] or df_intra['l2'] as columns must run before this one.
df_intra.set_index(['pair_str', 'intra', 'l2'], inplace=True)
df_inter.set_index(['pair_str', 'inter', 'l2'], inplace=True)


In [1]:
df_pair = pd.read_csv('pair.csv')

# hi.avg_array is applied to every 'dram_bw' cell -- presumably averaging a
# per-channel list (TODO confirm in helper.help_iso).
df_pair['avg_dram_bw'] = df_pair['dram_bw'].transform(hi.avg_array)

# split pair into two benchmarks: break at every '-' that is followed by a
# non-digit, so the numeric '-<n>' suffix stays attached to its benchmark name
pair = [re.split(r'-(?=\D)', p) for p in df_pair['pair_str']]
df_bench = pd.DataFrame(pair, columns=['1_bench', '2_bench'])
df_pair = pd.concat([df_bench, df_pair], axis=1)

# extract resource allocation size
hi.process_config_column('intra', '2_intra', 'l2', '2_l2', df=df_pair)


# calculate slowdown w.r.t seq
# (default inverse=True -> sequential runtime / co-run runtime, so < 1 means slower)
df_pair['1_sld'] = df_pair.apply(lambda row: norm_over_seq(row['1_bench'], 'runtime', row['1_runtime']), 
                                      axis=1)
df_pair['2_sld'] = df_pair.apply(lambda row: norm_over_seq(row['2_bench'], 'runtime', row['2_runtime']), 
                                      axis=1)
# weighted speedup: sum of both normalized progress values
df_pair['ws'] = df_pair['1_sld'] + df_pair['2_sld']

# fairness: ratio of the smaller to the larger slowdown, always in (0, 1]
df_pair['fairness'] = np.minimum(df_pair['1_sld']/df_pair['2_sld'], 
                                 df_pair['2_sld']/df_pair['1_sld'])

# calculate slowdown w.r.t intra
# (requires df_intra to be indexed by (pair_str, intra, l2); see the set_index cell)
df_pair['1_sld_intra'] = df_pair.apply(lambda row: norm_over_intra((row['1_bench'], row['intra'], row['l2']), 
                                                                   'runtime', 
                                                                   row['1_runtime']), 
                                      axis=1)
df_pair['2_sld_intra'] = df_pair.apply(lambda row: norm_over_intra((row['2_bench'], row['2_intra'], row['2_l2']), 
                                                                   'runtime', 
                                                                   row['2_runtime']), 
                                      axis=1)

df_pair['ws_intra'] = df_pair['1_sld_intra'] + df_pair['2_sld_intra']


# calculate mem fetch latency change
# (inverse=False -> co-run latency / sequential latency)
df_pair['1_norm_mflat'] = df_pair.apply(lambda row: norm_over_seq(row['1_bench'], 'avg_mem_lat',
                                                                  row['1_avg_mem_lat'], inverse=False), 
                                        axis=1)
# The original wrote this result to '1_norm_mflat' as well, silently discarding
# the first benchmark's value; the second benchmark gets its own column.
df_pair['2_norm_mflat'] = df_pair.apply(lambda row: norm_over_seq(row['2_bench'], 'avg_mem_lat',
                                                                  row['2_avg_mem_lat'], inverse=False), 
                                        axis=1)

df_pair.sort_values(['1_bench', '2_bench'], inplace=True)
# display in a table
display_cols = ['1_bench', '2_bench', # 'intra', '2_intra', 'l2', '2_l2', 
                '1_sld', '2_sld', 'ws', 'fairness',
                '1_avg_mem_lat', '2_avg_mem_lat',
                '1_avg_core_to_l2', '2_avg_core_to_l2',
                '1_avg_l2_to_core', '2_avg_l2_to_core',
                'avg_dram_bw',
                '1_sld_intra', '2_sld_intra', 'ws_intra'
                #'1_norm_mflat', '2_norm_mflat'
               ]
df_pair[display_cols].style.set_table_styles(fmt.table_style).hide_index()




NameError: name 'pd' is not defined

In [333]:
# Render the per-benchmark best configurations as a styled table without the
# index column. The chain is wrapped in parentheses so that no backslash
# continuation is needed.
(
    best_df.style
    .set_table_styles(fmt.table_style)
    .hide_index()
    .format({'norm_ipc': "{:.4f}", 'dominant_resc': '{:.2f}'})
)


pair_str,grid_x,grid_y,grid_z,perfdollar,intra,norm_ipc,runtime,l2_miss_rate,avg_mem_lat,avg_core_to_l2,avg_l2_to_core,comp_busy,dram_busy,avg_dram_bw,avg_dram_eff
cut_sgemm-0,16,8,1,0.669156,2,0.9614,96892,0.5468,304,137,24,0.259849,0.181625,0.129729,0.644896
cut_sgemm-1,32,16,1,1.43829,1,1.1162,1540155,0.3204,615,383,64,0.460856,0.205192,0.113583,0.428508
cut_wmma-0,16,8,1,1.68589,2,0.9463,66533,0.2217,1017,689,142,0.0559222,0.0935402,0.0432762,0.299054
cut_wmma-1,64,16,1,0.886275,2,0.9722,2322385,0.6874,1129,552,206,0.0999374,0.733181,0.501917,0.667796
parb_cutcp-0,11,11,1,3.27202,2,1.0,525176,0.0428,154,22,7,0.127457,9.28537e-05,2.21786e-05,
parb_lbm-0,120,150,1,0.700971,3,0.9121,960397,0.9996,2277,1665,34,0.0511992,0.980072,0.655588,0.667817
parb_spmv-0,1147,1,1,0.725113,6,1.0264,71326,0.8743,425,39,11,0.0759997,0.927519,0.814725,0.872437
parb_stencil-0,8,128,1,0.726218,3,0.9429,371444,0.7461,251,27,10,0.221282,0.959338,0.6521,0.671542


In [335]:
df_pair_smk = pd.read_csv('pair_smk.csv')
# Keep only runs whose first kernel reports a positive runtime. The explicit
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments below do not trigger pandas' SettingWithCopyWarning.
df_pair_smk = df_pair_smk[df_pair_smk['1_runtime'] > 0].copy()
print(df_pair_smk.columns)

# split pair into two benchmarks: break at every '-' that is followed by a
# non-digit, so the numeric '-<n>' suffix stays attached to its benchmark name
pair = np.array([re.split(r'-(?=\D)', p) for p in df_pair_smk['pair_str']])
df_pair_smk['1_bench'] = pair[:, 0]
df_pair_smk['2_bench'] = pair[:, 1]

# calculate slowdown w.r.t seq
# (default inverse=True -> sequential runtime / co-run runtime, so < 1 means slower)
df_pair_smk['1_sld'] = df_pair_smk.apply(lambda row: norm_over_seq(row['1_bench'], 'runtime', row['1_runtime']), 
                                      axis=1)
df_pair_smk['2_sld'] = df_pair_smk.apply(lambda row: norm_over_seq(row['2_bench'], 'runtime', row['2_runtime']), 
                                      axis=1)
# weighted speedup: sum of both normalized progress values
df_pair_smk['ws'] = df_pair_smk['1_sld'] + df_pair_smk['2_sld']

# fairness: ratio of the smaller to the larger slowdown, always in (0, 1]
df_pair_smk['fairness'] = np.minimum(df_pair_smk['1_sld']/df_pair_smk['2_sld'], 
                                 df_pair_smk['2_sld']/df_pair_smk['1_sld'])

df_pair_smk.sort_values(['1_bench', '2_bench'], inplace=True)
display_cols = ['1_bench', '2_bench', '1_ctas/SM', '2_ctas/SM', 
                '1_sld', '2_sld', 'ws', 'fairness', '1_runtime', '2_runtime']
df_pair_smk[display_cols].style.set_table_styles(fmt.table_style).hide_index()

Index(['pair_str', 'config', 'gpusim_version', 'jobId', '1_grid_x', '1_grid_y',
       '1_grid_z', '1_block_x', '1_block_y', '1_block_z', '1_ctas/SM',
       '1_runtime', '1_instructions', '1_avg_mem_lat', '1_avg_core_to_l2',
       '1_avg_l2_to_core', '1_avg_mrq_latency', '2_grid_x', '2_grid_y',
       '2_grid_z', '2_block_x', '2_block_y', '2_block_z', '2_ctas/SM',
       '2_runtime', '2_instructions', '2_avg_mem_lat', '2_avg_core_to_l2',
       '2_avg_l2_to_core', '2_avg_mrq_latency', 'stall_icnt_to_l2',
       'stall_l2_to_icnt', 'stall_core_ldst', 'l1D_miss_rate', 'l2_miss_rate',
       'runtime', 'l2_rshr_entry_fail', 'l2_rshr_merge_fail', 'dram_bw'],
      dtype='object')


1_bench,2_bench,1_ctas/SM,2_ctas/SM,1_sld,2_sld,ws,fairness,1_runtime,2_runtime
cut_sgemm-1,parb_lbm-0,1,6,0.180838,0.950202,1.13104,0.190316,9506103,921840
cut_sgemm-1,parb_spmv-0,1,8,0.954379,0.948842,1.90322,0.994198,1801240,77153
cut_sgemm-1,parb_stencil-0,1,8,0.663187,0.910176,1.57336,0.728636,2592129,384807
cut_wmma-0,parb_lbm-0,2,6,0.124711,0.891608,1.01632,0.139872,504824,982420
cut_wmma-0,parb_spmv-0,2,8,0.624722,0.607166,1.23189,0.971898,100776,120570
cut_wmma-0,parb_stencil-0,2,8,0.491898,0.671498,1.1634,0.732538,127988,521583
parb_cutcp-0,parb_lbm-0,2,11,0.309201,0.987677,1.29688,0.313059,1698492,886863
parb_cutcp-0,parb_spmv-0,2,14,0.879459,0.992489,1.87195,0.886115,597158,73760
parb_cutcp-0,parb_stencil-0,2,13,0.802363,0.996123,1.79849,0.805485,654537,351605
