In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

In [2]:
# Global parameters
# Path of the CUDA 10.2.89 installation (not referenced by the cells shown here)
cudadir = "/usr/common/software/cuda/10.2.89"
# Input data and generated output both live in the current directory
datadir = outputdir = "./"

# Functions

In [3]:
def _sum_metric(metricdf, row_mask, keys, label):
    """Sum "Metric Value" per kernel over the rows selected by ``row_mask``.

    Returns a frame holding the ``keys`` columns plus one column named ``label``.
    """
    return (metricdf.loc[row_mask, keys + ["Metric Value"]]
            .groupby(keys).sum().reset_index()
            .rename(columns={"Metric Value": label}))


def transpose_frame(metricdf):
    """Turn the long per-metric table into one wide row per kernel.

    Parameters
    ----------
    metricdf : pandas.DataFrame
        Long-format table with the columns "Kernel Name", "Metric Name",
        "Metric Type", "Metric Unit" and "Metric Value". It is NOT modified:
        the function works on a private copy, so calling it repeatedly on the
        same frame always yields the same result.

    Returns
    -------
    pandas.DataFrame
        One row per kernel name with, in this order: "CUDA Time", the FP64 /
        FP32 / total FLOP counts, L1 / L2 / DRAM transactions and bytes, the
        three "Performance GFlop/s" columns and the nine "... AI" columns
        (FLOPs divided by bytes). Kernels missing any of the metric groups are
        dropped, because every merge is an inner join on "Kernel Name".
    """
    selectkeys = ["Kernel Name"]
    # every transaction / sector counted below is converted to bytes with this factor
    bytes_per_transaction = 32

    # The FMA and L2 scalings below are applied in place; do them on a copy so
    # the caller's frame is left untouched and re-running stays idempotent.
    metricdf = metricdf.copy()
    names = metricdf["Metric Name"]
    is_total = metricdf["Metric Type"] == "total"

    def name_has(fragment):
        # literal (non-regex) substring test on the metric name
        return names.str.contains(fragment, regex=False)

    ####### Get timing information

    # Elapsed cycles and clock rates. The two selections are paired row by row
    # after resetting the index, i.e. the n-th cycles row is divided by the
    # n-th rate row, and the kernel name is taken from the cycles row.
    is_cycles = names == "smsp__cycles_elapsed"
    cycles = metricdf.loc[is_cycles & is_total,
                          selectkeys + ["Metric Value"]].reset_index(drop=True)
    rates = metricdf.loc[is_cycles & (metricdf["Metric Type"] == "rate"),
                         ["Metric Unit", "Metric Value"]].reset_index(drop=True)

    # adjust metric unit: cycle/nsecond -> cycle/second
    per_ns = rates["Metric Unit"].str.contains("cycle/nsecond", regex=False, na=False)
    rate_per_second = rates["Metric Value"].mask(per_ns, rates["Metric Value"] * 1e9)

    # CUDA time per row, then summed over all rows of the same kernel
    profiledf = pd.DataFrame({
        "Kernel Name": cycles["Kernel Name"],
        "CUDA Time": cycles["Metric Value"] / rate_per_second,
    })
    profiledf = profiledf.groupby("Kernel Name").sum().reset_index()

    ####### Scale raw counts

    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[name_has("fma_pred_on"), "Metric Value"] *= 2
    ### L2 atomic & reduction: the "total"-typed L2 sector rows are counted twice
    metricdf.loc[name_has("lts__t_sectors_op") & is_total, "Metric Value"] *= 2

    ####### Get number of FLOPs

    flop_groups = (
        ("FP64 FLOPs", ['smsp__sass_thread_inst_executed_op_dadd_pred_on',
                        'smsp__sass_thread_inst_executed_op_dfma_pred_on',
                        'smsp__sass_thread_inst_executed_op_dmul_pred_on']),
        ("FP32 FLOPs", ['smsp__sass_thread_inst_executed_op_fadd_pred_on',
                        'smsp__sass_thread_inst_executed_op_ffma_pred_on',
                        'smsp__sass_thread_inst_executed_op_fmul_pred_on']),
    )
    for label, metrics in flop_groups:
        flopdf = _sum_metric(metricdf, names.isin(metrics), selectkeys, label)
        profiledf = profiledf.merge(flopdf, on=selectkeys, how="inner")

    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP64 FLOPs"]

    ####### Get number of bytes

    ### L1: shared + atomic + local + global transactions
    l1_parts = {
        "Shared Transactions": name_has("l1tex__data_pipe_lsu_wavefronts_mem_shared_op"),
        "L1 Atomic Transactions": names.isin([
            'l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom',
            'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',
            'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',
            'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']),
        "Local Transactions": name_has("l1tex__t_sectors_pipe_lsu_mem_local_op"),
        "Global Transactions": name_has("l1tex__t_sectors_pipe_lsu_mem_global_op"),
    }
    for label, row_mask in l1_parts.items():
        partdf = _sum_metric(metricdf, row_mask, selectkeys, label)
        profiledf = profiledf.merge(partdf, on=selectkeys, how="inner")

    profiledf["L1 Transactions"] = (profiledf["Shared Transactions"] + profiledf["L1 Atomic Transactions"]
                                    + profiledf["Local Transactions"] + profiledf["Global Transactions"])
    profiledf["L1 Bytes"] = profiledf["L1 Transactions"] * bytes_per_transaction

    # the individual L1 components are only intermediate results
    profiledf = profiledf.drop(columns=list(l1_parts))

    ### L2 and DRAM transactions / bytes
    for level, fragment in (("L2", "lts__t_sectors_op"), ("DRAM", "dram__sectors")):
        leveldf = _sum_metric(metricdf, name_has(fragment), selectkeys, level + " Transactions")
        leveldf[level + " Bytes"] = leveldf[level + " Transactions"] * bytes_per_transaction
        profiledf = profiledf.merge(leveldf, on=selectkeys, how="inner")

    ####### Derived quantities

    flop_columns = (("", "FLOPs"), ("FP64 ", "FP64 FLOPs"), ("FP32 ", "FP32 FLOPs"))

    ### Get performance
    for prefix, flops in flop_columns:
        profiledf[prefix + "Performance GFlop/s"] = profiledf[flops] / (profiledf["CUDA Time"]*10**9)

    ### Get AI (FLOPs per byte) for each memory level
    for level in ("L1", "L2", "DRAM"):
        for prefix, flops in flop_columns:
            profiledf[prefix + level + " AI"] = profiledf[flops] / profiledf[level + " Bytes"]

    return profiledf

# Import Data

In [4]:
# Get raw data file
filename = "metrics.log"
metricdf = pd.read_csv(os.path.join(datadir,filename))

# Fuse read/write metrics together: strip the aggregation / direction suffixes
# so that e.g. the read and write variants of a metric share one base name.
unique_metrics = {
    x.replace(".sum","").replace("_write","").replace("_read","").replace("_ld","").replace("_st","")
    for x in metricdf["Metric Name"].unique()
}

# Add the metric type, derived from the raw (not yet renamed) metric name.
# Rules are applied in order, so a later match overrides an earlier one.
# The fragments are literal text, hence regex=False (a bare "." would
# otherwise act as a regex wildcard in ".per_second").
metricdf["Metric Type"] = "total"
type_rules = (
    ("_read", "read"),
    ("_ld", "read"),
    ("_write", "write"),
    ("_st", "write"),
    (".per_second", "rate"),
)
for fragment, metric_type in type_rules:
    metricdf.loc[ metricdf[ "Metric Name" ].str.contains(fragment, regex=False), "Metric Type" ] = metric_type

# Rename metrics: every name starting with a base name collapses to that base name
for metric in unique_metrics:
    metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric

metricdf = metricdf[["Kernel Name", "Metric Name", "Metric Type", "Metric Unit", "Metric Value"]]

display(metricdf)

Unnamed: 0,Kernel Name,Metric Name,Metric Type,Metric Unit,Metric Value
0,pixelgpudetails::RawToDigi_kernel(SiPixelFedCa...,dram__sectors,read,sector,22932.0
1,pixelgpudetails::RawToDigi_kernel(SiPixelFedCa...,dram__sectors,write,sector,50046.0
2,pixelgpudetails::RawToDigi_kernel(SiPixelFedCa...,l1tex__data_pipe_lsu_wavefronts_mem_shared_op,read,,0.0
3,pixelgpudetails::RawToDigi_kernel(SiPixelFedCa...,l1tex__data_pipe_lsu_wavefronts_mem_shared_op,write,,0.0
4,pixelgpudetails::RawToDigi_kernel(SiPixelFedCa...,l1tex__t_sectors_pipe_lsu_mem_global_op,read,sector,30624.0
...,...,...,...,...,...
95995,gpuVertexFinder::vertexFinderOneKernel(ZVertex...,smsp__sass_thread_inst_executed_op_dfma_pred_on,total,inst,0.0
95996,gpuVertexFinder::vertexFinderOneKernel(ZVertex...,smsp__sass_thread_inst_executed_op_dmul_pred_on,total,inst,580.0
95997,gpuVertexFinder::vertexFinderOneKernel(ZVertex...,smsp__sass_thread_inst_executed_op_fadd_pred_on,total,inst,46053.0
95998,gpuVertexFinder::vertexFinderOneKernel(ZVertex...,smsp__sass_thread_inst_executed_op_ffma_pred_on,total,inst,10261.0


# Compute Arithmetic Intensity (AI) Results

In [5]:
# Build the per-kernel profile and list the kernels with the largest CUDA time first
profiledf = (
    transpose_frame(metricdf)
    .sort_values('CUDA Time', ascending=False)
    .reset_index(drop=True)
)

display(profiledf)

Unnamed: 0,Kernel Name,CUDA Time,FP64 FLOPs,FP32 FLOPs,FLOPs,L1 Transactions,L1 Bytes,L2 Transactions,L2 Bytes,DRAM Transactions,...,FP32 Performance GFlop/s,L1 AI,FP64 L1 AI,FP32 L1 AI,L2 AI,FP64 L2 AI,FP32 L2 AI,DRAM AI,FP64 DRAM AI,FP32 DRAM AI
0,kernel_find_ntuplets(TrackingRecHit2DSOAView c...,0.018547,0.0,0.0,0.0,221543156.0,7089381000.0,225398161.0,7212741000.0,278432471.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,void kernelBLFit<int=4>(HistoContainer<unsigne...,0.014658,265470717.0,20715170.0,286185900.0,9403323.0,300906300.0,7501232.0,240039400.0,5333291.0,...,1.413275,0.95108,0.882237,0.068843,1.192245,1.105946,0.086299,1.676884,1.555505,0.121379
2,gpuVertexFinder::vertexFinderOneKernel(ZVertex...,0.012052,56235.0,11430110.0,11486350.0,8849541.0,283185300.0,1176192.0,37638140.0,211899.0,...,0.948378,0.040561,0.000199,0.040363,0.305178,0.001494,0.303684,1.69396,0.008293,1.685667
3,gpuPixelDoublets::getDoubletsFromHisto(GPUCACe...,0.011098,0.0,545955500.0,545955500.0,339960809.0,10878750000.0,168911671.0,5405173000.0,67157969.0,...,49.192655,0.050186,0.0,0.050186,0.101006,0.0,0.101006,0.254044,0.0,0.254044
4,gpuClustering::findClus(unsigned short const *...,0.009689,0.0,0.0,0.0,150617334.0,4819755000.0,71288368.0,2281228000.0,19548934.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"kernel_connect(AtomicPairCounter*,AtomicPairCo...",0.009273,0.0,1756497000.0,1756497000.0,199460307.0,6382730000.0,92061427.0,2945966000.0,83274917.0,...,189.415546,0.275195,0.0,0.275195,0.596238,0.0,0.596238,0.659149,0.0,0.659149
6,void kernelBLFit<int=3>(HistoContainer<unsigne...,0.006044,862708818.0,60096700.0,922805500.0,29479274.0,943336800.0,25623242.0,819943700.0,12158427.0,...,9.942758,0.978236,0.914529,0.063707,1.12545,1.052156,0.073294,2.371826,2.217363,0.154463
7,gpuPixelDoublets::fishbone(TrackingRecHit2DSOA...,0.005508,0.0,129153400.0,129153400.0,55955823.0,1790586000.0,32633287.0,1044265000.0,52896500.0,...,23.446294,0.072129,0.0,0.072129,0.123679,0.0,0.123679,0.076301,0.0,0.076301
8,gpuPixelRecHits::getHits(pixelCPEforGPU::Param...,0.005237,48629316.0,175714100.0,224343400.0,74233078.0,2375458000.0,13837022.0,442784700.0,8975533.0,...,33.551715,0.094442,0.020472,0.073971,0.506665,0.109826,0.396839,0.781094,0.169312,0.611781
9,void kernelBLFastFit<int=4>(HistoContainer<uns...,0.002464,10586982.0,11021980.0,21608960.0,4044162.0,129413200.0,2676045.0,85633440.0,4435744.0,...,4.473264,0.166976,0.081808,0.085169,0.252343,0.123631,0.128711,0.152236,0.074586,0.07765


# Export Data

In [6]:
# Write the per-kernel profile (including its index) to full_profile.csv in outputdir
profiledf.to_csv(path_or_buf=os.path.join(outputdir, "full_profile.csv"))