In [1]:
import pandas as pd
import os
import numpy as np
import re
import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from IPython.display import display, HTML

import json
from pprint import pprint
from tqdm import tqdm

from autogen_ext.models.openai import AzureOpenAIChatCompletionClient, OpenAIChatCompletionClient
from autogen_core.models import UserMessage, SystemMessage, AssistantMessage
from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_agentchat.agents import AssistantAgent


import subprocess
import shlex
from io import StringIO
import re

import numpy as np

In [2]:
# GPU specs

# you can get this from deviceQuery
gpuName = 'NVIDIA RTX 3080'

# you can call nvidia-smi -i 0 -q to see what the clock is set to 
# you can also set the clock with nvidia-smi -lgc 1440,1440 for consistent measurements
# vendor specs show the base clock
baseClockHz = 1.440e9

# find these values here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions
SPinstPerCyclePerSM = 128
DPinstPerCyclePerSM = 2
intInstPerCyclePerSM = 64

# find this in deviceQuery or GPU vendor specs
numSMs = 68

# we always assume you're doing FMA -- we should add another roofline for non-FMA
numFMAopPerInst = 2

# conversion multiplier
tflopPerflop = 1e-12

# get this from your GPU vendor specs, mine was 760.3 GB/s
maxBandwidthTBPerSec = 0.7603

spOPMaxPerfTFLOP = SPinstPerCyclePerSM * numSMs * baseClockHz * numFMAopPerInst * tflopPerflop
dpOPMaxPerfTFLOP = DPinstPerCyclePerSM * numSMs * baseClockHz * numFMAopPerInst * tflopPerflop
intOPMaxPerfTFLOP = intInstPerCyclePerSM * numSMs * baseClockHz * numFMAopPerInst * tflopPerflop

spOPMaxPerfTFLOP_noFMA = spOPMaxPerfTFLOP / 2
dpOPMaxPerfTFLOP_noFMA = dpOPMaxPerfTFLOP / 2

print('Max SP TFLOP/s with FMA', round(spOPMaxPerfTFLOP, 3))
print('Max DP TFLOP/s with FMA', round(dpOPMaxPerfTFLOP, 3))
print('Max SP TFLOP/s w/out FMA', round(spOPMaxPerfTFLOP_noFMA, 3))
print('Max DP TFLOP/s w/out FMA', round(dpOPMaxPerfTFLOP_noFMA, 3))
print('Max TINTOP/s', round(intOPMaxPerfTFLOP, 3))

Max SP TFLOP/s with FMA 25.068
Max DP TFLOP/s with FMA 0.392
Max SP TFLOP/s w/out FMA 12.534
Max DP TFLOP/s w/out FMA 0.196
Max TINTOP/s 12.534


### Open the Gathered Data CSV Files

In [3]:
dtypes={'Kernel Name':'string', 
        'traffic':np.float64,
        'dpAI':np.float64,
        'spAI':np.float64,
        'dpPerf':np.float64,
        'spPerf':np.float64,
        'xtime':np.float64,
        'Block Size': 'string',
        'Grid Size': 'string',
        'device': 'string',
        "intops": np.float64, 
        "intPerf" : np.float64,
        "intAI": np.float64,
        'targetName': 'string',
        'exeArgs': 'string',
        'kernelName': 'string',
        }

# we need to gather more data for this dataset
df = pd.read_csv('../roofline-data-new.csv', quotechar='"', dtype=dtypes)

# if we're loading old data that didn't gather intops
#df = pd.read_csv('../roofline-data-OLD-only-cuda.csv', quotechar='"', dtype=dtypes)
#df['intops'] = 0
#df['intPerf'] = 0
#df['intAI'] = 0

df['language'] = df['targetName'].apply(lambda x: 'CUDA' if '-cuda' in x else 'OMP')

print(df.shape)


(2970, 17)


In [4]:
# because a lot of these kernels were sampled twice, let's drop the first sample (which typically runs for longer than the second sample)
#grouped = df.groupby(by=['Kernel Name', 'kernelName', 'targetName', 'exeArgs'])['xtime'].min().reset_index()
grouped = df.groupby(by=['Kernel Name', 'targetName', 'exeArgs'])['xtime'].min().reset_index()

print(grouped.head())
print(grouped.shape)

df = df.merge(grouped, on=list(grouped.columns), how='inner')
# it turns out when we give REGEX to 'ncu' to capture kernels, some kernel names have
# extra characters that also get captured. e.g: AIDW_Kernel and AIDW_Kernel_Tiled both match for the former
# so to deal with this we mainly filter by the 'Kernel Name'  instead of 'kernelName'
#df = df.drop_duplicates(subset=['Kernel Name', 'kernelName', 'targetName', 'exeArgs', 'language'])

counts = df.groupby(['language']).count()
print(counts)
print(df.shape)

                                         Kernel Name           targetName  \
0  AESEncrypt(uchar4 *, const uchar4 *, const uch...             aes-cuda   
1  AIDW_Kernel(const float *, const float *, cons...            aidw-cuda   
2  AIDW_Kernel_Tiled(const float *, const float *...            aidw-cuda   
3  BP_queens_root_dfs(int, unsigned int, int, con...          nqueen-cuda   
4  BezierGPU(const XYZ *, XYZ *, int, int, int, int)  bezier-surface-cuda   

                                             exeArgs        xtime  
0                  100 0 ../urng-sycl/URNG_Input.bmp     549600.0  
1                                           10 1 100    3713056.0  
2                                           10 1 100    2815680.0  
3                                           15 7 100  154966912.0  
4  ../face-cuda/Face.pgm ../face-cuda/info.txt .....    4395488.0  
(1306, 4)
          Kernel Name  traffic  dpAI  spAI  dpPerf  spPerf  xtime  Block Size  \
language                              

In [5]:
print(df.shape)
print(df.columns)
print(df.dtypes)

(1378, 17)
Index(['Kernel Name', 'traffic', 'dpAI', 'spAI', 'dpPerf', 'spPerf', 'xtime',
       'Block Size', 'Grid Size', 'device', 'intops', 'intPerf', 'intAI',
       'targetName', 'exeArgs', 'kernelName', 'language'],
      dtype='object')
Kernel Name    string[python]
traffic               float64
dpAI                  float64
spAI                  float64
dpPerf                float64
spPerf                float64
xtime                 float64
Block Size     string[python]
Grid Size      string[python]
device         string[python]
intops                float64
intPerf               float64
intAI                 float64
targetName     string[python]
exeArgs        string[python]
kernelName     string[python]
language               object
dtype: object


### Let's drop rows with NULL values and fix exeArgs that got a NaN value because they didn't have exeArgs

These were executions that yielded no performance counter data (i.e: they didn't do single/double precision floating point operations).

In [6]:
df = df[df['Kernel Name'].notna()] 

print(df.shape)

# let's also replace exeArgs that are NaN with ''
df['exeArgs'] = df['exeArgs'].fillna('')

(1378, 17)


### Scale the Performance Data

Here we scale down the `spPerf` and `dpPerf` columns to be on the scale of 1e11 (like how it's done in `ncu`).

In [7]:
# scale everything to be in TFLOP/s
df['dpPerf'] = df['dpPerf']*(1e-12)
df['spPerf'] = df['spPerf']*(1e-12)
df['intPerf'] = df['intPerf']*(1e-12)


In [8]:
print(df.columns)


Index(['Kernel Name', 'traffic', 'dpAI', 'spAI', 'dpPerf', 'spPerf', 'xtime',
       'Block Size', 'Grid Size', 'device', 'intops', 'intPerf', 'intAI',
       'targetName', 'exeArgs', 'kernelName', 'language'],
      dtype='object')


In [9]:
# how many different kernel invocations did we capture?
totalExes = df.groupby(['targetName', 'device', 'exeArgs']).ngroups
print(f'Total unique executables sampled: {totalExes}')

totalKernels = df.groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups
print(f'Total unique kernel executions recorded: {totalKernels}')

zeroAIKernels = df[(df['spAI'] == 0.0) & (df['dpAI'] == 0.0)].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups
print(f'Number of kernels with no arithmetic intensity (AI) {zeroAIKernels} ({round(100*zeroAIKernels/totalKernels, 2)}%)')

numSPKernels = df[df['spAI'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups
numDPKernels = df[df['dpAI'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups

print(f'Number of non-zero AI single-precision kernels recorded: {numSPKernels} ({round(100*numSPKernels/totalKernels, 2)}%)')
print(f'Number of non-zero AI double-precision kernels recorded: {numDPKernels} ({round(100*numDPKernels/totalKernels, 2)}%)')

numSPKernels = df[df['spPerf'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups
numDPKernels = df[df['dpPerf'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups

print(f'Number of non-zero Perf single-precision kernels recorded: {numSPKernels} ({round(100*numSPKernels/totalKernels, 2)}%)')
print(f'Number of non-zero Perf double-precision kernels recorded: {numDPKernels} ({round(100*numDPKernels/totalKernels, 2)}%)')

numIntPerfKernels = df[df['intPerf'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups
numIntAIKernels = df[df['intAI'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']).ngroups


print(f'Number of non-zero Perf intop kernels recorded: {numIntPerfKernels} ({round(100*numIntPerfKernels/totalKernels, 2)}%)')
print(f'Number of non-zero AI intop kernels recorded: {numIntAIKernels} ({round(100*numIntAIKernels/totalKernels, 2)}%)')



Total unique executables sampled: 548
Total unique kernel executions recorded: 1309
Number of kernels with no arithmetic intensity (AI) 556 (42.48%)
Number of non-zero AI single-precision kernels recorded: 645 (49.27%)
Number of non-zero AI double-precision kernels recorded: 216 (16.5%)
Number of non-zero Perf single-precision kernels recorded: 645 (49.27%)
Number of non-zero Perf double-precision kernels recorded: 216 (16.5%)
Number of non-zero Perf intop kernels recorded: 1308 (99.92%)
Number of non-zero AI intop kernels recorded: 1308 (99.92%)


### Visualize the Block and Grid Sizes

In [10]:
# because the sizes are in 3D, let's convert them to 1D by multiplying them


def strTupleTo1D(strTuple):
    finds = re.findall(r'\d+', strTuple)
    nums = [int(find) for find in finds]

    assert len(nums) == 3

    mult = 1
    for num in nums:
        mult = mult*num
    return mult


df['blockSz'] = df['Block Size'].apply(strTupleTo1D).astype(int)
df['gridSz'] = df['Grid Size'].apply(strTupleTo1D).astype(int)

print(df.head())


                                         Kernel Name       traffic      dpAI  \
0  haccmk_kernel(int, int, const float *, const f...  7.156462e+07  0.000000   
1  bit_rev_permutation(long *, const long *, unsi...  4.585366e+09  0.000000   
2  bit_rev_permutation_z(long *, const long *, un...  6.496454e+09  0.000000   
3  initial_value(unsigned int, double, double, do...  2.528404e+10  9.317279   
4  solve(unsigned int, double, double, double, do...  5.187426e+11  0.563661   

          spAI    dpPerf    spPerf      xtime    Block Size      Grid Size  \
0  1868.406255  0.000000  0.133712  2990528.0   (256, 1, 1)      (4, 1, 1)   
1     0.000000  0.000000  0.000000     3936.0  (1024, 1, 1)      (1, 1, 1)   
2     0.000000  0.000000  0.000000     4512.0    (32, 1, 1)      (2, 1, 1)   
3     0.409507  0.235578  0.010354  6480864.0   (256, 1, 1)  (65536, 1, 1)   
4     0.000000  0.292395  0.000000   516416.0   (256, 1, 1)  (65536, 1, 1)   

                    device       intops   intPerf 

## Read in the JSON files with the scraped kernels

In [11]:
with open('simple-scraped-kernels-CUDA-pruned.json', 'r') as file:
    scrapedCUDA = json.load(file)

with open('simple-scraped-kernels-OMP-pruned.json', 'r') as file:
    scrapedOMP = json.load(file)


scrapedCodes = scrapedCUDA + scrapedOMP

## Match up the kernels with their JSON and save to a file

In [12]:
def chat_history_to_json_line(ctxMessages:list):
    jsonDict = {'messages':[]}
    for msg in ctxMessages:
        if type(msg) == SystemMessage:
            role = 'system'
        elif type(msg) == UserMessage:
            role = 'user'
        elif type(msg) == AssistantMessage:
            role = 'assistant'
        else:
            assert False, f'Unknown message type: {type(msg)} of {msg}'
        content = msg.content

        jsonDict['messages'].append({'role':role, 'content':content})

    return json.dumps(jsonDict, allow_nan=False)

In [13]:
def demangle_omp_kernel_name(mangledName):

    regex = r'_(?:[^_]+_){4}(.*)(_l[\d]+)'
    matches = re.finditer(regex, mangledName, re.MULTILINE)

    matches = [i for i in matches]
    assert len(matches) == 1

    cleanName = ''
    for match in matches:
        groups = match.groups()
        assert len(groups) == 2
        cleanName = groups[0]
        break


    filterCommand = f'llvm-cxxfilt {cleanName}'
    
    demangleResult = subprocess.run(filterCommand, shell=True, timeout=5, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    assert demangleResult.returncode == 0
    
    demangled = demangleResult.stdout.decode('UTF-8').strip()

    return demangled

In [14]:
# this system message combines the device information and the non-detailed few-shot
# examples. We need some few-shot breif examples of the output so that the model 
# knows how to respond.
systemMessage = '''You are a GPU performance analysis expert that classifies kernels into Arithmetic Intensity Roofline model categories based on their source code characteristics. Your task is to provide one of the following performance boundedness classifications: Compute or Bandwidth.  A kernel is considered Compute bound if its performance is primarily limited by the number of operations it performs, and Bandwidth bound if its performance is primarily limited by the rate at which data can be moved between memory and processing units.

Provide only one word as your response, chosen from the set: ['Compute', 'Bandwidth'].
**Examples:**
**Example 1:**
```
Kernel Source Code (simplified):
for i = 0 to 1000000 {
  a[i] = a[i] + b[i];
}
```
Response: Compute

**Example 2:**
```
Kernel Source Code (simplified):
for i = 0 to 10 {
  load_data(large_array);   //loads from large memory
  process_data(large_array); //processes data
  store_data(large_array);  //stores back to memory
}
```
Response: Bandwidth


Now, analyze the following source codes for the requested CUDA or OpenMP (OMP) target offload kernel of the specified hardware.'''


'''
**Example 3:**
```
Kernel Source Code (simplified):
for i = 0 to 1000 {
  vector_add(a,b,c);   //process data in situ
}
//Some smaller data movement but mostly compute.
```
Response: Compute
'''


def make_kernel_info_message(device, exeArgs, peakPerfGFLOPs, memBandwidthGBs, kernelName, blockSz, gridSz, language):
    assert kernelName != ''

    if language == 'OMP':
      cleanKName = demangle_omp_kernel_name(kernelName)
      assert cleanKName != ''
      beginPart = f'Classify the {language} kernel in function [{cleanKName}] as Bandwidth or Compute bound.'
    else:
      # if were prompting for a CUDA code
      cleanKName = kernelName
      beginPart = f'Classify the {language} kernel called [{cleanKName}] as Bandwidth or Compute bound.'

    builtPrompt = f'{beginPart} The system it will execute on is a [{device}] with a peak performance of {round(peakPerfGFLOPs,2)} GFLOP/s and a max bandwidth of {round(memBandwidthGBs,2)} GB/s. The block and grid sizes of the invoked kernel are {blockSz} and {gridSz}, respectively. The executable running this kernel is launched with '

    if exeArgs == '':
      builtPrompt += 'no command line arguments.'
    else:
      builtPrompt += f'the following command line arguments: [{exeArgs}].'

    builtPrompt += ' Below is the source code containing the kernel definition and other source code for the executable.'

    return builtPrompt

In [15]:
# the kernel name is from the "Kernel Name" column of the dataframe
async def make_chat_history(kernel_info, kernelCode):

    sys_msg = SystemMessage(content=systemMessage)
    kernel_info_msg = UserMessage(source='User', content=kernel_info)
    code_msg = UserMessage(source='User', content=f'```{kernelCode}```')
    context = UnboundedChatCompletionContext(initial_messages=[sys_msg, kernel_info_msg, code_msg])

    return context
    #messages = await context.get_messages()
    #return messages

async def make_chat_history_with_answer(kernel_info, kernelCode, answer):

    sys_msg = SystemMessage(content=systemMessage)
    kernel_info_msg = UserMessage(source='User', content=kernel_info)
    code_msg = UserMessage(source='User', content=f'```{kernelCode}```')
    assis_msg = AssistantMessage(source='assistant', content=f'{answer}')
    context = UnboundedChatCompletionContext(initial_messages=[sys_msg, kernel_info_msg, code_msg, assis_msg])

    return context


In [16]:

def writeToFile(filename, lines):
    # going to overwrite the whole file each time
    # it's redundant but the file wont be that large
    # so the speed doesn't matter
    with open(filename, 'w') as jsonLFile:
        jsonLFile.write(lines)

In [17]:
# please create a file called '.llm-api-key' with your api key and no newline characters
with open('./.llm-api-key', 'r') as file:
    LLM_API_KEY=file.read().strip()

with open('./.openrouter-api-key', 'r') as file:
    OPENROUTER_API_KEY=file.read().strip()

In [18]:

async def ask_llm_for_roofline_classification(chatHistory, useAzure=False, temp=1.0, timeout=60):

    model_client = None
    if useAzure:
        model_client = AzureOpenAIChatCompletionClient(
                model='gpt-4o-mini',

                azure_endpoint='https://galor-m6d0ej1n-eastus2.cognitiveservices.azure.com',
                #azure_endpoint='https://galor-m6d0ej1n-eastus2.cognitiveservices.azure.com',
                azure_deployment='gpt-4o-mini',
                api_key=LLM_API_KEY,
                timeout=timeout,
                temperature=temp,
                api_version='2024-08-01-preview',
                #api_version='2025-01-01-preview',
        )
    else:
        model_client = OpenAIChatCompletionClient(
                model='openai/gpt-4o-mini',
                base_url='https://openrouter.ai/api/v1',
                api_key=OPENROUTER_API_KEY,
                timeout=timeout,
                temperature=temp,
                model_info = {'vision':False, 'function_calling':True, 'json_output':True, 'model_family':'unknown'}
        )

            #model_info = {'vision':False, 'function_calling':True, 'json_output':True, 'model_family':'unknown'}

    agent = AssistantAgent(
        name="assistant",
        model_client=model_client,
        model_context=chatHistory
    )

    await agent.run()
    return await agent._model_context.get_messages()

In [19]:
jsonLLines = ''

peakPerfGFLOPs = spOPMaxPerfTFLOP * 1e3
memBandwidthGBs = maxBandwidthTBPerSec * 1e3

balancePointFLOPPerByte = spOPMaxPerfTFLOP / maxBandwidthTBPerSec

# for each sample we got
for index, row in tqdm(df.iterrows(), total=df.shape[0]):

    targetName = row['targetName']
    kernelName = row['Kernel Name']
    exeArgs = row['exeArgs']
    blockSz = row['Block Size']
    gridSz = row['Grid Size']
    language = row['language']
    device = row['device']
    flopAI = row['spAI']

    expectedAnswer = 'Bandwidth' if flopAI < balancePointFLOPPerByte else 'Compute'

    for elem in scrapedCodes:
        basename = elem['basename']
        if basename == targetName:
            kernelCode = list(elem['kernels'].values())[0]
            assert kernelCode != ''
            break


    infoMsg = make_kernel_info_message(device, exeArgs, peakPerfGFLOPs, memBandwidthGBs, kernelName, blockSz, gridSz, language)
    chatHist = await make_chat_history_with_answer(infoMsg, kernelCode, expectedAnswer)

    #resultHist = await ask_llm_for_roofline_classification(chatHist, useAzure=True, temp=0.1, timeout=60)
    resultStr = chat_history_to_json_line(await chatHist.get_messages())

    jsonLLines = jsonLLines + resultStr + '\n'
    writeToFile('zero-shot-FULL-TRAIN-Dataset.jsonl', jsonLLines.rstrip())



100%|██████████| 1378/1378 [00:23<00:00, 58.37it/s]


In [29]:
def make_train_test_split(jsonLFile, seed=1928):

    np.random.seed(seed)

    numLines = 0
    with open(jsonLFile, 'r') as f:
        numLines = sum(1 for _ in f)

    # create an array and shuffle it, take the top 20 percent for test and leftover as train
    shuffled = np.array(list(range(numLines)))
    np.random.shuffle(shuffled)

    top80 = int(len(shuffled)*0.8)
    bot20 = len(shuffled) - top80
    print(f'Num elems in 20% {bot20}, num elems in 80% {top80}')
    train, test = shuffled[:top80], shuffled[top80:]

    
    trainData = ''
    testData = ''
    with open(jsonLFile, 'r') as f:
        for i, line in enumerate(tqdm(f)):
            if i in train:
                trainData += line.strip() + '\n'
            else:
                testData += line.strip() + '\n'

    writeToFile(f'train-data-{top80}-1378.jsonl', trainData.rstrip())
    writeToFile(f'test-data-{bot20}-1378.jsonl', testData.rstrip())

    return

In [30]:
make_train_test_split('zero-shot-FULL-TRAIN-Dataset.jsonl')

Num elems in 20% 276, num elems in 80% 1102


1378it [00:00, 75718.58it/s]


In [31]:
def jsonl_to_context_obj(jsonLStr):
    lines = jsonLStr.split('\n')

    for line in lines:
        pass

    return