In [1]:
from roofline_utils import *
from autogen_cuda_static_analysis_graphflow import *
from autogen_agentchat.ui import Console

import time
import argparse

import openai
import sys

import logging
from autogen_core import TRACE_LOGGER_NAME

Autogen version: 0.5.7
Max SP TFLOP/s with FMA 25.068
Max DP TFLOP/s with FMA 0.392
Max SP TFLOP/s w/out FMA 12.534
Max DP TFLOP/s w/out FMA 0.196
Max TINTOP/s 12.534
SP Balance Point is at: 32.97 flop/byte
DP Balance Point is at: 0.52 flop/byte
INT Balance Point is at: 16.49 intop/byte

These values get passed as LLM context so the model can infer about rooflines:
Peak SP GFLOP/s 25067.52 with FMA
Peak DP GFLOP/s 391.68 with FMA
Peak GINTOP/s 12533.76 with FMA
scraped and pruned CUDA programs count 297
scraped and pruned OMP  programs count 242


In [None]:
# Send DEBUG-level records to 'mylog.log' through the root logger.
# basicConfig() is a no-op when the root logger already has handlers, so
# re-running this cell does not add a second file handler.
logging.basicConfig(filename='mylog.log',
                    level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s\n')
logger = logging.getLogger(TRACE_LOGGER_NAME)
# Echo the trace logger to the console as well. addHandler() is NOT idempotent:
# without this guard every re-execution of the cell would stack another
# StreamHandler and each record would be printed one more time.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

In [3]:
# Please create a file called '.llm-api-key' that contains only your API key.
# Leading/trailing whitespace (including a trailing newline) is stripped below.
# LLM_API_KEY is the key passed to the Azure client in create_model_client.
with open('./.llm-api-key', 'r') as file:
    LLM_API_KEY=file.read().strip()

# Same format for '.openrouter-api-key'; OPENROUTER_API_KEY is the key passed
# to the OpenRouter client in create_model_client.
with open('./.openrouter-api-key', 'r') as file:
    OPENROUTER_API_KEY=file.read().strip()

# ### Open the Train/Val Data CSV Files
# Register the dtypes of the additional dataset columns. `dtypes` is not
# defined in this notebook (presumably a dict provided by one of the star
# imports at the top -- TODO confirm); it is later passed to
# pd.read_csv(dtype=dtypes) when the train/validation CSVs are loaded.
dtypes['language'] = 'string'
dtypes['numTokens'] = np.int64
dtypes['kernelCode'] = 'string'
dtypes['kernelSASS'] = 'string'
dtypes['isBB'] = np.int64
dtypes['class'] = 'string'
dtypes['answer'] = 'string'

# Substrings that mark a model name as belonging to a "reasoning" model family.
reasoning_models = ['o3', 'o1', 'o4']

def is_reasoning_model(modelName):
    """Return True when ``modelName`` names one of the ``reasoning_models``.

    Only the part after the last '/' is inspected, so a provider prefix is
    ignored: 'openai/o3-mini-2024-11-20' is checked as 'o3-mini-2024-11-20'.
    The check is a plain substring match against each entry of
    ``reasoning_models``.
    """
    short_name = modelName.rsplit('/', 1)[-1]
    return any(marker in short_name for marker in reasoning_models)

In [4]:

# Load each balanced split and tag its rows with the split they came from
# (isTrain == 1 for the training file, 0 for the validation file).
trainDF = pd.read_csv('train-dataset-balanced.csv', quotechar='"', dtype=dtypes)
trainDF['isTrain'] = 1

valDF = pd.read_csv('validation-dataset-balanced.csv', quotechar='"', dtype=dtypes)
valDF['isTrain'] = 0

# Stack both splits under a fresh index, then keep only the CUDA codes.
df = pd.concat([trainDF, valDF], ignore_index=True)
df = df[df['language'].eq('CUDA')]

print(df.shape)

(170, 24)


In [5]:
async def create_model_client(modelName, useAzure=False, temp=1.0, topp=0.1, timeout=60, storeLogProbs=False):
    """Build a chat-completion client for ``modelName``.

    Args:
        modelName: Model identifier. Used as ``model`` for both back ends and
            also as ``azure_deployment`` when ``useAzure`` is set.
        useAzure: When True build an ``AzureOpenAIChatCompletionClient``
            authenticated with ``LLM_API_KEY``; otherwise build an
            ``OpenAIChatCompletionClient`` pointed at OpenRouter and
            authenticated with ``OPENROUTER_API_KEY``.
        temp: Sampling temperature (non-reasoning models only).
        topp: Nucleus-sampling ``top_p`` (non-reasoning models only).
        timeout: Request timeout forwarded to the client.
        storeLogProbs: When truthy, request token log-probabilities with
            ``top_logprobs=4`` (non-reasoning models only).

    Returns:
        The constructed client.

    The function awaits nothing itself; it stays ``async`` because existing
    callers ``await`` it.
    """
    sampling_args = {}
    logprob_args = {}

    # OpenAI's reasoning models do not accept temperature/top_p overrides or
    # log-probability requests, so both groups of options are only sent to
    # non-reasoning models. (The log-prob options used to be attached only in
    # the reasoning-model branch, i.e. exactly where they cannot be used and
    # never where they can.)
    if not is_reasoning_model(modelName):
        sampling_args = {'temperature': temp, 'top_p': topp}
        if storeLogProbs:
            logprob_args = {'logprobs': storeLogProbs, 'top_logprobs': 4}

    if useAzure:
        return AzureOpenAIChatCompletionClient(
                model=modelName,
                azure_endpoint='https://galor-m6d0ej1n-eastus2.cognitiveservices.azure.com',
                azure_deployment=modelName,
                api_key=LLM_API_KEY,
                timeout=timeout,
                api_version='2025-01-01-preview',
                **sampling_args,
                **logprob_args,
                model_info = {'vision':False, 'function_calling':True, 'json_output':True, 'family':'unknown'}
        )

    # Default path: any model reachable through OpenRouter's OpenAI-compatible
    # endpoint, e.g. 'openai/gpt-4o-mini', 'openai/o3-mini',
    # 'google/gemini-2.0-flash-001', 'deepseek/deepseek-r1'.
    return OpenAIChatCompletionClient(
            model=modelName,
            base_url='https://openrouter.ai/api/v1',
            api_key=OPENROUTER_API_KEY,
            timeout=timeout,
            **sampling_args,
            **logprob_args,
            model_info = {'vision':False, 'function_calling':False, 'json_output':False, 'family':'unknown'}
    )

In [6]:
# Create the client for 'openai/o4-mini' with timeout=300 instead of the
# default 60. useAzure keeps its default (False), so create_model_client
# takes its OpenRouter branch.
model_client = await create_model_client(modelName='openai/o4-mini', timeout=300)

  validate_model_info(self._model_info)


In [7]:
# Build the agent workflow around the model client. build_graphflow is not
# defined in this notebook (presumably star-imported from
# autogen_cuda_static_analysis_graphflow -- TODO confirm).
workflow = build_graphflow(model_client)

In [8]:
# Build the prompt inputs from the FIRST row of df only (single-sample run).
# Fail loudly on an empty frame: the previous `for ... break` form silently
# left `input_args` undefined -- or stale from an earlier execution of this
# cell -- so the problem only surfaced later, or not at all.
if df.empty:
    raise ValueError("df contains no rows; cannot build input_args")

index, row = next(df.iterrows())

# The row's 'kernelSASS' column is intentionally not part of the prompt inputs.
input_args = {
    "source_code": row['kernelCode'],
    "kernel_name": row['Kernel Name'],
    "program_name": row['targetName'],
    "exec_args": row['exeArgs'],
    "grid_size": row['Grid Size'],
    "block_size": row['Block Size'],
    "device": row['device'],
}


In [9]:
#You are an expert CUDA programmer and you are given a CUDA kernel source code.
#Your task is to analyze the kernel and provide a detailed report on its performance characteristics.
#Please provide your analysis in a structured format.

# Hardware figures quoted in the prompt when `input_args` does not override
# them. These are exactly the values the prompt used to hard-code, so callers
# that pass none of these keys get a byte-identical prompt.
_DEFAULT_GPU_SPECS = {
    'compute_capability': 86,
    'ram_gb': 10,
    'sm_count': 68,
    'max_bandwidth_gbs': 760.3,
    'peak_sp_gflops': 25067.52,
    'peak_dp_gflops': 391.68,
    'peak_gintops': 12533.76,
}


def make_input_prompt(input_args):
    """Render the task prompt for one kernel.

    Args:
        input_args: Mapping with the required keys 'kernel_name',
            'program_name', 'exec_args', 'grid_size', 'block_size', 'device'
            and 'source_code'. Any key of ``_DEFAULT_GPU_SPECS`` may also be
            supplied to override the corresponding hardware figure, so the
            spec list can match a 'device' other than the default one.

    Returns:
        The prompt string: kernel/launch description, GPU hardware specs,
        then the source code followed by a single trailing space.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    specs = {key: input_args.get(key, default)
             for key, default in _DEFAULT_GPU_SPECS.items()}

    # NOTE: the prompt text (including the 'Capbility' spelling) is kept
    # unchanged so prompts stay comparable with earlier runs.
    prompt = f"""

Kernel Name: {input_args['kernel_name']}
Program Name: {input_args['program_name']}
Execution Arguments: [{input_args['exec_args']}]
Grid Size: {input_args['grid_size']}
Block Size: {input_args['block_size']}

GPU Hardware Specs:
 - GPU Name: {input_args['device']}
 - Compute Capbility: {specs['compute_capability']}
 - RAM: {specs['ram_gb']} GB
 - SM count: {specs['sm_count']}
 - Max Bandwidth: {specs['max_bandwidth_gbs']} GB/s
 - Peak SP GFLOP/s {specs['peak_sp_gflops']} with FMA
 - Peak DP GFLOP/s {specs['peak_dp_gflops']} with FMA
 - Peak GINTOP/s {specs['peak_gintops']} with FMA


Source Code:
{input_args['source_code']} """
    return prompt

In [10]:
# Start the workflow on the prompt rendered from input_args. Nothing is
# awaited in this cell; the object returned by run_stream is handed to
# `await Console(result)` in the next cell.
result = workflow.run_stream(task=make_input_prompt(input_args))

In [11]:
# Drive the run to completion, printing each message as it arrives.
# NOTE(review): the recorded output of this cell ends with
# "RuntimeError: No available speakers found." raised from select_speaker in
# autogen_agentchat's _digraph_group_chat.py right after the
# DummyInitialRequestAgent message -- presumably the graph produced by
# build_graphflow has no agent to run next; confirm in that module.
await Console(result)

---------- TextMessage (user) ----------


Kernel Name: void resize<unsigned char, 8>(T1 *, unsigned long, int, int, const T1 *, int, int, float, float, bool, bool)
Program Name: resize-cuda
Execution Arguments: [1920 1080 256 256 512 100]
Grid Size: (29184, 1, 1)
Block Size: (256, 1, 1)

GPU Hardware Specs:
 - GPU Name: NVIDIA GeForce RTX 3080
 - Compute Capbility: 86
 - RAM: 10 GB
 - SM count: 68
 - Max Bandwidth: 760.3 GB/s
 - Peak SP GFLOP/s 25067.52 with FMA
 - Peak DP GFLOP/s 391.68 with FMA
 - Peak GINTOP/s 12533.76 with FMA


Source Code:
-----------------------------------
main.cu
-----------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <chrono>
#include <cuda.h>

template <class T, std::size_t CHANNELS_PER_ITER>
__global__
void resize (
    T *__restrict__ output,
    size_t output_size, int out_height, int out_width,
    const T *__restrict__ input, int in_height, int in_width,
    float o2i_fy, float o2i_fx, bool round, bool half

  model_result = await model_client.create(


---------- TextMessage (DummyInitialRequestAgent) ----------



RuntimeError: RuntimeError: No available speakers found.
Traceback:
Traceback (most recent call last):

  File "/Users/gbolet/miniconda3/envs/autogen5.7/lib/python3.11/site-packages/autogen_agentchat/teams/_group_chat/_base_group_chat_manager.py", line 165, in handle_agent_response
    speaker_name = await speaker_name_future
                   ^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/gbolet/miniconda3/envs/autogen5.7/lib/python3.11/site-packages/autogen_agentchat/teams/_group_chat/_graph/_digraph_group_chat.py", line 357, in select_speaker
    raise RuntimeError("No available speakers found.")

RuntimeError: No available speakers found.
