### System Prompt Crafting

This notebook will showcase our attempts at crafting some system prompts and running them through the models we have access to through OpenRouter. Our primary objective is to see how current SoTA LLMs respond to our queries so that we can find out which system prompts give the most consistent responses back.

In [1]:
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.models import UserMessage, SystemMessage, AssistantMessage
from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
import json
from pprint import pprint

In [2]:
# please create a file called '.openrouter-api-key' with your api key and no newline characters
with open('./.openrouter-api-key', 'r') as file:
    OPENROUTER_API_KEY=file.read()

In [3]:
model_client = OpenAIChatCompletionClient(
        model='google/gemini-flash-1.5',
        #'model' : 'google/gemini-pro',
        base_url='https://openrouter.ai/api/v1',
        api_key=OPENROUTER_API_KEY,
        timeout=30,
        temperature=1.0,
        model_info = {'vision':False, 'function_calling':True, 'json_output':True, 'model_family':'unknown'}
)

In [4]:

sys_msg = SystemMessage(content='You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.')
code_msg = UserMessage(source='User', content='__global__ void stencil(float* in, float* out, int n) {\n    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n    if (idx > 0 && idx < n - 1) {\n        out[idx] = 0.25f * (in[idx - 1] + in[idx] + in[idx + 1]);\n    }\n}')

context = UnboundedChatCompletionContext(initial_messages=[sys_msg, code_msg])

In [5]:
# we want to see if we can create a message history for the model to ingest
agent = AssistantAgent(
    name="assistant",
    model_client=model_client,
    model_context=context
)

In [6]:
result = await agent.run()

print(result)

TaskResult(messages=[TextMessage(source='assistant', models_usage=RequestUsage(prompt_tokens=163, completion_tokens=7), content='Memory-Bound\nTERMINATE\n', type='TextMessage')], stop_reason=None)


In [7]:
print(await context.get_messages())

[SystemMessage(content='You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.', type='SystemMessage'), UserMessage(content='__global__ void stencil(float* in, float* out, int n) {\n    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n    if (idx > 0 && idx < n - 1) {\n        out[idx] = 0.25f * (in[idx - 1] + in[idx] + in[idx + 1]);\n    }\n}', source='User', type='UserMessage'), AssistantMessage(content='Memory-Bound\nTERMINATE\n', source='assistant', type='AssistantMessage')]


In [8]:
def chat_history_to_json_line(ctxMessages:list):
    jsonDict = {'messages':[]}
    for msg in ctxMessages:
        if type(msg) == SystemMessage:
            role = 'system'
        elif type(msg) == UserMessage:
            role = 'user'
        elif type(msg) == AssistantMessage:
            role = 'assistant'
        else:
            assert False, f'Unknown message type: {type(msg)} of {msg}'
        content = msg.content

        jsonDict['messages'].append({'role':role, 'content':content})

    return json.dumps(jsonDict, allow_nan=False)

In [9]:
# get the context of the AssistantAgent
chatHist = await agent._model_context.get_messages()
print(chatHist)
print()
print()
print(chat_history_to_json_line(chatHist))

[SystemMessage(content='You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.', type='SystemMessage'), UserMessage(content='__global__ void stencil(float* in, float* out, int n) {\n    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n    if (idx > 0 && idx < n - 1) {\n        out[idx] = 0.25f * (in[idx - 1] + in[idx] + in[idx + 1]);\n    }\n}', source='User', type='UserMessage'), AssistantMessage(content='Memory-Bound\nTERMINATE\n', source='assistant', type='AssistantMessage')]


{"messages": [{"role": "system", "content": "You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other."}, {"role": "user", "content": "__glo

In [10]:

async def ask_llm_for_roofline_classification(modelName, systemMsg, cudaKernel, temp=1.0):
    model_client = OpenAIChatCompletionClient(
            model=modelName,
            base_url='https://openrouter.ai/api/v1',
            api_key=OPENROUTER_API_KEY,
            timeout=30,
            temperature=temp,
            model_info = {'vision':False, 'function_calling':True, 'json_output':True, 'model_family':'unknown'}
    )
    
    sys_msg = SystemMessage(content=systemMsg)
    code_msg = UserMessage(source='User', content=cudaKernel)
    context = UnboundedChatCompletionContext(initial_messages=[sys_msg, code_msg])

    agent = AssistantAgent(
        name="assistant",
        model_client=model_client,
        model_context=context
    )

    await agent.run()
    return await agent._model_context.get_messages()

In [15]:
# let's load up the scraped CUDA kernels
jsonFile = '../scraped-cuda-kernels.json'
with open(jsonFile, "r") as fp:
    cudaKernels = json.load(fp)

totalKernels = 0
for target in cudaKernels:
    kernelNames = target['kernelNames']
    totalKernels += len(kernelNames)

print('Total scraped kernels:', totalKernels)


def write_output_file(filename, contents):
    with open(filename, 'w') as fp:
        fp.write(contents)

Total scraped kernels: 968


In [None]:
systemMessages = [
    'You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.',
    
    'You are a GPU performance analysis expert that classifies computational kernels into categories based on their source code characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.',
                  ]

#models = ['google/gemini-flash-1.5', 'google/gemini-pro']
models = ['google/gemini-flash-1.5']

#temps = [1.0, 0.8, 0.5, 0.2, 0.0]
temps = [0.2]

outputFile = 'llm-zero-shot-responses.jsonl'

In [None]:
gatheredData = ''

for idx, target in enumerate(cudaKernels):
    targetName = target['basename']
    kernelNames = target['kernelNames']
    kernels = target['kernels']

    if len(kernelNames) == 0:
        print(f'{targetName} has no found kernels -- skipping')
        continue

    # for now let's just stop early so we don't waste all our credits
    if idx > 2:
        break

    for kernel in kernelNames:
        kernelSrcCode = kernels[kernel]

        for sysMsg in systemMessages:
            for model in models:
                for temp in temps:
                    result = await ask_llm_for_roofline_classification(model, sysMsg, kernelSrcCode, temp)
                    jsonLResult = chat_history_to_json_line(result)
                    gatheredData += f'{jsonLResult}\n'
                    write_output_file(outputFile, gatheredData)
                


sheath-omp has no found kernels -- skipping


In [14]:
pprint(gatheredData)



('{"messages": [{"role": "system", "content": "You are a code analysis '
 'assistant that classifies computational kernels into categories based on '
 'their performance characteristics. Your task is to provide one of the '
 'following classifications: Compute-Bound, Memory-Bound, Balanced, or '
 'Other."}, {"role": "user", "content": "__global__ void\\nhaccmk_kernel '
 '(\\n    const int n1,  // outer loop count\\n    const int n2,  // inner '
 'loop count\\n    const float *__restrict__ xx, \\n    const float '
 '*__restrict__ yy,\\n    const float *__restrict__ zz,\\n    const float '
 '*__restrict__ mass,\\n          float *__restrict__ vx2,\\n          float '
 '*__restrict__ vy2,\\n          float *__restrict__ vz2,\\n    const float '
 'fsrmax,\\n    const float mp_rsm,\\n    const float fcoeff ) \\n{\\n  int i '
 '= blockIdx.x * blockDim.x + threadIdx.x;\\n  if (i >= n1) return;\\n\\n  '
 'const float ma0 = 0.269327f; \\n  const float ma1 = -0.0750978f; \\n  const '
 'float ma2