### System Prompt Crafting

This notebook showcases our attempts at crafting system prompts and running them through the models we have access to through OpenRouter (note: the client code below currently targets an Azure OpenAI deployment instead). Our primary objective is to see how current state-of-the-art (SoTA) LLMs respond to our queries, so that we can find out which system prompts produce the most consistent responses.

In [1]:
import asyncio
import json
import time
from pprint import pprint

from autogen_agentchat.agents import AssistantAgent
from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_core.models import UserMessage, SystemMessage, AssistantMessage
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient

In [2]:
# Please create a file called '.llm-api-key' next to this notebook that contains only your API key.
# Surrounding whitespace (e.g. a trailing newline added by an editor) is stripped so that it
# cannot end up inside the key.
with open('./.llm-api-key', 'r') as file:
    LLM_API_KEY = file.read().strip()

In [3]:
def chat_history_to_json_line(ctxMessages:list):
    """Serialize a chat history into one JSON string of the form ``{"messages": [...]}``.

    Every message becomes a ``{'role': ..., 'content': ...}`` entry, in input order.
    The role is 'system', 'user' or 'assistant' depending on whether the message is a
    SystemMessage, UserMessage or AssistantMessage.

    Args:
        ctxMessages: the messages to serialize; each must expose a ``content`` attribute.

    Returns:
        str: the JSON document produced by ``json.dumps`` (no trailing newline is added).

    Raises:
        AssertionError: if a message is none of the three supported message types.
        ValueError: if the content contains NaN/Infinity (``allow_nan=False``).
    """
    entries = []
    for msg in ctxMessages:
        if isinstance(msg, SystemMessage):
            role = 'system'
        elif isinstance(msg, UserMessage):
            role = 'user'
        elif isinstance(msg, AssistantMessage):
            role = 'assistant'
        else:
            # Raised explicitly instead of `assert False`: assert statements are removed under
            # `python -O`, which would let an unknown message silently reuse the previous role.
            raise AssertionError(f'Unknown message type: {type(msg)} of {msg}')

        entries.append({'role': role, 'content': msg.content})

    return json.dumps({'messages': entries}, allow_nan=False)

In [4]:

async def ask_llm_for_roofline_classification(modelName, systemMsg, cudaKernel, temp=1.0):
    """Send one kernel to one model and return the resulting chat history.

    A fresh Azure OpenAI client and a fresh assistant agent are created for every call, so
    no conversation state is shared between calls.

    Args:
        modelName: name of the model to query.
        systemMsg: text of the system prompt placed first in the conversation.
        cudaKernel: kernel source code; sent as the user message wrapped in triple backticks.
        temp: sampling temperature passed to the model client (default 1.0).

    Returns:
        The messages held by the chat context after the agent has run -- presumably the
        system prompt, the kernel message and the model's reply, in that order (TODO confirm).
    """
    model_client = AzureOpenAIChatCompletionClient(
            model=modelName,
            azure_endpoint='https://galor-m6d0ej1n-eastus2.cognitiveservices.azure.com',
            # NOTE(review): assumes the Azure deployment is named after the model, which holds
            # for the 'gpt-4o-mini' deployment used so far -- confirm before adding other models.
            azure_deployment=modelName,
            api_key=LLM_API_KEY,
            timeout=60,
            # honour the caller's temperature instead of a fixed 0.1
            temperature=temp,
            api_version='2024-08-01-preview',
    )

    sys_msg = SystemMessage(content=systemMsg)
    code_msg = UserMessage(source='User', content=f'```{cudaKernel}```')
    context = UnboundedChatCompletionContext(initial_messages=[sys_msg, code_msg])

    agent = AssistantAgent(
        name="assistant",
        model_client=model_client,
        model_context=context
    )

    await agent.run()
    # `context` is the very object handed to the agent above, so the history can be read
    # from it directly without reaching into the agent's private attributes.
    return await context.get_messages()

In [None]:
# Disabled smoke test, kept as a bare string literal so that none of it executes.
# It sends a fixed "beep/boop" system prompt plus a dummy user message through the same
# client/agent setup used above and prints the messages held by the agent's context afterwards.
'''
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient
from autogen_core.models import UserMessage, SystemMessage
from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_agentchat.agents import AssistantAgent

model_client = AzureOpenAIChatCompletionClient(
        model='gpt-4o-mini',
        azure_endpoint='https://galor-m6d0ej1n-eastus2.cognitiveservices.azure.com',
        azure_deployment='gpt-4o-mini',
        api_key=LLM_API_KEY,
        timeout=60,
        temperature=0.1,
        api_version='2024-08-01-preview',
)
sys_msg = SystemMessage(content='You are a robot and your purpose is to go `beep` and `boop` ONLY.')
code_msg = UserMessage(source='User', content=f'```THIS IS A TEST MESSAGE, PLEASE IGNORE!```')
context = UnboundedChatCompletionContext(initial_messages=[sys_msg, code_msg])

agent = AssistantAgent(
    name="assistant",
    model_client=model_client,
    model_context=context
)

await agent.run()
result = await agent._model_context.get_messages()
print(result)
'''

In [5]:
# Read the scraped CUDA kernels from disk and report how many kernels they contain overall.
jsonFile = './simple-scraped-kernels-CUDA-pruned.json'
with open(jsonFile, "r") as kernelFile:
    cudaKernels = json.load(kernelFile)

# each entry carries its own list of kernel names; the total is the sum of the list lengths
totalKernels = sum(len(entry['kernelNames']) for entry in cudaKernels)

print('Total scraped kernels:', totalKernels)


def write_output_file(filename, contents):
    """Write ``contents`` to ``filename`` in text mode, replacing whatever the file held before.

    Args:
        filename: path of the file to create or overwrite.
        contents: the text to store in the file.
    """
    handle = open(filename, 'w')
    try:
        handle.write(contents)
    finally:
        # always release the file handle, even when the write fails
        handle.close()

Total scraped kernels: 607


In [6]:
# Candidate system prompts. Earlier, shorter variants are kept commented out for reference.
# In the active few-shot prompt, Example 1 (an element-wise `a[i] = a[i] + b[i]` loop) is labelled
# 'Bandwidth': it performs one addition per two loads and one store, so by the prompt's own
# definition it is limited by data movement rather than by the number of operations.
systemMessages = [
#    'You are a code analysis assistant that classifies computational kernels into categories based on their performance characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.',
    
#    'You are a GPU performance analysis expert that classifies computational kernels into categories based on their source code characteristics. Your task is to provide one of the following classifications: Compute-Bound, Memory-Bound, Balanced, or Other.',


    '''You are a GPU performance analysis expert that classifies computational kernels into categories based on their source code characteristics. Your task is to provide one of the following performance boundedness classifications: Compute, Bandwidth, Balanced.  A kernel is considered Compute bound if its performance is primarily limited by the number of operations it performs, Bandwidth bound if its performance is primarily limited by the rate at which data can be moved between memory and processing units, and Balanced if the performance is limited roughly equally by compute and memory access.

Provide only one word as your response, chosen from the set: ['Compute', 'Bandwidth', 'Balanced'].
**Examples:**
**Example 1:**
```
Kernel Source Code (simplified):
for i = 0 to 1000000 {
  a[i] = a[i] + b[i];
}
```
Response: Bandwidth

**Example 2:**
```
Kernel Source Code (simplified):
for i = 0 to 10 {
  load_data(large_array);   //loads from large memory
  process_data(large_array); //processes data
  store_data(large_array);  //stores back to memory
}
```
Response: Bandwidth

**Example 3:**
```
Kernel Source Code (simplified):
for i = 0 to 1000 {
  vector_add(a,b,c);   //process data in situ
}
//Some smaller data movement but mostly compute.
```
Response: Compute

Now, analyze the following kernel:
'''
                  ]

# Model names to query (earlier candidates kept commented out for reference).
#models = ['google/gemini-flash-1.5', 'google/gemini-pro']
#models = ['google/gemini-flash-1.5']
models = ['gpt-4o-mini']

# Sampling temperatures to sweep over (earlier sweeps kept commented out for reference).
#temps = [1.0, 0.8, 0.5, 0.2, 0.0]
#temps = [0.1, 0.2, 0.6, 1.2]
temps = [0.1]

# JSONL file that receives one serialized chat history per line.
outputFile = 'llm-zero-shot-responses.jsonl'

In [7]:
gatheredData = ''

for idx, target in enumerate(cudaKernels):
    targetName = target['basename']
    kernelNames = target['kernelNames']
    kernels = target['kernels']

    if len(kernelNames) == 0:
        print(f'{targetName} has no found kernels -- skipping')
        continue

    # for now let's just stop early so we don't waste all our credits
    if idx > 1:
        break

    for kernel in kernelNames:
        kernelSrcCode = kernels[kernel]

        for sysMsg in systemMessages:
            for model in models:
                for temp in temps:
                    # wait 1 second between invocations, we don't want to get cloudflare banned, again...
                    time.sleep(1.0)
                    result = await ask_llm_for_roofline_classification(model, sysMsg, kernelSrcCode, temp)
                    jsonLResult = chat_history_to_json_line(result)
                    gatheredData += f'{jsonLResult}\n'
                    write_output_file(outputFile, gatheredData)
                


NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}}

In [8]:
# Parse the saved JSONL file back into a list holding one decoded object per line.
gatheredDataJSONL = []
with open(outputFile, 'r') as jsonlFile:
    gatheredDataJSONL.extend(json.loads(line) for line in jsonlFile)


In [9]:
# Check that every recorded response is exactly one of the allowed classification words.
# Each failure names the offending sample, so a failing run shows what the model returned
# instead of an empty AssertionError.
VALID_RESPONSES = ('Compute', 'Bandwidth', 'Balanced')

for sampleIdx, sample in enumerate(gatheredDataJSONL):
    messages = sample['messages']
    # the response is read from the third message (index 2), which is expected to be the
    # model's reply following the system prompt and the kernel message
    assert len(messages) > 2, (
        f'Sample {sampleIdx} has no response: only {len(messages)} message(s) recorded'
    )
    response = messages[2]['content']
    assert response in VALID_RESPONSES, (
        f'Sample {sampleIdx}: unexpected response {response!r}, expected one of {VALID_RESPONSES}'
    )

AssertionError: 