In [27]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt

### System Prompt Crafting

This notebook showcases our attempts at crafting system prompts and running them against the models available to us through OpenRouter. Our primary objective is to see how current state-of-the-art (SoTA) LLMs respond to our queries, so that we can determine which system prompts yield the most consistent responses.

In [28]:
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.models import UserMessage, SystemMessage, AssistantMessage
from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_agentchat.agents import AssistantAgent

In [29]:
# Load the OpenRouter API key from a local file called '.openrouter-api-key'
# that contains only the key.
# Leading/trailing whitespace is stripped so that a trailing newline added by
# an editor does not become part of the key.
with open('./.openrouter-api-key', 'r') as key_file:
    OPENROUTER_API_KEY = key_file.read().strip()

# Fail here with a clear message instead of later with an opaque API error.
if not OPENROUTER_API_KEY:
    raise ValueError("'.openrouter-api-key' is empty; put your OpenRouter API key in it")

In [30]:
# Capability flags handed to the client through its `model_info` argument.
model_capabilities = {
    'vision': False,
    'function_calling': True,
    'json_output': True,
    'model_family': 'unknown',
}

# OpenAI-style chat client pointed at the OpenRouter API base URL.
model_client = OpenAIChatCompletionClient(
    model='google/gemini-flash-1.5',  # alternative that was tried: 'google/gemini-pro'
    base_url='https://openrouter.ai/api/v1',
    api_key=OPENROUTER_API_KEY,
    timeout=30,
    temperature=1.0,
    model_info=model_capabilities,
)

In [31]:

# System prompt: asks the model to answer with one of four kernel classifications.
system_prompt_text = (
    'You are a code analysis assistant that classifies computational kernels '
    'into categories based on their performance characteristics. '
    'Your task is to provide one of the following classifications: '
    'Compute-Bound, Memory-Bound, Balanced, or Other.'
)

# Example CUDA kernel sent as the user message, one source line per literal.
kernel_source = (
    '__global__ void stencil(float* in, float* out, int n) {\n'
    '    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n'
    '    if (idx > 0 && idx < n - 1) {\n'
    '        out[idx] = 0.25f * (in[idx - 1] + in[idx] + in[idx + 1]);\n'
    '    }\n'
    '}'
)

sys_msg = SystemMessage(content=system_prompt_text)
code_msg = UserMessage(source='User', content=kernel_source)

# Seed the chat context with the system prompt followed by the kernel source.
context = UnboundedChatCompletionContext(initial_messages=[sys_msg, code_msg])





In [32]:
# Build the assistant on top of the pre-populated context, to check whether the
# model can be handed an existing message history to ingest.
agent = AssistantAgent(name="assistant", model_client=model_client, model_context=context)


In [33]:
# Run the agent without passing a new task: the agent was constructed with a
# model context that already holds the system and user messages.
# NOTE: the top-level `await` relies on the notebook kernel's running event loop.
result = await agent.run()

# Print the returned result object (the recorded output below is its repr).
print(result)

TaskResult(messages=[TextMessage(source='assistant', models_usage=RequestUsage(prompt_tokens=163, completion_tokens=7), content='Memory-Bound\nTERMINATE\n', type='TextMessage')], stop_reason=None)


In [None]:
# NOTE(review): `df` is not created in any cell visible above this one; it must
# be loaded before this cell runs. TODO: add the data-loading cell.

# Keep only the rows that have a kernel name. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not raise
# pandas' SettingWithCopyWarning (assignment on a filtered slice).
df = df[df['Kernel Name'].notna()].copy()

print(df.shape)

# Replace exeArgs that are NaN with ''. 'exeArgs' is used as a groupby key
# further down, and groupby drops rows whose key is NaN by default.
df['exeArgs'] = df['exeArgs'].fillna('')

#print(df[df['targetName'] == 'bspline-vgh-cuda'])
#df['exeArgs'] = df['exeArgs'].apply(lambda x: '' if x == 'NULL' else x )

#print(df.dtypes)

In [None]:
# Rescale both performance columns by 1e-11, matching the "(1e11 FLOP/s)" axis
# labels used by the roofline plots.
# NOTE: this overwrites the columns in place, so running the cell a second time
# scales the values again.
for perf_column in ('dpPerf', 'spPerf'):
    df[perf_column] = df[perf_column] * 1e-11

In [None]:
# Single-precision roofline scatter on log-log axes.
# Samples with spAI=0 and spPerf=0 will not show on this plot.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6))

sns.scatterplot(data=df, x='spAI', y='spPerf', ax=ax)

ax.set_yscale('log')
ax.set_xscale('log')
ax.set_ylim(bottom=1e-4, top=1e3)

ax.set(
    title='Single-precision FLOP Roofline Data',
    xlabel='Arithmetic Intensity (FLOP/byte)',
    ylabel='Performance (1e11 FLOP/s)',
)

plt.show()

In [None]:
# Double-precision roofline scatter on log-log axes.
# Samples with dpAI=0 and dpPerf=0 will not show on this plot.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6))

sns.scatterplot(data=df, x='dpAI', y='dpPerf', ax=ax)

ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    title='Double-precision FLOP Roofline Data',
    xlabel='Arithmetic Intensity (FLOP/byte)',
    ylabel='Performance (1e11 FLOP/s)',
)

plt.show()

In [None]:
# Overlay the single- and double-precision samples on one log-log scatter.
# Samples whose AI and performance are 0 will not show on this plot.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6))

# Draw order (single first, then double) must match the legend labels below.
for ai_column, perf_column in (('spAI', 'spPerf'), ('dpAI', 'dpPerf')):
    sns.scatterplot(data=df, x=ai_column, y=perf_column, ax=ax)

ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    title='NVIDIA RTX 3080 Roofline Data',
    xlabel='Arithmetic Intensity (FLOP/byte)',
    ylabel='Performance (1e11 FLOP/s)',
)

ax.legend(['Single-Precision FLOP', 'Double-Precision FLOP'])

plt.show()

In [None]:
# Roofline plot: measured samples plus the peak-performance and bandwidth lines.
# Samples whose AI and performance are 0 will not show on the log-log axes.

# Ceiling values in the units of the axes (1e11 FLOP/s and FLOP/byte).
# NOTE(review): the legend attributes these numbers to Nsight; confirm them
# against the profiler report.
SP_PEAK_PERF = 249.6   # horizontal single-precision ceiling
DP_PEAK_PERF = 3.9     # horizontal double-precision ceiling
DP_RIDGE_AI = 0.51     # AI at which the bandwidth line reaches the DP ceiling
SP_RIDGE_AI = 32.94    # AI at which the bandwidth line reaches the SP ceiling

fig, ax = plt.subplots(1, figsize=(12,6))

# Every artist carries its own label, so the legend no longer depends on the
# order in which Matplotlib enumerates collections and lines.
sns.scatterplot(df, x='spAI', y='spPerf', ax=ax, label='Single-Precision FLOP')
sns.scatterplot(df, x='dpAI', y='dpPerf', ax=ax, label='Double-Precision FLOP')

ax.axhline(SP_PEAK_PERF, color='red', label='Nsight SP Peak Perf.')
ax.axhline(DP_PEAK_PERF, color='green', label='Nsight DP Peak Perf.')

# Bandwidth line through the two points where it meets the DP and SP ceilings.
ax.axline(
    xy1=(DP_RIDGE_AI, DP_PEAK_PERF),
    xy2=(SP_RIDGE_AI, SP_PEAK_PERF),
    color='blue',
    label='Nsight Peak Bandwidth',
)

ax.set_yscale('log')
ax.set_xscale('log')

ax.set_title('NVIDIA RTX 3080 Roofline Data', fontsize=20)

ax.set_xlabel('Arithmetic Intensity (FLOP/byte)', fontsize=16)
ax.set_ylabel('Performance (1e11 FLOP/s)', fontsize=16)

# Rebuild the legend from the labels attached to the artists above.
ax.legend()

plt.show()

![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

![image-3.png](attachment:image-3.png)

![image-4.png](attachment:image-4.png)

In [None]:
# How many different executables and kernel invocations did we capture?

# Columns that together identify one recorded kernel execution.
kernel_group_keys = ['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs']


def count_kernel_groups(frame):
    """Return the number of distinct groups in ``frame`` over ``kernel_group_keys``."""
    return frame.groupby(kernel_group_keys).ngroups


def percent_of(count, total):
    """Return ``count`` as a percentage of ``total`` rounded to 2 decimals (0.0 if ``total`` is 0)."""
    return round(100 * count / total, 2) if total else 0.0


totalExes = df.groupby(['targetName', 'device', 'exeArgs']).ngroups
print(f'Total unique executables sampled: {totalExes}')

totalKernels = count_kernel_groups(df)
print(f'Total unique kernel executions recorded: {totalKernels}')

# Kernels for which neither precision reports any arithmetic intensity.
zeroAIKernels = count_kernel_groups(df[(df['spAI'] == 0.0) & (df['dpAI'] == 0.0)])
print(f'Number of kernels with no arithmetic intensity (AI) {zeroAIKernels} ({percent_of(zeroAIKernels, totalKernels)}%)')

# Kernels with non-zero arithmetic intensity, per precision.
numSPAIKernels = count_kernel_groups(df[df['spAI'] > 0.0])
numDPAIKernels = count_kernel_groups(df[df['dpAI'] > 0.0])

print(f'Number of non-zero AI single-precision kernels recorded: {numSPAIKernels} ({percent_of(numSPAIKernels, totalKernels)}%)')
print(f'Number of non-zero AI double-precision kernels recorded: {numDPAIKernels} ({percent_of(numDPAIKernels, totalKernels)}%)')

# Kernels with non-zero performance, per precision. These keep the original
# variable names, so their values after this cell are unchanged.
numSPKernels = count_kernel_groups(df[df['spPerf'] > 0.0])
numDPKernels = count_kernel_groups(df[df['dpPerf'] > 0.0])

print(f'Number of non-zero Perf single-precision kernels recorded: {numSPKernels} ({percent_of(numSPKernels, totalKernels)}%)')
print(f'Number of non-zero Perf double-precision kernels recorded: {numDPKernels} ({percent_of(numDPKernels, totalKernels)}%)')



In [None]:
# Subsets with non-zero arithmetic intensity for each precision.
spData = df[df['spAI'] > 0]
dpData = df[df['dpAI'] > 0]

# Columns summarised for both subsets.
stat_columns = ['spPerf', 'dpPerf', 'spAI', 'dpAI']


def print_column_stats(heading, subset):
    """Print ``heading`` followed by the column-wise max, min, median and mean of ``subset``."""
    print(heading)
    selected = subset[stat_columns]
    for statistic in (selected.max, selected.min, selected.median, selected.mean):
        print(statistic(axis=0))


print_column_stats('SP data', spData)
print()
print_column_stats('DP data', dpData)

In [None]:
# What does the AI distribution look like?
# Histograms on the primary y-axis, empirical CDFs on a twin y-axis.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6))

ai_columns = ('spAI', 'dpAI')

# Single precision is drawn first so the legend labels below line up.
for ai_column in ai_columns:
    sns.histplot(data=df, x=ai_column, bins=100, ax=ax, log_scale=True)

ax2 = ax.twinx()
for ai_column in ai_columns:
    sns.ecdfplot(data=df, x=ai_column, ax=ax2)

ax.set(
    xlabel='Arithmetic Intensity (FLOP/byte)',
    ylabel='Number of Samples',
    title='Non-zero Arithmetic Intensity (FLOP/byte) Samples Distribution',
)
ax2.set_ylabel('Proportion of Total Samples')

ax.legend(['Single-Precision FLOP', 'Double-Precision FLOP'])

plt.show()

In [None]:
#subdf = df[df['targetName'].isin(['bspline-vgh-cuda'])].copy().reset_index()
#
#print(subdf)
#
#print(type(subdf.iloc[0]['exeArgs']))
#
#grpA = subdf[subdf['spAI'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size', 'exeArgs'])
#grpB = subdf[subdf['spAI'] > 0.0].groupby(['Kernel Name', 'kernelName', 'targetName', 'device', 'Block Size', 'Grid Size'])
#
#
#print(grpA)
#print(grpB)
#
#print(grpA.ngroups)
#print(grpB.ngroups)

In [None]:
# What does the performance distribution look like?
# Histograms on the primary y-axis, empirical CDFs on a twin y-axis.

fig, ax = plt.subplots(1, figsize=(12,6))

sns.histplot(df, x='spPerf', bins=100, ax=ax, log_scale=True)
sns.histplot(df, x='dpPerf', bins=100, ax=ax, log_scale=True)
ax2 = ax.twinx()
sns.ecdfplot(df, x='spPerf', ax=ax2)
sns.ecdfplot(df, x='dpPerf', ax=ax2)

# spPerf/dpPerf were multiplied by 1e-11 in an earlier cell, so the unit shown
# here is 1e11 FLOP/s (as on the roofline plots), not plain FLOP/s.
ax.set_xlabel('Performance (1e11 FLOP/s)')
ax.set_ylabel('Number of Samples')
ax2.set_ylabel('Proportion of Total Samples')

ax.set_title('Non-zero Performance (1e11 FLOP/s) Samples Distribution')

ax.legend(['Single-Precision FLOP', 'Double-Precision FLOP'])

plt.show()