# Eval for Codex

In [42]:
!pip install -r requirements.txt



In [43]:
import requests
import json
import pandas as pd
from dotenv import load_dotenv
import os
import ipywidgets as widgets
from IPython.display import display
from ipydatagrid import DataGrid
from itables import show
import importlib
import glob

In [91]:
import evaluation_prompts.eval_prompt2 as eval_prompt
importlib.reload(eval_prompt)
print(eval_prompt.evaluation_prompt)

You are an expert evaluator assessing how well an LLM response matches expected responses.

**EVALUATION DATA:**
[BEGIN DATA]
[Prompt]: {prompt}
[LLM Response]: {response}
[Expected Response 1]: {expected_response_1}
[Expected Response 2]: {expected_response_2}
[Expected Response 3]: {expected_response_3}
[END DATA]

**TASK:**
1. Compare the LLM response to each expected response and identify which ONE it most closely matches
2. Score the match quality using this 5-point scale:
   - **5 (Excellent):** Same core meaning, even if worded differently
   - **4 (Good):** Minor differences only (slight wording variations)
   - **3 (Partial):** Significant differences affecting clarity/completeness
   - **2 (Poor):** Some relation but fails to convey correct meaning
   - **1 (No Match):** No meaningful match in meaning/content/intent
3. Determine acceptance: scores 3-5 = "yes", scores 1-2 = "no"

**CRITICAL OUTPUT REQUIREMENTS:**
- You MUST return ONLY a JSON object with EXACTLY these 4 fields

In [45]:
import sys
sys.path.append('/path/to/eval/evaluation_prompts')
load_dotenv(override=True)
BASE_URL = "http://localhost:3100/api"
API_KEY = os.environ.get("OPENWEBUI_API_KEY", "")
headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

## Getting all Models
Testing if we can fetch models from the API

In [46]:
try:
    response = requests.get(f"{BASE_URL}/models", headers=headers)
    response.raise_for_status()
    json_data = response.json()
    
    # Convert to DataFrame 
    if json_data and 'data' in json_data:
        models_df = pd.DataFrame(json_data['data'])
        print(models_df)
except requests.exceptions.RequestException as e:
    print(f"Error fetching models: {e}")

                                               id     created object owned_by  \
0                            amazon.nova-pro-v1:0  1751509746  model   openai   
1                           amazon.nova-lite-v1:0  1751509746  model   openai   
2                          amazon.nova-micro-v1:0  1751509746  model   openai   
3  apac.anthropic.claude-3-5-sonnet-20240620-v1:0  1751509746  model   openai   
4  apac.anthropic.claude-3-5-sonnet-20241022-v2:0  1751509746  model   openai   
5  apac.anthropic.claude-3-7-sonnet-20250219-v1:0  1751509746  model   openai   
6    apac.anthropic.claude-sonnet-4-20250514-v1:0  1751509746  model   openai   

  connection_type                                            name  \
0        external                            amazon.nova-pro-v1:0   
1        external                           amazon.nova-lite-v1:0   
2        external                          amazon.nova-micro-v1:0   
3        external  apac.anthropic.claude-3-5-sonnet-20240620-v1:0   
4     

Choose a model going ahead to the eval.

In [47]:
model_options = models_df['id'].tolist()

model_dropdown = widgets.Dropdown(
    options=model_options,
    value="apac.anthropic.claude-3-7-sonnet-20250219-v1:0",  # default selection
    description='Model:',
    disabled=False,
)

display(model_dropdown)

Dropdown(description='Model:', index=5, options=('amazon.nova-pro-v1:0', 'amazon.nova-lite-v1:0', 'amazon.nova…

In [48]:
model = model_dropdown.value
print("You selected:", model)

You selected: apac.anthropic.claude-3-7-sonnet-20250219-v1:0


## Getting all queries and Passing to LLM
Getting all prompts from data.csv file

In [49]:
data_dir = 'data'
data_files = [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
print(data_files)

['data2.csv', 'data.csv']


In [50]:

dataset_dropdown = widgets.Dropdown(
    options=data_files,
    value="data2.csv",  # default selection
    description='Dataset:',
    disabled=False,
)
display(dataset_dropdown)

Dropdown(description='Dataset:', options=('data2.csv', 'data.csv'), value='data2.csv')

In [51]:
print(dataset_dropdown.value)


data2.csv


In [52]:

try: 
    file_path = os.path.join(data_dir, dataset_dropdown.value)
    prompts_df = pd.read_csv(file_path)
    prompt_expresponse_tuple = prompts_df.values.tolist()
except FileNotFoundError:
    print(f"Error: data/{dataset_dropdown.value} not found.")
    prompt_expresponse_tuple = []


In [53]:


# print(prompt_expresponse_tuple)
for prompt, expected_response1, expected_response2, expected_response3 in prompt_expresponse_tuple:
    print(prompt, expected_response1, expected_response2, expected_response3)

List all channels in the workspace Here are all the channels in the Slack workspace:

Main Company Channels:
#general (201 members) - Main company-wide channel for announcements and team conversations
#random (199 members) - Casual conversation and team jokes
#help-it (196 members) - IT related queries
#help-hr (191 members) - HR related queries
#help-legal-risk-compliance (97 members) - Legal/risk/compliance queries
Engineering & Tech Channels:
#all-devs (132 members) - All engineers company-wide
#plz-review (109 members) - Code review requests
#eng-backend (102 members) - Backend engineering discussions
#fe-gang (89 members) - Frontend engineering team
#guild-backend (33 members) - Backend guild
#guild-devx (132 members) - Developer experience guild
Product & Operations:
#product (76 members) - Product management discussions
#operations-service-model (4 members) - Operations organization
#prod-onboarding (69 members) - Production onboarding
#incident-response (97 members) - Incident 

Running all prompts with the selected bedrock model (default: anthropic.claude-3-5-sonnet-20241022-v2:0)

In [69]:

results = []
iterations = 3
output_dir = 'output'

for i in range(iterations):
    for prompt, expected_response1, expected_response2, expected_response3 in prompt_expresponse_tuple:
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "tool_ids": ["server:0", "server:1"],
            "temperature": 0.7,
            "stream": False  # Set to True if you want to stream the response
        }
        print("Payload: ", payload)

        response = requests.post(
            f"{BASE_URL}/chat/completions", 
            json=payload,
            headers=headers
        )

        result = response.json()
        print("Result: ", result)
        if (result["choices"]):
            print("Result message: \n")
            print(result["choices"][0]["message"]["content"])
            row_result = {
                "prompt": prompt,
                "model": result["model"],
                "response": result["choices"][0]["message"]["content"],
                "expected_response_1": expected_response1,
                "expected_response_2": expected_response2,
                "expected_response_3": expected_response3,
                "usage_prompt_tokens": result["usage"]["prompt_tokens"],
                "usage_completion_tokens": result["usage"]["completion_tokens"],
                "usage_total_tokens": result["usage"]["total_tokens"]
            }
            results.append(row_result)
        else:
            continue

    results_df = pd.DataFrame(results)
    print(results_df)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'chat_results_'+str(i+1)+'.csv')
    results_df.to_csv(output_path, index=False)
    print(f"Chat results no. {i+1}:----------------\n {results_df.head(3)}\nPut into output/chat_results_{i+1}.csv\n\n")
    # grid = DataGrid(results_df, selection_mode="row", layout={"height": "400px"})
    # display(grid)



Payload:  {'model': 'apac.anthropic.claude-3-7-sonnet-20250219-v1:0', 'messages': [{'role': 'user', 'content': 'List all channels in the workspace'}], 'tool_ids': ['server:0', 'server:1'], 'temperature': 0.7, 'stream': False}
Result:  {'id': 'chatcmpl-bcd3b430', 'created': 1751517395, 'model': 'apac.anthropic.claude-3-7-sonnet-20250219-v1:0', 'system_fingerprint': 'fp', 'choices': [{'index': 0, 'finish_reason': 'length', 'logprobs': None, 'message': {'role': 'assistant', 'content': "I'll list all the channels in the workspace, organized by category to make it easier to navigate.\n\n# Slack Channels in the Workspace\n\n## Team & Squad Channels\n- #all-devs - All engineers of the company\n- #all-handsers\n- #backend-aws-resource-limit-alerts\n- #business-team - Discussion for business team\n- #client-solutions - Client solutions discussions\n- #client-success-team\n- #design - Design updates and feedback\n- #eng-backend - Engineering discussions for backend\n- #fe-gang - Frontend related

# Evaluating the Responses

## Selecting evaluation model

In [77]:
model_eval = "apac.anthropic.claude-sonnet-4-20250514-v1:0"
print("You selected:", model_eval)

You selected: apac.anthropic.claude-sonnet-4-20250514-v1:0


In [71]:
# importlib.reload(eval_prompt) 
print(eval_prompt.evaluation_prompt)


You are an expert evaluator tasked with assessing how well an LLM response matches one of several expected correct responses.

**EVALUATION DATA:**
[BEGIN DATA]
[Prompt]: {prompt}
[LLM Response]: {response}
[Expected Response 1]: {expected_response_1}
[Expected Response 2]: {expected_response_2}
[Expected Response 3]: {expected_response_3}
[END DATA]

**EVALUATION CRITERIA:**
You must determine which ONE expected response the LLM response most closely matches, then score the quality of that match using this 5-point rubric:

• **Score 5 (Excellent Match):** The response conveys the same core meaning as the selected expected response, even if worded differently
• **Score 4 (Good Match):** The response matches the selected expected response with only minor differences (e.g., slight wording variations, additional context that doesn't change the meaning)
• **Score 3 (Partial Match):** The response partially matches the selected expected response but has significant differences that affect c

## Getting all prompts, responses and expected responses.

In [78]:
output_dir = 'eval_output'

# Get a list of all chat_results_*.csv files
files = glob.glob('output/chat_results_*.csv')
eval_iterations = len(files)
print(f"Found {eval_iterations} chat results files")

Found 3 chat results files


In [92]:
print(eval_prompt.evaluation_prompt)

You are an expert evaluator assessing how well an LLM response matches expected responses.

**EVALUATION DATA:**
[BEGIN DATA]
[Prompt]: {prompt}
[LLM Response]: {response}
[Expected Response 1]: {expected_response_1}
[Expected Response 2]: {expected_response_2}
[Expected Response 3]: {expected_response_3}
[END DATA]

**TASK:**
1. Compare the LLM response to each expected response and identify which ONE it most closely matches
2. Score the match quality using this 5-point scale:
   - **5 (Excellent):** Same core meaning, even if worded differently
   - **4 (Good):** Minor differences only (slight wording variations)
   - **3 (Partial):** Significant differences affecting clarity/completeness
   - **2 (Poor):** Some relation but fails to convey correct meaning
   - **1 (No Match):** No meaningful match in meaning/content/intent
3. Determine acceptance: scores 3-5 = "yes", scores 1-2 = "no"

**CRITICAL OUTPUT REQUIREMENTS:**
- You MUST return ONLY a JSON object with EXACTLY these 4 fields

In [93]:


for i in range(eval_iterations):
    all_eval_results = []

    try:
        data = pd.read_csv('output/chat_results_'+str(i+1)+'.csv')
        prompts_expected_responses = data[['prompt','response', 'expected_response_1', 'expected_response_2', 'expected_response_3']].values.tolist()
    except FileNotFoundError:
        print(f"Error: output/chat_results_{i+1}.csv not found.")
        break

    print(f"Chat results no. {i+1}:----------------\n {prompts_expected_responses}\n")

    for prompt, response, expected_response1, expected_response2, expected_response3 in prompts_expected_responses:
        # print(comparision_eval_prompt)
        inuse_eval_prompt = eval_prompt.evaluation_prompt.format(prompt=prompt, response=response, expected_response_1=expected_response1, expected_response_2=expected_response2, expected_response_3=expected_response3)
        # print(inuse_eval_prompt)

        payload = {
            "model": model_eval,
            "messages": [
                {"role": "user", "content": inuse_eval_prompt}
            ],
            "temperature": 0.7,
            "stream": False  # Set to True if you want to stream the response
        }

        result = requests.post(
            f"{BASE_URL}/chat/completions",
            json=payload,
            headers=headers
        ).json()
        # print(f"Evaluation results for prompt {prompt}: {result}")

        print(result)

        if result:
            if result["choices"]:
                eval_result = result["choices"][0]["message"]["content"]
                print(eval_result)
                eval_result_json = json.loads(eval_result)
                print(f"Evaluation results for prompt {prompt}: \n")
                print(eval_result_json)

                row_result = {
                    "prompt": prompt,
                    "model": result["model"],
                    "response": response,
                    "selected_expected_response": eval_result_json["selected_expected_response"],
                    "score": eval_result_json["score"],
                    "answer": eval_result_json["answer"],
                    "reasoning": eval_result_json["reasoning"]
                }
                all_eval_results.append(row_result)
            else:
                continue
        else:
            continue

    all_eval_results_df = pd.DataFrame(all_eval_results)
    # print(all_eval_results_df)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'eval_results_'+str(i+1)+'.csv')
    all_eval_results_df.to_csv(output_path, index=False)
    print(f"Evaluation results for chat results no. {i+1}:----------------\n {all_eval_results_df.head(3)}\nSaved to {output_path}\n\n")



Chat results no. 1:----------------
 [['List all channels in the workspace', "I'll list all the channels in the workspace, organized by category to make it easier to navigate.\n\n# Slack Channels in the Workspace\n\n## Team & Squad Channels\n- #all-devs - All engineers of the company\n- #all-handsers\n- #backend-aws-resource-limit-alerts\n- #business-team - Discussion for business team\n- #client-solutions - Client solutions discussions\n- #client-success-team\n- #design - Design updates and feedback\n- #eng-backend - Engineering discussions for backend\n- #fe-gang - Frontend related issues\n- #finance - Finance workstream discussions\n- #frontend-gang - Frontend team channel\n- #guild-backend - Backend guild\n- #guild-devx - Developer experience guild\n- #guild-frontend - Frontend guild\n- #product - Product management team\n- #product-operations - Product operations\n- #security - Security questions and discussions\n- #squad-data - Data team requests\n- #squad-embedded - Embedded squ

KeyError: 'choices'

## Retrieve All the Results and Average them out

In [None]:
# Get all CSV files in eval_output directory
csv_files = glob.glob('eval_output/eval_results_*.csv')
print(f"Found {len(csv_files)} CSV files: {csv_files}")

# Read all CSV files into a list of DataFrames
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
    print(f"Loaded {file} with {len(df)} rows")

print(dfs)


Found 3 CSV files: ['eval_output/eval_results_2.csv', 'eval_output/eval_results_3.csv', 'eval_output/eval_results_1.csv']
Loaded eval_output/eval_results_2.csv with 20 rows
Loaded eval_output/eval_results_3.csv with 20 rows
Loaded eval_output/eval_results_1.csv with 20 rows
[                                               prompt  \
0                  List all channels in the workspace   
1                         Show me all public channels   
2   Post 'Hello team! From eval framework here.' t...   
3   Add heart reaction to message at timestamp 175...   
4   Reply to thread 1750300317.785949 in channel C...   
5   Get all replies from thread 1750300317.785949 ...   
6              Show me all the users in the workspace   
7            Get profile details for user U08FUTB4QLC   
8         Search for messages containing 'mcp server'   
9            Get recent messages from the ytg channel   
10  Post 'Meeting in 5 minutes - from eval framewo...   
11  Add thumbsup to message 1751349675.2

In [6]:

# Concatenate all DataFrames
all_data = pd.concat(dfs, ignore_index=True)
print(f"Combined data has {len(all_data)} rows")
print(all_data.head())


Combined data has 60 rows
                                              prompt  \
0                 List all channels in the workspace   
1                        Show me all public channels   
2  Post 'Hello team! From eval framework here.' t...   
3  Add heart reaction to message at timestamp 175...   
4  Reply to thread 1750300317.785949 in channel C...   

                                            model  \
0  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   
1  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   
2  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   
3  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   
4  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   

                                            response  \
0  Here's a list of all the channels in the works...   
1  I've retrieved a list of all public channels i...   
2  I've posted the message "Hello team! From eval...   
3  I tried to add a heart reaction to the message...   
4  I've replied "Thanks for the update

In [7]:
print(all_data.columns)

Index(['prompt', 'model', 'response', 'selected_expected_response', 'score',
       'answer', 'reasoning'],
      dtype='object')


In [8]:
# Group by prompt and response, then calculate average scores
grouped = all_data.groupby(['prompt', 'response']).agg({
    'score': 'mean',
}).round(2)
print(grouped)


                                                                                                       score
prompt                                             response                                                 
Add heart reaction to message at timestamp 1750... I tried to add a heart reaction to the message ...   5.00
Add thumbsup to message 1751349675.278699 in ai... I apologize, but I was unable to add the thumbs...   3.00
Get all replies from thread 1750300317.785949 i... # Thread Replies in Channel C08UFS9HMCN\n\nI've...   3.33
Get me recent messages from the trump-supporter... I'm sorry, but I couldn't retrieve messages fro...   4.00
Get profile details for John Smith                 I was not able to find profile details for "Joh...   4.33
Get profile details for Mohitha Mohan              # Mohitha Mohan's Profile Details\n\n**Basic In...   3.00
Get profile details for user U08FUTB4QLC           # User Profile: Mohitha Mohan\n\n## Basic Infor...   4.00
Get recent messages

In [9]:
# Reset index to turn grouped columns back into regular columns
final_results = grouped.reset_index()


print(f"Final results have {len(final_results)} unique prompt-response pairs")
print("\nFirst few rows:")
print(final_results.head())

Final results have 20 unique prompt-response pairs

First few rows:
                                              prompt  \
0  Add heart reaction to message at timestamp 175...   
1  Add thumbsup to message 1751349675.278699 in a...   
2  Get all replies from thread 1750300317.785949 ...   
3  Get me recent messages from the trump-supporte...   
4                 Get profile details for John Smith   

                                            response  score  
0  I tried to add a heart reaction to the message...   5.00  
1  I apologize, but I was unable to add the thumb...   3.00  
2  # Thread Replies in Channel C08UFS9HMCN\n\nI'v...   3.33  
3  I'm sorry, but I couldn't retrieve messages fr...   4.00  
4  I was not able to find profile details for "Jo...   4.33  


In [None]:
# Save to final_eval_results.csv
output_path = 'eval_output/final_eval_results.csv'
final_results.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")

# Display the results (if using Jupyter)
display(final_results) 


Results saved to eval_output/final_eval_results.csv


Unnamed: 0,prompt,response,score
0,Add heart reaction to message at timestamp 175...,I tried to add a heart reaction to the message...,5.0
1,Add thumbsup to message 1751349675.278699 in a...,"I apologize, but I was unable to add the thumb...",3.0
2,Get all replies from thread 1750300317.785949 ...,# Thread Replies in Channel C08UFS9HMCN\n\nI'v...,3.33
3,Get me recent messages from the trump-supporte...,"I'm sorry, but I couldn't retrieve messages fr...",4.0
4,Get profile details for John Smith,"I was not able to find profile details for ""Jo...",4.33
5,Get profile details for Mohitha Mohan,# Mohitha Mohan's Profile Details\n\n**Basic I...,3.0
6,Get profile details for user U08FUTB4QLC,# User Profile: Mohitha Mohan\n\n## Basic Info...,4.0
7,Get recent messages from the ytg channel,"I'll get the recent messages from the ""ytg"" ch...",2.0
8,List all channels in the workspace,Here's a list of all the channels in the works...,4.0
9,Post 'Hello team! From eval framework here.' t...,"I've posted the message ""Hello team! From eval...",4.0


In [11]:
grid = DataGrid(final_results, selection_mode="row", layout={"height": "400px"})
display(grid)

DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…